1
cc7756789 OP 我想试试能不能插入代码高亮,额……
"""Scrape thread links from an Ubuntu Forums board and report duplicates.

Each listing page is fetched in its own thread; every thread appends the
links it finds to OUTPUT_PATH.  When all threads have finished, the file is
read back and every line occurring more than once is reported.
"""
import collections
import io
import threading

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://ubuntuforums.org/'
FORUM_URL = BASE_URL + 'forumdisplay.php?f=333'
OUTPUT_PATH = '/home/zhg/Pictures/cao.txt'
LAST_PAGE = 49
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request blocks join() forever

# The bare forum URL is assumed to be page 1 already (standard vBulletin
# behaviour), so numbered pages start at 2.  Requesting "&page=1" as well
# would fetch the same listing twice and record every thread on it twice.
url_list = [FORUM_URL]
url_list.extend(
    '%s&page=%d' % (FORUM_URL, page) for page in range(2, LAST_PAGE + 1)
)

# Serialises appends to OUTPUT_PATH so records written by different threads
# cannot interleave.
lock = threading.Lock()


class MyThread(threading.Thread):
    """Thread that calls ``func(*args)`` when it is started.

    Args:
        func: Callable to run in the thread.
        args: Tuple of positional arguments passed to ``func``.
        name: Thread name.
    """

    def __init__(self, func, args, name=""):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        # Argument unpacking replaces the Python 2-only apply() builtin.
        self.func(*self.args)


def running(url):
    """Fetch one listing page and append its thread links to OUTPUT_PATH.

    One line is written per ``<a class="title">`` element that has an
    ``href``: the absolute link, a space, and the link text.  Nothing is
    written when the request fails or the response status is not 200.

    Args:
        url: Address of the forum listing page to fetch.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable page should not produce a bare traceback; report
        # it and let the remaining threads carry on.
        print("failed to fetch %s: %s" % (url, exc))
        return
    if response.status_code != 200:
        return

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    # href=True skips anchors without a target, which would otherwise be
    # recorded as ".../None".  Everything stays text until the file layer
    # encodes it, so non-ASCII titles survive on Python 2 and 3 alike.
    lines = [
        u'%s%s %s\n' % (BASE_URL, link['href'], link.get_text())
        for link in soup.find_all('a', 'title', href=True)
    ]

    # One locked write per page keeps each record and its newline together.
    with lock:
        with io.open(OUTPUT_PATH, 'a', encoding='utf-8') as f:
            f.writelines(lines)


def main():
    """Scrape every page in ``url_list`` and print the duplicated lines."""
    thread_list = [
        MyThread(running, (url,), running.__name__) for url in url_list
    ]
    for t in thread_list:
        t.daemon = True
        t.start()
    for t in thread_list:
        t.join()
    print("process ended")

    # NOTE: OUTPUT_PATH is opened in append mode, so lines left over from
    # earlier runs are counted here as well.
    with io.open(OUTPUT_PATH, 'r', encoding='utf-8') as f:
        counts = collections.Counter(f)
    for line, found in counts.items():
        if found > 1:
            print("the <%s> has found <%d>" % (line, found))


if __name__ == '__main__':
    main()
2
fy 2015-05-18 16:08:38 +08:00
楼主一看就不是VIP用户,看我的:
```python
def foo():
    pass
```
|
3
Earthman 2015-05-18 16:43:47 +08:00
@fy 你也一样213,v2ex回帖没有markdown功能,发帖才有的。发帖格式有问题可以考虑请管理员编辑一下或者重新发帖(不推荐)
|
4
cc7756789 OP 问题找到了,就是要爬取的网站有置顶的帖子,所以置顶的记录被重复了n次
|
5
chairuosen 2015-05-18 17:15:31 +08:00
有个东西叫gist
|
7
kingname 2015-05-18 18:28:29 +08:00
使用 Scrapy,它自带去重功能。
|
8
withrock 2015-05-19 10:02:43 +08:00
多线程多进程抓取,都放到一个队列里,开一个进程从队列里取抓到的数据入库。
|