Found another Python crawler article on 腾云阁 (the Tencent Cloud developer community) and saved it here for later use. This modified repost is published with Tencent Cloud's authorization.
Excerpt:
...
This script downloads every image from 100 pages of the meizitu site.
import re
import time

import requests
from redis import Redis

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}

def push_redis_list():
    """Scrape 100 article pages and push every image URL onto a Redis list."""
    r = Redis(host='10.66.149.8', port=6379, password='')
    for i in range(100):
        num = 5100 + i
        url = 'http://www.meizitu.com/a/' + str(num) + '.html'
        page = requests.get(url, timeout=30)
        # Non-greedy match so each URL stops at the first '.jpg'
        img_url_list = re.findall(
            r'http://mm\.howkuai\.com/wp-content/uploads/201.*?\.jpg',
            page.text)
        print(img_url_list)
        for temp_img_url in img_url_list:
            # Thumbnail URLs contain 'limg'; queue only the full-size images
            if not re.findall('limg', temp_img_url):
                print("url: ", temp_img_url)
                r.lpush('meizitu', temp_img_url)
    print(r.llen('meizitu'))
    return 0

def get_big_img_url():
    """Pop image URLs off the Redis list and download them one by one."""
    r = Redis(host='10.66.149.8', port=6379, password='')
    while True:
        try:
            url = r.lpop('meizitu')
            if url is None:  # queue is empty, nothing left to download
                break
            url = url.decode()  # lpop returns bytes under Python 3
            download(url)
            time.sleep(1)
            print(url)
        except Exception:
            print("Request failed, retrying")
            time.sleep(10)
            continue
    return 0

def download(url):
    try:
        r = requests.get(url, headers=headers, timeout=50)
        # One download per second, so a second-resolution timestamp
        # is enough for a unique file name
        name = int(time.time())
        with open('./pic/' + str(name) + '.jpg', 'wb') as f:
            f.write(r.content)
    except Exception as e:
        print(Exception, ":", e)

if __name__ == '__main__':
    print("begin")
    push_redis_list()     # enable this line to fill the task queue
    #get_big_img_url()    # enable this line to run the download task
...
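The producer (push_redis_list) and the consumer (get_big_img_url) share state only through the Redis list, so they can run as separate processes against the same queue. Below is a minimal runner sketch, assuming the script above is saved as a module named meizitu_crawler (a hypothetical name, not from the original article) and the same Redis instance is reachable; the worker count of 3 is likewise illustrative.

from multiprocessing import Process

# Hypothetical module name for the script above
from meizitu_crawler import push_redis_list, get_big_img_url

if __name__ == '__main__':
    # Fill the queue first, then start the download workers
    producer = Process(target=push_redis_list)
    producer.start()
    producer.join()  # wait until every image URL is queued

    # Several workers draining the same Redis list; LPOP is atomic,
    # so no two workers ever receive the same URL
    workers = [Process(target=get_big_img_url) for _ in range(3)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()

Because each worker exits once lpop returns None, the runner terminates cleanly when the queue is drained.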
Original article: https://www.qcloud.com/community/article/337567001488804157