1
usedel 2015-07-23 17:55:33 +08:00
我是通过两个正则表达式来做,看有没有大神可以一个正则表达式就做到
# coding=utf-8
"""Fetch one zreading.cn article page and print the text of its paragraphs.

The extraction is done in two regex passes: the first isolates the article
section of the page, the second pulls the leading text out of every ``<p>``
tag inside that section.
"""
import re

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib2 import urlopen

URL = "http://www.zreading.cn/archives/4993.html"

# Extract the article portion of the HTML: everything between the first
# "window.adsbygoogle" marker and the following wumii-hook <div>.
ARTICLE_PATTERN = re.compile(
    r'window\.adsbygoogle([\s\S]*?)<div class="wumii-hook">'
)

# Extract the article text from each <p> tag.
# NOTE: the capture stops at the first "<" after "<p>", so a paragraph that
# contains inline tags yields only the text before the first such tag, and
# "." does not match newlines, so the text must sit on one line.
PARAGRAPH_PATTERN = re.compile(r'<p>(.*?)<')


def extract_paragraphs(html):
    """Return the paragraph texts found in the article section(s) of *html*.

    Args:
        html: Page source as a text string.

    Returns:
        A list of strings, one per ``<p>`` match, in document order. The list
        is empty when the article markers or ``<p>`` tags are not present.
    """
    paragraphs = []
    for section in ARTICLE_PATTERN.findall(html):
        paragraphs.extend(PARAGRAPH_PATTERN.findall(section))
    return paragraphs


def main():
    """Download the page at ``URL`` and print each extracted paragraph."""
    response = urlopen(URL)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    # On Python 3 ``read()`` returns bytes, which must be decoded before the
    # text patterns can be applied; on Python 2 it already returns ``str``.
    # Assumes the page is UTF-8 encoded -- TODO confirm against the site.
    if not isinstance(html, str):
        html = html.decode("utf-8")

    for paragraph in extract_paragraphs(html):
        print(paragraph)


if __name__ == "__main__":
    main()