import urllib2
import urllib
import re
import thread
import time
class SpiderModel:
def __init_(self):
self.page = 1
self.pages = []
self.enable = False
def GetPage(self,page):
myUrl = "http://qiushibaike.com/hot/page/" + page
#'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'下面这个是我在网页源码看的
user_agent = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/43.0.2357.134 Safari/537.36'
headers = {'User-Agent': user_agent}
req = urllib2.Request(myUrl,headers = headers)
myResponse = urllib2.urlopen(req)
myPage = myResponse.read()
unicodePage = myPage.decode('utf-8')
myItems = re.findall('<div class="content">(.*?)</div>',unicodePage,re.S)
items = []
for item in myItems:
items.appen([item[0].replace("\n","")])
return items
def LoadPage(self):
while self.enable:
if len(self.pages) < 2:
try:
myPage = self.GetPage(str(self.pagg))
self.page += 1
self.pages.append(myPage)
except:
print '无法加载'
else:
time.sleep(1)
def ShowPage(self,nowPage,page):
for items in nowPage:
print u'第%d页' %page, items[0], items[1]
myInput = raw_input()
if myInput =="quit":
self.enable = False
break
def Start(self):
self.enable = True
page = self.page
print u'正在加载。。'
thread.start_new_thread(self.LoadPage())
while self.enable:
if self.pages:
nowPage = self.pages[0]
del self.pages[0]
self.ShowPage(nowPage, page)
page += 1
糗事百科
"""
print u'按下回车键浏览内容:'
raw_input(' ')
myModel = Spider_Model()
myModel.Start()
代码如上。编译器没有报错。然后按enter键显示无法加载,不知道错误出在哪了
新人刚刚开始学python,在这请教各位了。麻烦了~~~
1
rrrkren 2015-07-22 04:03:51 +08:00
|
2
WKPlus 2015-07-22 12:47:30 +08:00 1
self.GetPage(str(self.pagg)) 这里应该是self.page吧
另外,遇到问题可以把try except去掉,看看异常提示 |