requests_html 包爬虫求助

今天学习 requests_html 包，在网上找了两个爬虫，分别爬取慕课网和 51cto。结果发现调用解析函数的时候，一个可以直接传入 res.html，另一个必须先用 PyQuery 转换一下 res.text 才能传入，这是为什么啊？上代码：
抓取 51cto 的代码：
from requests_html import AsyncHTMLSession # 导入异步模块

# Module-level asynchronous session; used below both to issue requests
# (asession.get) and to run the coroutine (asession.run).
asession = AsyncHTMLSession()

# URL template for the 51cto course list; the "{}" placeholder is filled in
# with the loop index via BASE_URL.format(i).
BASE_URL = "http://edu.51cto.com/courselist/index-p{}.html"

async def get_html():
    """Fetch the 51cto course-list pages and print the courses on each one.

    Each page URL is built from ``BASE_URL`` and requested through the
    module-level ``asession``; the response's parsed ``html`` attribute is
    then handed to ``get_item`` for extraction.
    """
    # range(1, 2) yields only page 1; widen the range to crawl more pages.
    for i in range(1, 2):
        # Suspend this coroutine until the response has arrived.
        r = await asession.get(BASE_URL.format(i))
        # get_item() uses the requests_html element API (find(..., first=True),
        # .text, .attrs), so the parsed r.html object is passed directly.
        get_item(r.html)

def get_item(html):
    """Print the title and link of every course found in a parsed page.

    Args:
        html: An object exposing ``find(selector, first=...)`` in the style
            of a requests_html parsed document. Elements it returns must
            provide ``find``, ``.text`` and ``.attrs``.

    Returns:
        None. One dict ``{"title": ..., "href": ...}`` is printed per course.
        Nothing is printed when the ``.cList`` container is absent, and
        items that lack an ``h3`` heading or an ``h3>a`` link are skipped.
    """
    c_list = html.find('.cList', first=True)
    if not c_list:
        return

    for item in c_list.find('.cList_Item'):
        heading = item.find("h3", first=True)  # course title element
        link = item.find('h3>a', first=True)  # anchor holding the course URL
        # find(..., first=True) yields None when nothing matches; skip such
        # items instead of crashing the whole run with AttributeError.
        if heading is None or link is None:
            continue
        # Named "course" rather than "dict" so the builtin is not shadowed.
        course = {
            "title": heading.text,
            "href": link.attrs["href"],
        }
        print(course)

if __name__ == '__main__':
    # Pass the coroutine function itself (not a call); the session's run()
    # executes it on its event loop and returns the collected results.
    result = asession.run(get_html)

抓取慕课网的代码：

from requests_html import AsyncHTMLSession
from pyquery import PyQuery as pq

# Module-level asynchronous session; used below both to issue requests
# (s.get) and to run the coroutine (s.run).
s = AsyncHTMLSession()
# URL template for the imooc course list; the "{i}" placeholder is filled in
# via url.format(i=i).
url = "https://www.imooc.com/course/list?page={i}"

async def get_html():
    """Fetch the imooc course-list pages and print the courses on each one.

    Each page URL is built from ``url`` and requested through the
    module-level session ``s``; the response body is wrapped in a PyQuery
    document and handed to ``get_content`` for extraction.
    """
    # range(1, 2) yields only page 1; widen the range to crawl more pages.
    for i in range(1, 2):
        # Suspend this coroutine until the response has arrived.
        res = await s.get(url.format(i=i))
        # get_content() is written against the PyQuery API (.items(), and
        # .text() as a method), so the raw markup in res.text is wrapped in a
        # PyQuery document here instead of passing res.html.
        d = pq(res.text)
        get_content(d)

def get_content(d):
    """Print the title and description of every course card in a document.

    Args:
        d: A PyQuery-style document. ``d.items(selector)`` must yield one
            object per match, each offering ``find(selector)`` whose result
            has a ``text()`` method.

    Returns:
        None. One dict ``{"title": ..., "des": ...}`` is printed per card.
    """
    for course in d.items(".course-card-container"):
        title = course.find(".course-card-name").text()  # course title
        des = course.find(".course-card-desc").text()  # course description

        # Named "info" rather than "dict" so the builtin is not shadowed.
        info = {
            "title": title,
            "des": des,
        }
        print(info)

if __name__ == '__main__':
    # Pass the coroutine function itself (not a call); the session's run()
    # executes it on its event loop and returns the collected results.
    ret = s.run(get_html)

title

Text

dict

get_html

6 条回复