Getting Started with Python Crawlers (3): Crawling AJAX-Loaded Zhihu Pages (GET Method)

Keywords: web crawler, Zhihu

In this section, we discuss how to crawl Zhihu pages that are loaded via AJAX.

How to tell that a page is loaded via AJAX:

  • Click to the next page: the browser's URL does not change, which indicates AJAX loading.
  • In the browser's security/content settings, disable JavaScript and reload the page; if the page can no longer show as much data as before, that also points to AJAX loading (a quick programmatic check is sketched below).
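
Besides checking in the browser, you can request the suspected data endpoint directly and look at the response's Content-Type. The following is only a minimal sketch under that assumption; without the login cookie introduced later, Zhihu may answer with an error status, but a JSON Content-Type still reveals an AJAX data API.

import requests

# Sketch: hit the followers endpoint captured from the Network panel and
# inspect the Content-Type. An AJAX data endpoint normally answers with
# application/json, while an ordinary page answers with text/html.
api_url = ('https://www.zhihu.com/api/v4/members/zhi-shi-fen-zi-2015/followers'
           '?offset=20&limit=20')
resp = requests.get(api_url, headers={'User-Agent': 'Mozilla/5.0'})
print(resp.status_code, resp.headers.get('Content-Type'))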

As we know, Zhihu's anti-crawling measures are fairly solid: the data is loaded via AJAX, and a cookie is needed to simulate a logged-in session.
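
The Cookie value is simply copied from the request headers shown in the browser's developer tools. The examples below send it verbatim inside a headers dict; as an alternative, requests can also take it as a parsed dict through the cookies argument. The snippet below is only a sketch with a placeholder cookie string, not the real one used later.

import requests

# Placeholder cookie string copied from the browser; the real one appears in
# the examples further down.
raw_cookie = '_zap=xxx; d_c0="yyy"; z_c0=zzz'

# Parse "name=value; name=value; ..." into a dict and let requests build the header.
cookies = dict(pair.split('=', 1) for pair in raw_cookie.split('; '))
resp = requests.get(
    'https://www.zhihu.com/api/v4/members/zhi-shi-fen-zi-2015/followers?offset=20&limit=20',
    headers={'User-Agent': 'Mozilla/5.0'},
    cookies=cookies
)
print(resp.status_code)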

We'll take the followers of Professor Rao Yi's account as the example.

After double-clicking the followers request in the Network panel of the browser's developer tools, a page like the following appears.

Scraping a single page of data

import requests

# URL to crawl: the AJAX endpoint copied from the Network panel
url = 'https://www.zhihu.com/api/v4/members/zhi-shi-fen-zi-2015/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20'
# Simulate a logged-in session with the cookie; headers is a plain dict
headers = {
    'Cookie':'_zap=5e5da3cd-545c-4642-8392-534f021f53b1; d_c0="AFCCWQOIuQuPTlmLLuby8TEFDROSBhQJEio=|1494209355"; r_cap_id="ODg2OWQ5MGIyMjdkNDkwMDgzNzY1OTE5ZWYzODBhMDE=|1508471442|9a0726321ff2ff543d47c12b9554f6eb4dcff47d"; cap_id="NzEzNGQ3NTVhYjdiNGY5MGEzNzcxZTE0OWY5Nzk2YTM=|1508471442|bf83c58212c39c654e693236611272f7d007cc73"; z_c0=Mi4xQ01rZEFBQUFBQUFBVUlKWkE0aTVDeGNBQUFCaEFsVk5tY0RXV2dEMXZGbTZpbGJzX2tma0Itamh2RWhWdVdPQzhR|1508471449|a86d19990d47a31f3a9e9024bb1d1a15e8f9a53e; q_c1=7a8336678f744f338ba84397145d8ce1|1508746676000|1492264806000; q_c1=7a8336678f744f338ba84397145d8ce1|1508746676000|1492264806000; aliyungf_tc=AQAAAOcCB0GUxAAAlRet3gc99cDV6Npw; s-t=autocomplete; s-q=%E9%A5%B6%E6%AF%85; s-i=1; sid=j2258eo5; __utma=51854390.612755380.1510317297.1510317297.1510637147.2; __utmb=51854390.0.10.1510637147; __utmc=51854390; __utmz=51854390.1510637147.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.100-1|2=registration_date=20131026=1^3=entry_date=20131026=1; _xsrf=13004504-77ba-4731-9350-da5c5580eb20',
    'Host':'www.zhihu.com',
    'Referer':'https://www.zhihu.com/people/zhi-shi-fen-zi-2015/followers?page=2',
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'X-UDID':'AFCCWQOIuQuPTlmLLuby8TEFDROSBhQJEio='
}
# Fetch the page with requests; since this is an AJAX endpoint, the body is
# JSON, so there is no need to parse HTML with XPath
zhihuHtml = requests.get(url, headers=headers)
zhihuDict = zhihuHtml.json()        # .json() returns a dict
zhihuData = zhihuDict['data']       # the 'data' key holds the list of follower records
for i in range(len(zhihuData)):     # print every follower's name
    print(zhihuData[i]['name'])
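
The long query string in the URL above encodes three parameters: include (which extra fields to return), offset (where this page of followers starts) and limit (how many followers per page). Instead of hand-encoding them, the same request can be built by passing a params dict to requests. The sketch below is an assumed equivalent of the hand-written URL; for brevity it only sends a minimal User-Agent, whereas in practice you would also include the Cookie header shown above.

import requests

# Let requests encode the query string instead of writing it by hand.
base_url = 'https://www.zhihu.com/api/v4/members/zhi-shi-fen-zi-2015/followers'
params = {
    'include': 'data[*].answer_count,articles_count,gender,follower_count,'
               'is_followed,is_following,badge[?(type=best_answerer)].topics',
    'offset': 20,   # index of the first follower on this page
    'limit': 20,    # number of followers per page
}
resp = requests.get(base_url, params=params, headers={'User-Agent': 'Mozilla/5.0'})
for follower in resp.json().get('data', []):
    print(follower['name'])

The complete example below pages through all of the followers simply by stepping offset upwards in increments of 20.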

A complete example

import requests
import time
from pymongo import MongoClient

# Build the list of URLs to crawl, stepping the offset by 20 per page
def getUrl():
    urlList = []
    startUrl = 'https://www.zhihu.com/api/v4/members/zhi-shi-fen-zi-2015/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={}&limit=20'
    for i in range(0, 58100, 20):
        urlList.append(startUrl.format(i))
    return urlList

# Fetch the response for a single URL
def urlHtml(url):
    # A proxy to fall back on, so the crawler's own IP is less likely to get banned
    proxies = {'http': '125.31.19.27:80'}
    # Headers that simulate a logged-in session
    headers = {
        'Cookie':'_zap=5e5da3cd-545c-4642-8392-534f021f53b1; d_c0="AFCCWQOIuQuPTlmLLuby8TEFDROSBhQJEio=|1494209355"; r_cap_id="ODg2OWQ5MGIyMjdkNDkwMDgzNzY1OTE5ZWYzODBhMDE=|1508471442|9a0726321ff2ff543d47c12b9554f6eb4dcff47d"; cap_id="NzEzNGQ3NTVhYjdiNGY5MGEzNzcxZTE0OWY5Nzk2YTM=|1508471442|bf83c58212c39c654e693236611272f7d007cc73"; z_c0=Mi4xQ01rZEFBQUFBQUFBVUlKWkE0aTVDeGNBQUFCaEFsVk5tY0RXV2dEMXZGbTZpbGJzX2tma0Itamh2RWhWdVdPQzhR|1508471449|a86d19990d47a31f3a9e9024bb1d1a15e8f9a53e; q_c1=7a8336678f744f338ba84397145d8ce1|1508746676000|1492264806000; q_c1=7a8336678f744f338ba84397145d8ce1|1508746676000|1492264806000; aliyungf_tc=AQAAAOcCB0GUxAAAlRet3gc99cDV6Npw; s-t=autocomplete; s-q=%E9%A5%B6%E6%AF%85; s-i=1; sid=j2258eo5; __utma=51854390.612755380.1510317297.1510317297.1510637147.2; __utmb=51854390.0.10.1510637147; __utmc=51854390; __utmz=51854390.1510637147.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.100-1|2=registration_date=20131026=1^3=entry_date=20131026=1; _xsrf=13004504-77ba-4731-9350-da5c5580eb20',
        'Host':'www.zhihu.com',
        'Referer':'https://www.zhihu.com/people/zhi-shi-fen-zi-2015/followers?page=2',
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'X-UDID':'AFCCWQOIuQuPTlmLLuby8TEFDROSBhQJEio='
    }
    # If the direct request fails (e.g. our own IP has been blocked), retry through the proxy
    try:
        # requests.get returns a <class 'requests.models.Response'> object
        responses = requests.get(url, headers=headers)
    except requests.exceptions.RequestException:
        responses = requests.get(url, headers=headers, proxies=proxies)
    return responses

# Parse every page of data and store the records in MongoDB
def parseHtml():
    client = MongoClient()                 # connect once, before the loop
    database = client.raoyi
    sheet = database.raoyiFollowers
    for url in getUrl():
        responses = urlHtml(url)
        # .json() turns the response body into a dict; its 'data' key holds a
        # list of follower records, each of which is itself a dict
        zhihuData = responses.json()['data']
        for item in zhihuData:
            follower = {
                '昵称': item['name'],
                '性别': item['gender'],
                '回答问题数目': item['answer_count'],
                '签名': item['headline'],
                '关注者数量': item['follower_count']
            }
            print(follower)
            sheet.insert_one(follower)
        time.sleep(3)                      # be polite: pause between pages

if __name__ == '__main__':
    parseHtml()
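
Once the crawl finishes, the records sit in the raoyi database and can be read back with pymongo. A minimal sketch, assuming the same local MongoDB instance and the collection name used above:

from pymongo import MongoClient

# Open the collection the crawler wrote to
client = MongoClient()
sheet = client.raoyi.raoyiFollowers

# How many followers were stored, and the ten with the largest follower counts
print(sheet.count_documents({}))
for doc in sheet.find().sort('关注者数量', -1).limit(10):
    print(doc['昵称'], doc['关注者数量'])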