Lagou Crawler Output

Full code for crawling all job-listing data from the Lagou homepage categories and saving it to MongoDB:

import requests
from lxml import etree
import time
import random
from pymongo import MongoClient
from fake_useragent import UserAgent

# IP proxy pool (public proxies; they may no longer be reachable)
proxiesList = [
    'http://120.32.209.231:8118',
    'http://210.82.36.142:80',
    'http://123.125.116.151:80',
    'http://113.5.80.144:8080',
    'http://122.224.227.202:3128',
    'http://220.191.214.176:8080'
]
proxies = {'http': random.choice(proxiesList)}
# Random browser User-Agent
userAgent = UserAgent().random
# Request headers (the Cookie is tied to a logged-in session and will expire)
headers = {
    'Cookie': 'user_trace_token=20171110210441-b6e83f7b-c617-11e7-86f0-525400f775ce; LGUID=20171110210441-b6e847a6-c617-11e7-86f0-525400f775ce; X_HTTP_TOKEN=c2758cd47b96276a5a741c7c8aef7389; TG-TRACK-CODE=index_search; SEARCH_ID=d3457fbe235a4f4c8dd0b8a8e8ce93dd; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAACEBACDGB5AE0EFB5452955E2ED72F2B5D254B8E; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510319081,1510652275; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510665850; _gat=1; _ga=GA1.2.739478823.1510319081; _gid=GA1.2.1539303787.1510652275; LGSID=20171114212410-195b1e64-c93f-11e7-8eeb-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGRID=20171114212410-195b1fe3-c93f-11e7-8eeb-525400f775ce; _putrc=11E3FE8A2D6757DB; login=true; unick=%E6%9D%8E%E4%BA%8C%E8%8A%B1; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0',
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': userAgent
}
# Collect every category name and URL from the homepage sidebar
def getAllUrl():
    mainPageUrl = 'https://www.lagou.com/'
    responses = requests.get(mainPageUrl, headers=headers).text
    lagouHtml = etree.HTML(responses)
    urlNames = lagouHtml.xpath('//div[@class="sidebar"]/div/div/div[2]/dl/dd/a/text()')
    urls = lagouHtml.xpath('//div[@class="sidebar"]/div/div/div[2]/dl/dd/a/@href')
    for urlName, url in zip(urlNames, urls):
        record = {
            'urlName': urlName,
            'url': url
        }
        yield record
# Save one category URL record (not called in __main__, kept for reuse)
def saveSubUrl(record):
    client = MongoClient()
    db = client.拉勾主站所有网址
    sheet = db.网址明细
    sheet.insert_one(record)
# Crawl every result page of each category URL collected from the homepage
def urlResp():
    for item in getAllUrl():
        statusCode = 200
        page = 1
        # Keep requesting the next page until the site stops returning 200
        while statusCode == 200:
            url = item['url'] + str(page)
            try:
                responses = requests.get(url, headers=headers, proxies=proxies)
            except requests.RequestException:
                # Retry the same page if the request (or proxy) fails
                continue
            statusCode = responses.status_code
            resp = etree.HTML(responses.text)
            page = page + 1
            # '.NET' would put a '.' into the MongoDB collection name, so drop it
            if item['urlName'] == '.NET':
                name = 'NET'
            else:
                name = item['urlName']
            companies = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[2]/div[1]/a/text()')
            positions = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/h3/text()')
            places = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/span/em/text()')
            pays = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[2]/div/span/text()')
            works = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[2]/div[2]/text()')
            descriptions = resp.xpath('//*[@id="s_position_list"]/ul/li/div[2]/div[2]/text()')
            needs = resp.xpath('//*[@id="s_position_list"]/ul/li/div[2]/div[1]/span/text()')
            for company, position, place, pay, work, description, need in zip(companies, positions, places, pays, works, descriptions, needs):
                record = {
                    '公司': company.strip(),
                    '职位': position.strip(),
                    '工作地点': place.strip(),
                    '薪资': pay.strip(),
                    '工作职务': work.strip(),
                    '公司描述': description.strip(),
                    '能力需要': need.strip()
                }
                storeData(record, name)
            time.sleep(1)
# Insert one job record into the given collection of the 拉勾网 database
def storeData(data, DbSheet):
    client = MongoClient()
    db = client['拉勾网']
    mySet = db[DbSheet]
    mySet.insert_one(data)
    print("保存成功")
if __name__ == '__main__':
    urlResp()
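
After the crawl finishes, the results sit in the local MongoDB database 拉勾网, one collection per job category. The snippet below is a minimal sketch for checking how many records each collection ended up with; it assumes MongoDB is running on the default localhost:27017 and a reasonably recent pymongo (list_collection_names requires 3.6+, count_documents 3.7+):

from pymongo import MongoClient

client = MongoClient()   # default local MongoDB instance
db = client['拉勾网']     # the database written by storeData()
for name in db.list_collection_names():
    # Print each category collection and how many job records it holds
    print(name, db[name].count_documents({}))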