Lagou Crawler Output

Posted on 2016-10-21 | Category: Crawlers

Code for crawling all of Lagou's job data:

```python
import requests
from lxml import etree
import time
import random
from pymongo import MongoClient
from fake_useragent import UserAgent

# Pool of HTTP proxies; one is chosen at random for the whole run
proxiesList = [
    'http://120.32.209.231:8118',
    'http://210.82.36.142:80',
    'http://123.125.116.151:80',
    'http://113.5.80.144:8080',
    'http://122.224.227.202:3128',
    'http://220.191.214.176:8080'
]
proxies = {'http': random.choice(proxiesList)}

# Random browser User-Agent
userAgent = UserAgent().random

# Request headers (the Cookie carries a logged-in session)
headers = {
    'Cookie': 'user_trace_token=20171110210441-b6e83f7b-c617-11e7-86f0-525400f775ce; LGUID=20171110210441-b6e847a6-c617-11e7-86f0-525400f775ce; X_HTTP_TOKEN=c2758cd47b96276a5a741c7c8aef7389; TG-TRACK-CODE=index_search; SEARCH_ID=d3457fbe235a4f4c8dd0b8a8e8ce93dd; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAACEBACDGB5AE0EFB5452955E2ED72F2B5D254B8E; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510319081,1510652275; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510665850; _gat=1; _ga=GA1.2.739478823.1510319081; _gid=GA1.2.1539303787.1510652275; LGSID=20171114212410-195b1e64-c93f-11e7-8eeb-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGRID=20171114212410-195b1fe3-c93f-11e7-8eeb-525400f775ce; _putrc=11E3FE8A2D6757DB; login=true; unick=%E6%9D%8E%E4%BA%8C%E8%8A%B1; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0',
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': userAgent
}

# Collect every job-category link from the home-page sidebar
def getAllUrl():
    mainPageUrl = 'https://www.lagou.com/'
    responses = requests.get(mainPageUrl, headers=headers).text
    lagouHtml = etree.HTML(responses)
    urlNames = lagouHtml.xpath('//div[@class="sidebar"]/div/div/div[2]/dl/dd/a/text()')
    urls = lagouHtml.xpath('//div[@class="sidebar"]/div/div/div[2]/dl/dd/a/@href')
    for urlName, url in zip(urlNames, urls):
        info = {
            'urlName': urlName,
            'url': url
        }
        yield info
        # print(info)

# Save one category link to MongoDB (defined but never called in the main flow)
def saveSubUrl(info):
    client = MongoClient()
    db = client.拉勾主站所有网址
    sheet = db.网址明细
    sheet.insert_one(info)

# Crawl every category collected from the home page, page by page
def urlResp():
    for item in getAllUrl():
        statusCode = 200
        i = 1  # page number appended to the category URL
        while statusCode == 200:
            url = item['url'] + str(i)
            try:
                responses = requests.get(url, headers=headers, proxies=proxies)
            except requests.RequestException:
                continue  # network/proxy error: retry the same page
            statusCode = responses.status_code
            resp = etree.HTML(responses.text)
            i = i + 1
            # pymongo rejects collection names starting with '.', so rename '.NET'
            if item['urlName'] == '.NET':
                name = 'NET'
            else:
                name = item['urlName']
            companies = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[2]/div[1]/a/text()')
            positions = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/h3/text()')
            places = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/span/em/text()')
            pays = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[2]/div/span/text()')
            works = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[2]/div[2]/text()')
            descriptions = resp.xpath('//*[@id="s_position_list"]/ul/li/div[2]/div[2]/text()')
            needs = resp.xpath('//*[@id="s_position_list"]/ul/li/div[2]/div[1]/span/text()')
            for company, position, place, pay, work, description, need in zip(
                    companies, positions, places, pays, works, descriptions, needs):
                record = {
                    '公司': company.strip(),          # company
                    '职位': position.strip(),         # position
                    '工作地点': place.strip(),        # location
                    '薪资': pay.strip(),              # salary
                    '工作职务': work.strip(),         # experience/education line
                    '公司描述': description.strip(),  # company blurb
                    # need is one string per posting here; the original
                    # [i.strip() for i in need] iterated it character by character
                    '能力需要': need.strip()          # skill requirement
                }
                storeData(record, name)
            time.sleep(1)  # throttle between pages

# Insert one job record into a per-category MongoDB collection
def storeData(data, DbSheet):
    client = MongoClient()
    db = client['拉勾网']
    mySet = db[DbSheet]
    mySet.insert_one(data)
    print("保存成功")  # "saved successfully"

if __name__ == '__main__':
    urlResp()
```
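Free proxies like the ones hardcoded in `proxiesList` tend to die quickly, and a dead proxy makes the `while` loop retry the same page forever. One way around that is to filter the list before picking one. This is a minimal sketch, not part of the original script; the test endpoint `http://httpbin.org/ip` is an arbitrary choice, and any stable URL would do:

```python
import random
import requests

# Sample of the crawler's proxy pool (see proxiesList above)
proxiesList = [
    'http://120.32.209.231:8118',
    'http://210.82.36.142:80',
]

def aliveProxies(candidates, timeout=5):
    """Return the proxies that complete a simple request within `timeout` seconds."""
    alive = []
    for p in candidates:
        try:
            # httpbin.org/ip just echoes the caller's IP (hypothetical test URL)
            requests.get('http://httpbin.org/ip', proxies={'http': p}, timeout=timeout)
            alive.append(p)
        except requests.RequestException:
            pass  # dead or too slow: drop it
    return alive

usable = aliveProxies(proxiesList)
proxies = {'http': random.choice(usable)} if usable else {}  # {} = connect directly
```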
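To check what the crawler actually saved, you can read the per-category collections back out of MongoDB. A minimal sketch, assuming the same local default connection `storeData()` uses and pymongo ≥ 3.7 (for `list_collection_names()` and `count_documents()`):

```python
from pymongo import MongoClient

client = MongoClient()  # same local default as storeData()
db = client['拉勾网']    # database that storeData() writes into

# One collection per job category, named after urlName (e.g. 'Java', 'NET')
for sheetName in db.list_collection_names():
    sheet = db[sheetName]
    print(sheetName, sheet.count_documents({}), 'records')
    for doc in sheet.find().limit(3):  # peek at a few saved postings
        print(' ', doc['公司'], doc['职位'], doc['薪资'])
```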