Lagou Crawler Output

Full code for crawling all job-listing data from the Lagou homepage categories and saving it to MongoDB:

import requests
from lxml import etree
import time
import random
from pymongo import MongoClient
from fake_useragent import UserAgent

# IP proxy pool (public proxies; they may no longer be reachable)
proxiesList = [
    'http://120.32.209.231:8118',
    'http://210.82.36.142:80',
    'http://123.125.116.151:80',
    'http://113.5.80.144:8080',
    'http://122.224.227.202:3128',
    'http://220.191.214.176:8080'
]
proxies = {'http': random.choice(proxiesList)}
# Random browser User-Agent
userAgent = UserAgent().random
# Request headers (the Cookie is tied to a logged-in session and will expire)
headers = {
    'Cookie': 'user_trace_token=20171110210441-b6e83f7b-c617-11e7-86f0-525400f775ce; LGUID=20171110210441-b6e847a6-c617-11e7-86f0-525400f775ce; X_HTTP_TOKEN=c2758cd47b96276a5a741c7c8aef7389; TG-TRACK-CODE=index_search; SEARCH_ID=d3457fbe235a4f4c8dd0b8a8e8ce93dd; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAACEBACDGB5AE0EFB5452955E2ED72F2B5D254B8E; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510319081,1510652275; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510665850; _gat=1; _ga=GA1.2.739478823.1510319081; _gid=GA1.2.1539303787.1510652275; LGSID=20171114212410-195b1e64-c93f-11e7-8eeb-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGRID=20171114212410-195b1fe3-c93f-11e7-8eeb-525400f775ce; _putrc=11E3FE8A2D6757DB; login=true; unick=%E6%9D%8E%E4%BA%8C%E8%8A%B1; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0',
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': userAgent
}
# Collect every category name and URL from the homepage sidebar
def getAllUrl():
    mainPageUrl = 'https://www.lagou.com/'
    responses = requests.get(mainPageUrl, headers=headers).text
    lagouHtml = etree.HTML(responses)
    urlNames = lagouHtml.xpath('//div[@class="sidebar"]/div/div/div[2]/dl/dd/a/text()')
    urls = lagouHtml.xpath('//div[@class="sidebar"]/div/div/div[2]/dl/dd/a/@href')
    for urlName, url in zip(urlNames, urls):
        record = {
            'urlName': urlName,
            'url': url
        }
        yield record
# Save one category URL record (not called in __main__, kept for reuse)
def saveSubUrl(record):
    client = MongoClient()
    db = client.拉勾主站所有网址
    sheet = db.网址明细
    sheet.insert_one(record)
# Crawl every result page of each category URL collected from the homepage
def urlResp():
    for item in getAllUrl():
        statusCode = 200
        page = 1
        # Keep requesting the next page until the site stops returning 200
        while statusCode == 200:
            url = item['url'] + str(page)
            try:
                responses = requests.get(url, headers=headers, proxies=proxies)
            except requests.RequestException:
                # Retry the same page if the request (or proxy) fails
                continue
            statusCode = responses.status_code
            resp = etree.HTML(responses.text)
            page = page + 1
            # '.NET' would put a '.' into the MongoDB collection name, so drop it
            if item['urlName'] == '.NET':
                name = 'NET'
            else:
                name = item['urlName']
            companies = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[2]/div[1]/a/text()')
            positions = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/h3/text()')
            places = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/span/em/text()')
            pays = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[2]/div/span/text()')
            works = resp.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[2]/div[2]/text()')
            descriptions = resp.xpath('//*[@id="s_position_list"]/ul/li/div[2]/div[2]/text()')
            needs = resp.xpath('//*[@id="s_position_list"]/ul/li/div[2]/div[1]/span/text()')
            for company, position, place, pay, work, description, need in zip(companies, positions, places, pays, works, descriptions, needs):
                record = {
                    '公司': company.strip(),
                    '职位': position.strip(),
                    '工作地点': place.strip(),
                    '薪资': pay.strip(),
                    '工作职务': work.strip(),
                    '公司描述': description.strip(),
                    '能力需要': need.strip()
                }
                storeData(record, name)
            time.sleep(1)
# Insert one job record into the given collection of the 拉勾网 database
def storeData(data, DbSheet):
    client = MongoClient()
    db = client['拉勾网']
    mySet = db[DbSheet]
    mySet.insert_one(data)
    print("保存成功")
if __name__ == '__main__':
    urlResp()
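
After the crawl finishes, the results sit in the local MongoDB database 拉勾网, one collection per job category. The snippet below is a minimal sketch for checking how many records each collection ended up with; it assumes MongoDB is running on the default localhost:27017 and a reasonably recent pymongo (list_collection_names requires 3.6+, count_documents 3.7+):

from pymongo import MongoClient

client = MongoClient()   # default local MongoDB instance
db = client['拉勾网']     # the database written by storeData()
for name in db.list_collection_names():
    # Print each category collection and how many job records it holds
    print(name, db[name].count_documents({}))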