jd商品数据爬取

python · 04-06

code

程序运行状态:
(此处原为程序运行状态的截图,图片未能保留)
爬取的数据如下:
(此处原为爬取结果数据的截图,图片未能保留)
以下是代码,可以跑一下。

import csv
import json
import re
import time
from urllib.parse import quote

import keyboard
import parsel
import requests
from tqdm import tqdm
# import sys,io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030'
# Search-results page; ``{keyword}`` is filled in URL-encoded.
SEARCH_URL = "https://search.jd.com/Search?keyword={keyword}&enc=utf-8&spm=a.0.0&pvid=afbd6301d462459e8b9c76c6971b7970"
# Comment-summary endpoint; the product id is appended to it.
COMMENT_SUMMARY_URL = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds='
# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10
CSV_HEADER = ['商品名称', '价格', '评分', '差评数', '总评论数', '推荐购买系数', '核验网址']
# Multipliers for the unit suffixes that may follow a count string.
COUNT_UNITS = {'': 1, '万': 10000, '亿': 100000000}
COUNT_PATTERN = re.compile(r'(\d+(?:\.\d+)?)\s*([万亿]?)')

# NOTE(review): the cookie below is a hard-coded browser session captured by
# the original author; it is presumably expired by now and should be replaced
# with your own (ideally loaded from outside the source file).
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
    'cookie': 'shshshfpb=dbs9Xefv-Aljqb4flP-PPqQ; __jdu=1671543727423255788985; shshshfpa=1538e575-d61c-f03f-2b6e-48741699893e-1672842985; shshshfpx=1538e575-d61c-f03f-2b6e-48741699893e-1672842985; qrsc=3; __jdv=76161171|cn.bing.com|-|referral|-|1680667832249; CA1AN5BV0CA8DS2EPC=341665da0557845c31bacbfba2d76092; PCA9D23F7A4B3CSS=bc87c9fb4a213479511892c356b60be5; areaId=16; ipLoc-djd=16-1315-0-0; 3AB9D23F7A4B3CSS=jdd03JCL3XWFR65YR36N5CGNWTKQK7DFVAXLFCM4UVVT65274UVDPWTC34ZROQHJKYSKENFK3Q6GGXGNDYAWJHRWE36WGEUAAAAMHJ6N7UVAAAAAAC3FSSQFGPBYPNUX; _gia_d=1; PCSYCityID=CN_350000_350200_0; xapieid=jdd03JCL3XWFR65YR36N5CGNWTKQK7DFVAXLFCM4UVVT65274UVDPWTC34ZROQHJKYSKENFK3Q6GGXGNDYAWJHRWE36WGEUAAAAMHJ6N7UVAAAAAAC3FSSQFGPBYPNUX; jsavif=0; rkv=1.0; __jda=76161171.1671543727423255788985.1671543727.1678016252.1680667832.10; __jdb=76161171.3.1671543727423255788985|10.1680667832; __jdc=76161171; shshshfp=826cc8bb32f2b10b172a709a99548fa8; shshshsID=98543cc16e240ca3d073234ac8adb74c_3_1680667848353; 3AB9D23F7A4B3C9B=JCL3XWFR65YR36N5CGNWTKQK7DFVAXLFCM4UVVT65274UVDPWTC34ZROQHJKYSKENFK3Q6GGXGNDYAWJHRWE36WGEU',
}


def parse_keywords(raw):
    """Turn one line of user input into the list of keywords to search.

    Input without ``[`` is a single keyword.  Input containing ``[`` is a
    batch such as ``[手机, 电脑]``: the brackets are removed and the rest is
    split on ASCII or full-width commas.  Blank entries are dropped, so blank
    input yields an empty list.
    """
    if '[' not in raw:
        keyword = raw.strip()
        return [keyword] if keyword else []
    inner = re.sub(r'[\[\]]', '', raw)
    return [part.strip() for part in re.split(r'[,，]', inner) if part.strip()]


def parse_count(text):
    """Convert a count string such as ``"200+"`` or ``"1.5万+"`` to an int.

    Raises:
        ValueError: if ``text`` contains no number.
    """
    match = COUNT_PATTERN.search(str(text))
    if match is None:
        raise ValueError(f'no number in {text!r}')
    number, unit = match.groups()
    # round() guards against float artefacts such as 2.3 * 10000.
    return round(float(number) * COUNT_UNITS[unit])


def purchase_score(summary):
    """Compute the purchase coefficient from one comment-summary dict.

    The weights (poor 30, good 20, follow-up 40, neutral 10, divided by
    100 x the ``DefaultGoodCountStr`` count) are kept from the original
    script.  Returns ``0.0`` when a field is missing, unparsable, or the
    denominator is zero.
    """
    try:
        poor = parse_count(summary['PoorCountStr'])
        good = parse_count(summary['GoodCountStr'])
        after = parse_count(summary['AfterCountStr'])
        general = parse_count(summary['GeneralCountStr'])
        total = parse_count(summary['DefaultGoodCountStr'])
        return (poor * 30 + good * 20 + after * 40 + general * 10) / (100 * total)
    except (KeyError, ValueError, ZeroDivisionError):
        return 0.0


def sku_from_href(href):
    """Extract the product id from a link like ``//item.jd.com/123.html``."""
    match = re.search(r'/(\d+)\.html', href)
    if match:
        return match.group(1)
    # Fallback: strip everything up to "com/" and the ".html" suffix.
    return re.sub(r'.*com/|\.html', '', href)


def clean_name(fragment):
    """Strip the ``<em>``/``<font>``/``<span>`` markup from a product title."""
    return re.sub(r'<em>|<font.*?>|</font>|</em>|<span.*</span>', '', fragment)


def absolute_url(href):
    """Give protocol-relative links (``//host/...``) an ``https:`` scheme."""
    return 'https:' + href if href.startswith('//') else href


def safe_filename(keyword):
    """Replace characters that are not allowed in file names with ``_``."""
    return re.sub(r'[\\/:*?"<>|]', '_', keyword)


def fetch_comment_summary(sku):
    """Fetch the comment summary for one product id.

    Returns the first entry of the response's ``CommentsCount`` list, or an
    empty dict when the request fails or the payload has another shape, so a
    single bad product does not abort the whole run.
    """
    try:
        response = requests.get(
            COMMENT_SUMMARY_URL + sku, headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        return json.loads(response.text)['CommentsCount'][0]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        return {}


def scrape_keyword(keyword):
    """Search one keyword and write the results to ``<keyword>.csv``.

    Raises:
        requests.RequestException: if the search page itself cannot be fetched.
    """
    url = SEARCH_URL.format(keyword=quote(keyword))
    print(url)

    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    selector = parsel.Selector(response.text)

    # NOTE(review): prices, names and links come from three independent
    # queries and are paired by position; if the page contains price nodes
    # without a matching name node the rows may be misaligned -- confirm
    # against the live page.
    prices = selector.xpath('//div[@class="p-price"]/strong/i/text()').getall()
    names = selector.xpath('//div[@class="p-name p-name-type-2"]/a/em').getall()
    hrefs = selector.xpath('//div[@class="p-name p-name-type-2"]/a/@href').getall()

    print('正在获取评论信息,并估算购买系数')
    summaries = [fetch_comment_summary(sku_from_href(href)) for href in tqdm(hrefs)]

    print('正在写入文档')
    names = [clean_name(fragment) for fragment in tqdm(names)]

    rows = []
    # zip() stops at the shortest list, so uneven query results cannot
    # raise IndexError.
    for name, price, href, summary in zip(names, prices, hrefs, summaries):
        if not price:
            continue
        rows.append([
            name,
            price,
            summary.get('GoodRateShow', 0),
            summary.get('PoorCountStr', 'o'),
            # NOTE(review): the "总评论数" column is filled from
            # DefaultGoodCountStr, as in the original -- confirm that this is
            # the intended field.
            summary.get('DefaultGoodCountStr', 'o'),
            purchase_score(summary),
            absolute_url(href),
        ])

    # newline='' lets the csv module control line endings; csv.writer quotes
    # names that contain commas or quotes so columns stay aligned.
    with open(f'{safe_filename(keyword)}.csv', 'w', encoding='gb18030', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(CSV_HEADER)
        writer.writerows(rows)


def main():
    """Prompt for keywords until the user types ``quit``."""
    while True:
        thinks = input("请输入")
        if thinks == "quit":
            break
        for keyword in parse_keywords(thinks):
            try:
                scrape_keyword(keyword)
            except requests.RequestException as exc:
                print(f"请求失败: {exc}")
        if '[' in thinks:
            print("交付任务已完成")


if __name__ == "__main__":
    main()
Theme Jasmine by Kent Liao