code
程序运行状态:
爬取的数据如下:
以下是代码,可以跑一下。
import csv
import json
import re
import time
from urllib.parse import quote

import keyboard
import parsel
import requests
from tqdm import tqdm
# import sys,io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030'
# Search-result page for one keyword; the keyword is percent-encoded before
# being substituted in.
SEARCH_URL = (
    "https://search.jd.com/Search?keyword={keyword}"
    "&enc=utf-8&spm=a.0.0&pvid=afbd6301d462459e8b9c76c6971b7970"
)
# Comment-summary endpoint; the product id is appended to it.
COMMENT_URL = (
    'https://club.jd.com/comment/productCommentSummaries.action?referenceIds='
)
# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# NOTE(review): the cookie below is a captured personal browser session that
# is hard-coded in the source. It will expire and should not be published;
# consider loading it from an environment variable or a local config file.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
    'cookie': 'shshshfpb=dbs9Xefv-Aljqb4flP-PPqQ; __jdu=1671543727423255788985; shshshfpa=1538e575-d61c-f03f-2b6e-48741699893e-1672842985; shshshfpx=1538e575-d61c-f03f-2b6e-48741699893e-1672842985; qrsc=3; __jdv=76161171|cn.bing.com|-|referral|-|1680667832249; CA1AN5BV0CA8DS2EPC=341665da0557845c31bacbfba2d76092; PCA9D23F7A4B3CSS=bc87c9fb4a213479511892c356b60be5; areaId=16; ipLoc-djd=16-1315-0-0; 3AB9D23F7A4B3CSS=jdd03JCL3XWFR65YR36N5CGNWTKQK7DFVAXLFCM4UVVT65274UVDPWTC34ZROQHJKYSKENFK3Q6GGXGNDYAWJHRWE36WGEUAAAAMHJ6N7UVAAAAAAC3FSSQFGPBYPNUX; _gia_d=1; PCSYCityID=CN_350000_350200_0; xapieid=jdd03JCL3XWFR65YR36N5CGNWTKQK7DFVAXLFCM4UVVT65274UVDPWTC34ZROQHJKYSKENFK3Q6GGXGNDYAWJHRWE36WGEUAAAAMHJ6N7UVAAAAAAC3FSSQFGPBYPNUX; jsavif=0; rkv=1.0; __jda=76161171.1671543727423255788985.1671543727.1678016252.1680667832.10; __jdb=76161171.3.1671543727423255788985|10.1680667832; __jdc=76161171; shshshfp=826cc8bb32f2b10b172a709a99548fa8; shshshsID=98543cc16e240ca3d073234ac8adb74c_3_1680667848353; 3AB9D23F7A4B3C9B=JCL3XWFR65YR36N5CGNWTKQK7DFVAXLFCM4UVVT65274UVDPWTC34ZROQHJKYSKENFK3Q6GGXGNDYAWJHRWE36WGEU',
}

CSV_HEADER = ['商品名称', '价格', '评分', '差评数', '总评论数', '推荐购买系数', '核验网址']

# Values written for a product whose comment summary could not be fetched.
EMPTY_STATS = {'score': 0, 'poor': '0', 'total': '0', 'recommend': '0'}

_BRACKETS_RE = re.compile(r'[\[\]]')
_PRODUCT_ID_RE = re.compile(r'/(\d+)\.html')
_NAME_MARKUP_RE = re.compile(r'<em>|<font.*?>|</font>|</em>|<span.*</span>')
_COUNT_RE = re.compile(r'(\d+(?:\.\d+)?)\s*([万亿])?')
_COUNT_UNITS = {'万': 10_000, '亿': 100_000_000}
_UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|]')


def split_keywords(raw):
    """Turn one line of user input into a list of search keywords.

    Input containing ``[`` is treated as a comma-separated list such as
    ``[phone, laptop]``: the brackets are removed and every item is
    whitespace-stripped. Any other input is a single keyword. Blank items
    (and blank input) are dropped.
    """
    if '[' in raw:
        items = _BRACKETS_RE.sub('', raw).split(',')
    else:
        items = [raw]
    return [item.strip() for item in items if item.strip()]


def parse_count(text):
    """Convert a count string such as ``'300+'`` or ``'2万+'`` to an int.

    The first number in the string is used; a trailing 万 / 亿 unit is
    applied as a multiplier.

    Raises:
        ValueError: if the string contains no digits.
    """
    match = _COUNT_RE.search(str(text))
    if match is None:
        raise ValueError(f'no count found in {text!r}')
    number, unit = match.groups()
    return int(round(float(number) * _COUNT_UNITS.get(unit, 1)))


def extract_product_id(href):
    """Return the numeric product id from an item link, or None if absent."""
    match = _PRODUCT_ID_RE.search(href)
    return match.group(1) if match else None


def clean_name(markup):
    """Strip the <em>/<font>/<span> markup from a product-name fragment."""
    return _NAME_MARKUP_RE.sub('', markup)


def safe_filename(keyword):
    """Replace characters that are not allowed in file names with ``_``."""
    return _UNSAFE_FILENAME_RE.sub('_', keyword)


def recommend_score(summary):
    """Compute the purchase-recommendation coefficient as a string.

    The coefficient is a weighted sum of the poor, good, follow-up and
    general comment counts divided by 100 times the ``DefaultGoodCountStr``
    count. ``'0'`` is returned when a field is missing, cannot be parsed, or
    the divisor is zero.
    """
    try:
        poor = parse_count(summary['PoorCountStr'])
        good = parse_count(summary['GoodCountStr'])
        after = parse_count(summary['AfterCountStr'])
        general = parse_count(summary['GeneralCountStr'])
        total = parse_count(summary['DefaultGoodCountStr'])
        weighted = poor * 30 + good * 20 + after * 40 + general * 10
        return str(weighted / (100 * total))
    except (KeyError, ValueError, ZeroDivisionError):
        return str(0)


def collect_stats(href):
    """Fetch the comment summary for one product link.

    Returns a dict with ``score``, ``poor``, ``total`` and ``recommend``.
    Links without a product id, failed requests and unexpected payloads
    yield a copy of ``EMPTY_STATS`` so one bad product does not abort the run.
    """
    product_id = extract_product_id(href)
    if product_id is None:
        return dict(EMPTY_STATS)
    try:
        response = requests.get(
            COMMENT_URL + product_id, headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        # Parse the payload once and reuse it for every field.
        summary = json.loads(response.text)['CommentsCount'][0]
        return {
            'score': summary['GoodRateShow'],
            'poor': summary['PoorCountStr'],
            # NOTE(review): the CSV column is labelled "total comments" but
            # the field read here is DefaultGoodCountStr — confirm which
            # field is intended.
            'total': summary['DefaultGoodCountStr'],
            'recommend': recommend_score(summary),
        }
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        return dict(EMPTY_STATS)


def scrape_keyword(keyword):
    """Search for *keyword*, gather comment stats and write ``<keyword>.csv``."""
    url = SEARCH_URL.format(keyword=quote(keyword))
    print(url)
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'搜索请求失败: {exc}')
        return

    page = parsel.Selector(response.text)
    prices = page.xpath('//div[@class="p-price"]/strong/i/text()').getall()
    names = page.xpath('//div[@class="p-name p-name-type-2"]/a/em').getall()
    hrefs = page.xpath('//div[@class="p-name p-name-type-2"]/a/@href').getall()

    print('正在获取评论信息,并估算购买系数')
    stats = [collect_stats(href) for href in tqdm(hrefs)]

    print('正在写入文档')
    # The three lists come from separate queries; zip() stops at the shortest
    # so a length mismatch cannot raise IndexError. Rows are assumed to be in
    # the same page order — verify if prices ever look shifted.
    rows = list(zip(names, prices, hrefs, stats))
    # newline='' lets the csv module control line endings; it also quotes
    # product names that contain commas.
    with open(f'{safe_filename(keyword)}.csv', 'w', encoding='gb18030',
              newline='') as f:
        writer = csv.writer(f)
        writer.writerow(CSV_HEADER)
        for name, price, href, stat in tqdm(rows):
            if not price:
                continue
            writer.writerow([
                clean_name(name),
                price,
                str(stat['score']),
                stat['poor'],
                stat['total'],
                stat['recommend'],
                href,
            ])


def main():
    """Prompt for keywords until ``quit`` is entered and scrape each one.

    A plain line is one keyword; a bracketed, comma-separated line such as
    ``[a,b,c]`` is a batch, and each keyword gets its own CSV file.
    """
    while True:
        try:
            raw = input("请输入")
        except EOFError:
            break
        if raw.strip() == "quit":
            break
        for keyword in split_keywords(raw):
            scrape_keyword(keyword)
        if '[' in raw:
            print("交付任务已完成")


if __name__ == "__main__":
    main()