本文實例講述了Python實現爬取亞馬遜數據并打印出Excel文件操作。分享給大家供大家參考,具體如下:
python大神們別噴,代碼寫的很粗糙,主要是完成功能,能夠借鑒就看下吧,我是學java的,畢竟不是學python的,自己自學看了一點點python,望諒解。
#!/usr/bin/env python3
# encoding=UTF-8
import sys
import re
import urllib.request
import json
import time
import zlib
from html import unescape
import threading
import os
import xlwt
import math
import requests
# Raise the interpreter's recursion limit.
# NOTE(review): the original comment described this as "one million", but the
# value passed below is 1,000,000,000 — confirm which limit is intended.
sys.setrecursionlimit(1000000000)
## Fetch all category names
def getProUrl():
    """Return the category names found on the Amazon.cn landing page.

    Sends one POST request for page 0 of the landing URL, decodes the
    response as UTF-8 and collects every value that appears in a
    ``"category" : "<value>"`` pair of the response text.

    Returns:
        list[str]: The captured category names in document order; an empty
        list when the page contains no such pair.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}
    furl = "https://www.amazon.cn/?tag=baidu250-23&hvadid={creative}&ref=pz_ic_22fvxh4dwf_e&page="
    category_pattern = re.compile('"category" : "' + '(.*?)' + '"', re.S)
    # The session is used as a context manager so its connections are released.
    with requests.Session() as session:
        # The original looped over range(0, 1), i.e. exactly one request for page 0.
        response = session.post(furl + str(0), headers=headers)
        response.encoding = 'utf-8'
        return category_pattern.findall(response.text)
## Build the search URL for one category
def getUrlData(ci):
    """Return the Amazon.cn search URL for keyword ``ci``.

    The keyword is percent-encoded (UTF-8) before it is placed in the query
    string, so characters such as ``&``, ``#`` or spaces cannot break the
    ``field-keywords`` parameter or leak into the following parameters.

    Args:
        ci: Category name / search keyword.

    Returns:
        str: URL of the first result page, sorted by review rank.
    """
    from urllib.parse import quote

    url = (
        "https://www.amazon.cn/s/ref=nb_sb_noss_2?__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&url=search-alias%3Daps&field-keywords="
        + quote(ci, safe="")
        + "&page=1&sort=review-rank"
    )
    return url
## Throttle helper: pause for three seconds before continuing
def fun_timer():
    """Block the caller for three seconds."""
    pause_seconds = 3
    time.sleep(pause_seconds)
## Download the search-result page of every category
def getProData(allUrlList):
    """Fetch the search-result HTML for each category name.

    For every entry the function waits via ``fun_timer()``, requests the URL
    produced by ``getUrlData`` with a fresh session, decodes the response as
    UTF-8, prints the page text and stores it.

    Args:
        allUrlList: Iterable of category names to search for.

    Returns:
        list[str]: One HTML document per category, in input order.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}
    webContentHtmlList = []
    for ci in allUrlList:
        # Pause between requests before hitting the site again.
        fun_timer()
        # One session per request, as in the original, but closed deterministically.
        with requests.Session() as session:
            response = session.get(getUrlData(ci), headers=headers)
        # Force the text encoding.
        response.encoding = 'utf-8'
        page_text = response.text
        print(page_text)
        webContentHtmlList.append(page_text)
    return webContentHtmlList
## Crawl the categories, extract the product fields and export them
def getProValue():
    """Run the whole crawl and write the collected rows to an .xls file.

    Steps:
      1. Get the category names from ``getProUrl()`` and drop two
         navigation labels when they are present.
      2. Download the result page of the first 40 categories with
         ``getProData``.
      3. For result slots 0-14 of every page, build a row of
         [category, title, price, seller count, star rating, comment count]
         using the ``getPro*`` / ``getSellerCount`` extractors.
      4. Pass all rows to ``createTable`` under a date-prefixed file name.

    The original distributed the categories over eight five-element lists
    with a chain of independent ``if`` statements that re-tested the index
    they had just incremented, so categories on a bucket boundary were
    fetched twice. The lists were then processed strictly in order, so a
    single slice of the first 40 categories gives each category exactly one
    request and one set of rows.
    """
    max_categories = 40
    results_per_page = 15

    urlList = getProUrl()
    # NOTE(review): these two labels look mis-encoded; confirm them against
    # the text the site actually returns. Missing labels are skipped instead
    # of raising ValueError.
    for unwanted in ('全部分類', 'Prime會(huì)員優(yōu)先購(gòu)'):
        if unwanted in urlList:
            urlList.remove(unwanted)

    webContentHtmlList = getProData(urlList[:max_categories])

    ## Collection holding every extracted row
    dataTwoAllList1 = []
    print("開(kāi)始檢索數(shù)據(jù),檢索數(shù)據(jù)中..........")
    for html in webContentHtmlList:
        for i in range(results_per_page):
            dataList = [
                unescape(getProCategory(html, i)),
                unescape(getProTitle(html, i)),
                getProPrice(html, i),
                getSellerCount(html, i),
                getProStar(html, i),
                getProCommentCount(html, i),
            ]
            print(dataList)
            dataTwoAllList1.append(dataList)
    print("檢索數(shù)據(jù)完成!!!!")
    print("開(kāi)始保存并打印Excel文檔數(shù)據(jù)!!!!")
    ## Save the workbook
    createTable(time.strftime("%Y%m%d") + '亞馬遜銷量數(shù)據(jù)統(tǒng)計(jì).xls', dataTwoAllList1)
## Extract the category
# Compiles a pattern with re.S, runs findall over the whole page and returns ""
# when nothing matches. The parameter i is overwritten with 0 on entry.
# NOTE(review): this block is damaged. The pattern literal is split across lines
# with its delimiters missing, and the body breaks off after `if i`. The
# statements that follow (ending in the "¥0" fallback) presumably belong to a
# separate extractor whose `def` line was lost — TODO restore from the original.
def getProCategory(html,i):
i = 0;
name = '
' + '(.*?)' + '
'
reg=re.compile(name,re.S)
items = reg.findall(html)
if len(items)==0:
return ""
else:
if i
' + '(.*?)' + ''
reg=re.compile(name,re.S)
items = reg.findall(html)
if len(items)==0:
return "¥0"
else:
return items[0]
## Extract the seller count
# Narrows html to the fragment returned by getHtmlById(html, i), searches that
# fragment with a re.S pattern and hands every match to checkSellerCount
# starting at index 0. Returns the zero-seller placeholder when nothing matches.
# NOTE(review): the pattern literal below is split across lines and its
# delimiter text appears to be missing — TODO restore the original pattern.
def getSellerCount(html,i):
html = getHtmlById(html,i)
name = '
' + '(.*?)' + '
'
reg=re.compile(name,re.S)
items = reg.findall(html)
if len(items)==0:
return "(0 賣家)"
else:
return checkSellerCount(items,0)
## Validate the seller-count candidates
def checkSellerCount(items, i):
    """Return the seller-count text from ``items``, or the zero placeholder.

    Looks at ``items[i]`` and, only when that entry lacks the seller keyword,
    at ``items[i + 1]``. An entry is accepted when it contains the keyword and
    is at most 9 characters long; in every other case the placeholder is
    returned.

    The original also carried a third-item check, but it was placed after
    unconditional ``return`` statements (so it never ran) and compared the
    index against ``len(items[i])`` instead of ``len(items)``. That dead
    branch is removed; reachable behaviour is unchanged.

    Args:
        items: Non-empty list of candidate strings.
        i: Index of the first candidate to inspect.

    Returns:
        str: The accepted candidate or ``'(0 賣家)'``.

    Raises:
        IndexError: If ``i`` is outside ``items``.
    """
    placeholder = '(0 賣家)'
    keyword = '賣家'
    max_length = 9

    first = items[i]
    if keyword in first:
        # A keyword hit on the first entry is final, even when it is too long.
        return first if len(first) <= max_length else placeholder

    if i + 1 < len(items):
        second = items[i + 1]
        if keyword in second and len(second) <= max_length:
            return second
    return placeholder
## Extract the star rating
# Narrows html to the fragment returned by getHtmlById(html, i), searches that
# fragment with a re.S pattern and hands every match to checkProStar starting
# at index 0. Returns the zero-star placeholder when nothing matches.
# NOTE(review): the pattern literal below is split across lines and its
# delimiter text appears to be missing — TODO restore the original pattern.
def getProStar(html,i):
html = getHtmlById(html,i)
name = '
' + '(.*?)' + '
'
reg=re.compile(name,re.S)
items = reg.findall(html)
if len(items)==0:
return "平均 0 星"
else:
return checkProStar(items,0)
## Validate the star-rating candidates
def checkProStar(items, i):
    """Return the first of ``items[i]`` / ``items[i + 1]`` containing '星'.

    Only those two positions are inspected. When neither holds the
    character, or there is no second entry, the zero-star placeholder is
    returned.
    """
    fallback = '平均 0 星'
    marker = '星'

    current = items[i]
    if marker in current:
        return current

    following = i + 1
    if following < len(items) and marker in items[following]:
        return items[following]
    return fallback
## Extract the product comment count (sales volume)
# Compiles a pattern with re.S, runs findall over the whole page and returns "0"
# when nothing matches.
# NOTE(review): this block is damaged. The pattern literal is empty apart from a
# line break, the body breaks off after `if i`, and the bare `56` line below
# looks like extraction residue — TODO restore from the original.
56
def getProCommentCount(html,i):
name = '
'
reg=re.compile(name,re.S)
items = reg.findall(html)
if len(items)==0:
return "0"
else:
if i
")
else:
return "0"
## Find the opening tag that carries a given id
def get_id_tag(content, id_name):
    """Return the first tag in ``content`` whose ``id`` attribute is ``id_name``.

    The id is matched literally: it is escaped before being placed in the
    pattern, so characters such as ``.`` or ``(`` in ``id_name`` are not
    read as regex syntax. Matching is case-insensitive. The id value may be
    quoted or unquoted but must be followed by a quote or a space.

    Args:
        content: HTML text to search.
        id_name: Id value to look for; surrounding whitespace is ignored.

    Returns:
        str: The full matching tag (``<...>``), or ``""`` when none is found.
    """
    id_name = id_name.strip()
    patt_id_tag = """<[^>]*id=['"]?""" + re.escape(id_name) + """['" ][^>]*>"""
    found = re.search(patt_id_tag, content, re.DOTALL | re.IGNORECASE)
    return found.group(0) if found else ""
## Narrow the page down to one result
def getHtmlById(html, i):
    """Return the slice of ``html`` from tag ``result_<i>`` to tag ``result_<i+1>``.

    Both boundary tags are looked up with ``get_id_tag`` and then matched
    literally. The original concatenated the raw tags into a regex, so any
    ``?``, ``(``, ``.`` etc. inside a tag could change the match, raise
    ``re.error`` or create capture groups.

    Args:
        html: Full page text.
        i: Zero-based result index.

    Returns:
        str: The shortest span that starts with the first tag and ends with
        the second (both included), or ``""`` when no such span exists. A
        boundary tag that is not found contributes an empty string to the
        pattern, as in the original.
    """
    start = get_id_tag(html, "result_" + str(i))
    end = get_id_tag(html, "result_" + str(i + 1))
    pattern = re.escape(start) + '.*?' + re.escape(end)
    found = re.search(pattern, html.strip(), re.S)
    return found.group(0) if found else ""
## Write the collected rows to an Excel (.xls) workbook
def createTable(tableName, dataTwoAllList):
    """Save ``dataTwoAllList`` as a single-sheet .xls file.

    Row 0 holds the six bold column titles; every entry of
    ``dataTwoAllList`` is written to the following rows. The original
    iterated ``range(1, rows)`` while indexing ``dataTwoAllList[i - 1]``,
    which silently dropped the last data row; all rows are written now.

    Args:
        tableName: Path of the workbook to create.
        dataTwoAllList: Sequence of rows, each indexable for at least as
            many positions as there are column titles.

    Raises:
        IndexError: If a row has fewer values than there are columns.
    """
    columnName = "類別,標(biāo)題,價(jià)格,賣家統(tǒng)計(jì),星級(jí),評(píng)論數(shù)".split(',')

    # Workbook encoded as UTF-8 so that Chinese text can be stored.
    wb = xlwt.Workbook(encoding='utf-8')
    sheet = wb.add_sheet('sheet 1')

    # Shared horizontal centring for both styles.
    alignment = xlwt.Alignment()
    alignment.horz = xlwt.Alignment.HORZ_CENTER

    # Style for data cells.
    font = xlwt.Font()
    font.name = 'Times New Roman'
    style = xlwt.XFStyle()
    style.font = font
    style.alignment = alignment

    # Style for the header row: same font, bold.
    font1 = xlwt.Font()
    font1.name = 'Times New Roman'
    font1.bold = True
    style1 = xlwt.XFStyle()
    style1.font = font1
    style1.alignment = alignment

    for col, title in enumerate(columnName):
        sheet.col(col).width = 5000
        sheet.write(0, col, title, style1)

    # Data starts on row 1, directly below the header.
    for row, record in enumerate(dataTwoAllList, start=1):
        for col in range(len(columnName)):
            sheet.write(row, col, record[col], style)

    wb.save(tableName)
## Script entry point
# Wait for the user to press Enter, pause via fun_timer(), then run the crawl.
input("按回車鍵開(kāi)始導(dǎo)出..........")
fun_timer()
print("三秒后開(kāi)始抓取數(shù)據(jù).......,請(qǐng)等待!")
getProValue()
print("數(shù)據(jù)導(dǎo)出成功!請(qǐng)注意查看!")
# Backslashes are escaped explicitly: "\W" and "\S" are invalid escape
# sequences in a normal string literal. The printed text is unchanged.
print("數(shù)據(jù)文檔《亞馬遜銷量數(shù)據(jù)統(tǒng)計(jì).xls》已經(jīng)存于C盤下面的C:\\Windows\\SysWOW64的該路徑下面!!!!")
# Final prompt; presumably keeps the console window open until Enter is pressed.
input()
結果數據:
打包成exe文件,直接可以點擊運行:打包過程我就不一一說了,都是一些命令操作:
要安裝pyinstaller,打成exe的操作命令:--icon是圖標,路徑和項目當前路徑一樣
途中遇到很多問題,都一一解決了,亂碼,ip限制,打包后引入模塊找不到,遞歸最大次數,過濾的一些問題
pyinstaller -F -c --icon=my.ico crawling.py
這是打包命令
效果圖:
更多關于Python相關內容可查看本站專題:《Python Socket編程技巧總結》、《Python正則表達式用法總結》、《Python數據結構與算法教程》、《Python函數使用技巧總結》、《Python字符串操作技巧匯總》、《Python入門與進階經典教程》及《Python文件與目錄操作技巧匯總》
希望本文所述對大家Python程序設計有所幫助。