#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#-------------------------------------------------------------------------
# 程序:search.py
# 版本:1
# 作者:ly
# 日期:编写日期2016/11/23
# 语言:Python 2.7.x
# 操作:python search.py 关键词 存储文件名 (排序方式)1或者2
# 功能:
#
#-------------------------------------------------------------------------
import requests
import time
import sys
import json
import os
import xlsxwriter
from sys import argv
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from lxml import etree
import re
# Encoding setup (Python 2 only idiom): force the process-wide default
# string encoding to utf-8 so implicit str<->unicode conversions of the
# Chinese text in this script do not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf-8')
Type = sys.getfilesystemencoding()  # filesystem encoding; unused in this file
# aggregate list of product urls (declared but never populated in this file)
url_list = []
# def main(work,key,num,mkname):
def main(work, key, num, mkname, start_price, end_price, page_num):
# print type(num),num
if num == '1':
url = '''https://s.m.taobao.com/search?event_submit_do_new_search_auction=1\
&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1&action=home%3Aredirect_app_action&\
from=1&q=''' + key + '''&sst=1&n=20&buying=buyitnow&m=api4h5&abtest=10&wlsort=10&page=''' + str(page_num)
elif num == '3':
url = '''https://s.m.taobao.com/search?event_submit_do_new_search_auction=1\
&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1&action=home%3Aredirect_app_action&\
from=1&q=''' + key + '''&sst=1&n=20&buying=buyitnow&m=api4h5&abtest=3\
&wlsort=3&style=list&closeModues=nav%2Cselecthot%2Conesearch&\
start_price=''' + str(start_price) + '''&end_price=''' + str(end_price) + '''&page=''' + str(page_num)
else:
url = '''https://s.m.taobao.com/search?event_submit_do_new_search_auction=1\
&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1&action=home%3Aredirect_app_action&\
from=1&q=''' + key + '''&sst=1&n=20&buying=buyitnow&m=api4h5&abtest=14&\
wlsort=14&style=list&closeModues=nav%2Cselecthot%2Conesearch&sort=_sale&page=''' + str(page_num)
try:
body = requests.get(url)
body = body.text.encode('utf8')
dic_body = eval(body)
except Exception, e:
print "请求出错,请将下列url放于浏览器中看是否可以打开"
print url
print e
for i in range(20):
print "当前正在采集第 ", i + 1, " 个,第", str(page_num), ' 页'
try:
num_id = dic_body["listItem"][i]['item_id']
except:
num_id = ''
try:
act = dic_body["listItem"][i]['act'] # 付款数
except:
act = ''
try:
area = dic_body["listItem"][i]['area'] # 地区
except:
area = ''
try:
if dic_body["listItem"][i]['url'].find('https:') != -1:
auctionURL = dic_body["listItem"][i]['url'] # 商品url
else:
auctionURL = "https:" + dic_body["listItem"][i]['url'] # 商品url
# https://detail.tmall.com/item.htm?id="+str(num_id)
# print len(auctionURL)
if(len(auctionURL) > 250):
auctionURL_1 = auctionURL[:250]
auctionURL_2 = auctionURL[250::]
else:
auctionURL_1 = auctionURL
auctionURL_2 = ''
except:
auctionURL = ''
auctionURL_1 = ''
auctionURL_2 = ''
try:
name = dic_body["listItem"][i]['name'] # 商品名
except:
name = ''
try:
nick = dic_body["listItem"][i]['nick'] # 店铺名
except:
nick = ''
try:
originalPrice = dic_body["listItem"][i]['originalPrice'] # 原始价格
except:
originalPrice = ''
try:
price = dic_body["listItem"][i]['price'] # 当前价格
except:
price = ''
try:
pic_path = dic_body["listItem"][i]['pic_path'] # 当前价格
# print pic_path
pic_path = pic_path.replace('60x60', '210x210')
pic_name = str(i + 1 + (page_num - 1) * 20) + '-' + nick
img_download(pic_name, pic_path, mkname + '/pic')
except Exception, e:
print e
pic_path = ''
try:
zkType = dic_body["listItem"][i]['zkType'] # 当前价格
except:
zkType = ''
try:
if len(auctionURL_2) > 10:
first = 0
html_date = download_date(
auctionURL_1 + auctionURL_2, work, i + 2, first)
else:
first = 0
html_date = download_date(auctionURL_1, work, i + 2, first)
except:
html_date = ''
print html_date
date = [name, nick, act, price, originalPrice, zkType, area,
auctionURL_1, auctionURL_2, pic_path, html_date, num_id]
# print len(date)
num = i + 2 + (int(page_num) - 1) * 20
install_table(date, work, num)
# 商品名 店铺 付款人数 当前价格 原始价格 优惠类型 地区 商品url 图片url 详情数据#
# name nick act price originalPrice zkType area auctionURL pic_path
# html_date
def download_date(url, work, i, first):
    # if first == 1:
    '''Scrape a product detail page with PhantomJS.

    url   -- detail-page url (taobao.com or tmall.com)
    work  -- worksheet handle (unused here; kept for call compatibility)
    i     -- row index (unused here; kept for call compatibility)
    first -- when 1 both branches are skipped and None is returned implicitly

    Returns a ';'-separated text blob of the monthly-sales figure plus the
    item attribute list, or 0 when page loading or parsing fails.
    NOTE(review): the taobao branch, its tmall-retry fallback and the tmall
    branch are three near-duplicates that differ only in xpath selectors and
    in how list_date is (re)initialized -- kept verbatim to avoid behavior
    changes.
    '''
    if(url.find("taobao") != -1 and first != 1):
        print "检测为淘宝的页面"
        try:
            driver = webdriver.PhantomJS()
            print "正在获取详情页面,url为"
            #url ="https://item.taobao.com/item.htm?id=538287375253&abtest=10&rn=07abc745561bdfad6f726eb186dd990e&sid=46f938ba6d759f6e420440bf98b6caea"
            url = url
            print url
            driver.get(url)
            driver.implicitly_wait(40)  # implicit wait / page-load grace period
            html = driver.page_source.encode('utf-8')
            driver.quit()
        except Exception, e:
            print "页面加载失败", e
            return 0
        try:
            print '正在解析页面'
            try:
                selector = etree.HTML(
                    html, parser=etree.HTMLParser(encoding='utf-8'))
            except Exception, e:
                print "页面加载失败", e
                return 0
            try:  # this part collects the monthly-sales figure
                print '正在解析页面1'
                # context=selector.xpath('//div[@class="tm-indcon"]')
                context = selector.xpath('//strong[@id="J_SellCounter"]')
                xiaoliang_date = u''
                for i in range(len(context)):
                    print '正在解析页面2'
                    temp_date = etree.tostring(
                        context[i], encoding="utf-8")  # .encode('utf-8')
                    print '***', temp_date
                    # strip every html tag from the serialized node
                    re_h = re.compile('</?\w+[^>]*>')
                    s = re_h.sub('', temp_date) + ','
                    xiaoliang_date += s
                print '正在解析页面3'
                list_date = xiaoliang_date + ';'
            except Exception, e:
                print e
                list_date = u''
            context = selector.xpath('//ul[@class="attributes-list"]/li')
            for i in range(len(context)):  # attributes-list
                # .encode('utf-8')
                a = etree.tostring(context[i], encoding="utf-8")
                b = a.split('>')
                end = b[1].split('<')[0] + ';'
                list_date += end
            print '&&&&&&&&&&&', list_date.encode('utf8')
            # too little data: the taobao url probably redirected to a tmall
            # page, so retry the same url with the tmall selectors
            if len(list_date) < 50:
                print "数据过少,尝试检测为天猫页面解析"
                try:
                    driver = webdriver.PhantomJS()
                    print "正在获取详情页面,url为"
                    #url ="https://item.taobao.com/item.htm?id=538287375253&abtest=10&rn=07abc745561bdfad6f726eb186dd990e&sid=46f938ba6d759f6e420440bf98b6caea"
                    #num_id = re.findall('id=[0-9]+&',url)[0].replace('id=','').replace('&','')
                    #url = "https://detail.tmall.com/item.htm?id="+str(num_id)
                    print url
                    driver.get(url)
                    driver.implicitly_wait(40)  # implicit wait / page-load grace period
                    html = driver.page_source.encode('utf-8')
                    driver.quit()
                except Exception, e:
                    print "页面加载失败", e
                    return 0
                try:
                    print '正在解析页面'
                    try:
                        selector = etree.HTML(
                            html, parser=etree.HTMLParser(encoding='utf-8'))
                    except Exception, e:
                        print "页面加载失败", e
                        return 0
                    try:
                        # this part collects the monthly-sales figure
                        context = selector.xpath('//div[@class="tm-indcon"]')
                        xiaoliang_date = u''
                        for i in range(len(context)):
                            temp_date = etree.tostring(
                                context[i], encoding="utf-8")  # .encode('utf-8')
                            re_h = re.compile('</?\w+[^>]*>')  # strip every html tag
                            s = re_h.sub('', temp_date) + ','
                            xiaoliang_date += s
                        list_date += xiaoliang_date + ';'
                    except Exception, e:
                        print e
                        list_date += u''
                    context = selector.xpath('//ul[@id="J_AttrUL"]/li')
                    print list_date, len(context)
                    for i in range(len(context)):
                        # .encode('utf-8')
                        a = etree.tostring(context[i], encoding="utf-8")
                        b = a.split('>')
                        end = b[1].split('<')[0] + ';'
                        list_date += end
                    # print list_date.encode('utf8')
                    return list_date
                except Exception, e:
                    print '页面解析失败'
                    return 0
            return list_date
        except:
            print '页面解析失败'
            return 0
    elif(url.find("tmall") != -1 and first != 1):
        print "检测为天猫页面,"
        try:
            driver = webdriver.PhantomJS()
            print "正在获取详情页面,url为"
            #url ="https://item.taobao.com/item.htm?id=538287375253&abtest=10&rn=07abc745561bdfad6f726eb186dd990e&sid=46f938ba6d759f6e420440bf98b6caea"
            # rebuild a canonical tmall detail url from the numeric item id
            num_id = re.findall(
                'id=[0-9]+&', url)[0].replace('id=', '').replace('&', '')
            url = "https://detail.tmall.com/item.htm?id=" + str(num_id)
            print url
            driver.get(url)
            driver.implicitly_wait(40)  # implicit wait / page-load grace period
            html = driver.page_source.encode('utf-8')
            driver.quit()
        except Exception, e:
            print "页面加载失败", e
            return 0
        try:
            print '正在解析页面'
            try:
                selector = etree.HTML(
                    html, parser=etree.HTMLParser(encoding='utf-8'))
            except Exception, e:
                print "页面加载失败", e
                return 0
            try:
                # this part collects the monthly-sales figure
                context = selector.xpath('//div[@class="tm-indcon"]')
                xiaoliang_date = u''
                for i in range(len(context)):
                    temp_date = etree.tostring(
                        context[i], encoding="utf-8")  # .encode('utf-8')
                    re_h = re.compile('</?\w+[^>]*>')  # strip every html tag
                    s = re_h.sub('', temp_date) + ','
                    xiaoliang_date += s
                list_date = xiaoliang_date + ';'
            except Exception, e:
                print e
                list_date = u''
            context = selector.xpath('//ul[@id="J_AttrUL"]/li')
            print list_date, len(context)
            for i in range(len(context)):
                # .encode('utf-8')
                a = etree.tostring(context[i], encoding="utf-8")
                b = a.split('>')
                end = b[1].split('<')[0] + ';'
                list_date += end
            # print list_date.encode('utf8')
            return list_date
        except Exception, e:
            print '页面解析失败'
            return 0
    # if (first):
    # print "检测为天猫页面,"
    # try:
    # driver = webdriver.PhantomJS()
    # print "正在获取详情页面,url为"
    # #url ="https://item.taobao.com/item.htm?id=538287375253&abtest=10&rn=07abc745561bdfad6f726eb186dd990e&sid=46f938ba6d759f6e420440bf98b6caea"
    # #num_id = re.findall('id=[0-9]+&',url)[0].replace('id=','').replace('&','')
    # #url = "https://detail.tmall.com/item.htm?id="+str(num_id)
    # print url
    # driver.implicitly_wait(40) #设置智能超时时间
    # driver.get(url)
    # html = driver.page_source.encode('utf-8')
    # driver.quit()
    # except Exception,e:
    # print "页面加载失败",e
    # return 0
    # try:
    # print '正在解析页面'
    # selector=etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
    # try:
    # #此部分用于采集每月销量的数据
    # context=selector.xpath('//div[@class="tm-indcon"]')
    # xiaoliang_date = u''
    # for i in range(len(context)):
    # temp_date = etree.tostring(context[i], encoding="utf-8")#.encode('utf-8')
    # re_h=re.compile('</?\w+[^>]*>')#去除一切html标签
    # s=re_h.sub('',temp_date)+','
    # xiaoliang_date += s
    # list_date = xiaoliang_date+';'
    # except Exception,e:
    # print e
    # list_date = u''
    #
    # context=selector.xpath('//ul[@id="J_AttrUL"]/li')
    # print list_date,len(context)
    # for i in range(len(context)):
    # a = etree.tostring(context[i], encoding="utf-8")#.encode('utf-8')
    # b = a.split('>')
    # end = b[1].split('<')[0]+';'
    # list_date += end
    # #print list_date.encode('utf8')
    # return list_date
    # except Exception,e:
    # print '页面解析失败'
    # return 0
def install_table(date, work, i):
    """Write one scraped record into worksheet row `i`.

    Column A receives the running record number (i - 1); the values in
    `date` fill columns B, C, D, ... in order (extra column letters are
    ignored by zip).  Write errors are printed and skipped so a single
    bad cell never aborts the crawl.
    """
    columns = ['B', 'C', 'D', 'E', 'F', 'G',
               'H', 'I', 'J', 'K', 'L', 'M', "O"]
    try:
        work.write('A' + str(i), int(i) - 1)
    except Exception as e:
        print('无法写入')
        print(e)
    for column, value in zip(columns, date):
        cell = column + str(i)
        try:
            work.write(cell, value)
        except Exception as e:
            print("无法写入")
            print(e)
def img_download(id, url, mkname):
'''导入图片url,文件夹名,以id为图片名'''
try:
print "主图下载中"
#img = requests.get(url).context()
name = id
r = requests.get(url, timeout=50)
#name = int(time.time())
f = open('./' + mkname + '/' + str(name) + '.jpg', 'wb')
f.write(r.content)
f.close()
except:
print "主图下载失败"
def create_mkdir(name):
    """Create the output folder ./name and its ./name/pic subfolder.

    Each directory is attempted independently: with the old single
    try-block, an already-existing ./name aborted before ./name/pic was
    ever created.  Errors are printed, never raised (best-effort), so a
    re-run into an existing folder still proceeds.
    """
    print('开始创建文件夹 ' + name)
    for path in (r'./' + name, r'./' + name + "/pic"):
        try:
            os.mkdir(path)
        except OSError as e:
            # typically "File exists" on re-runs; keep going so the
            # pic subfolder still gets a chance to be created
            print(e)
def create_table(name):
''' 导入表格名字,在当前目录下创建该表格'''
try:
name = './' + name + '/' + name + '.xlsx'
workbook = xlsxwriter.Workbook(name)
worksheet1 = workbook.add_worksheet()
worksheet1.write('A1', 'ID')
worksheet1.write('B1', u"商品名")
worksheet1.write('C1', u'店铺')
worksheet1.write('D1', u'付款人数')
worksheet1.write('E1', u'当前价格')
worksheet1.write('F1', u'原始价格')
worksheet1.write('G1', u'优惠类型')
worksheet1.write('H1', u'地区')
worksheet1.write('I1', u'商品url_1')
worksheet1.write('J1', u'商品url_2')
worksheet1.write('K1', u'图片url')
worksheet1.write('L1', u'date')
worksheet1.write('M1', u'宝贝id')
# workbook.close()
print '表格构建完成,name', name
return worksheet1, workbook
except Exception, e:
print e
if __name__ == '__main__':
# print argv
try:
key = argv[1]
except:
print '请指定关键词作为第一个参数'
key = ''
try:
name = argv[2]
except:
print "请指定输出文件名问第二个参数"
name = ''
try:
num = argv[3]
# print num ,star_price , end_price
except:
print "请指定排序方式 1 为综合排序 2 为销量排序, 当前默认为综合排序"
num = 1
try:
page_num = int(argv[4])
except:
print '页码错误,默认值为1'
page_num = 1
try:
star_price = argv[5]
end_price = argv[6]
except:
star_price = ''
end_price = ''
#key = u'皮裤男'
print '启动采集,关键词为:', key, " 存入: ", name, "排序为 ", num, star_price, end_price
if (key == '' or name == '' or num == ''):
print '参数不正确'
print "请按顺序输入参数 关键词 输出文件名 排序方式(1或者2)页数 (价格区间)"
print "例如:python Search.py 皮裤男 皮裤男1 2 1"
else:
create_mkdir(name)
work, workbook = create_table(name)
# time.sleep(100)
print '开始采集请等待'
# main(work,key,num,name)
for now_page_num in range(1, page_num + 1):
main(work, key, num, name, star_price, end_price, now_page_num)
workbook.close()
print '采集完成'
使用方式:
淘宝搜索程序 Search.py
功能:给定关键词与排序方式将采集淘宝商品数据存入指定文件夹内。
命令 python Search.py 关键词 文件夹 排序方式 最大页码 (价格区间)
说明:
关键词可为:皮裤 毛衣 任意名词均可 但不可包含空格
文件夹 :将在当前目录下自动生成一个文件夹目录,包含pic文件(商品主图)待采集完成将出现一个excel文件
排序方式:有1、2、3三种选项。1为综合排序,2为销量排序。3为价格区间搜索,选择3的时候需要再补充两个参数作为价格区间。
使用样例:
python Search.py 皮裤 piku1 1 5 #综合排序 # 5页
python Search.py 皮裤 piku2 2 5 #销量排序
python Search.py 皮裤 piku3 3 5 100 300 #指定价格搜索
转载:
github:淘宝关键词采集器 博客原文:urlteam
打赏
微信扫一扫,打赏作者吧~