Python爬虫爬取淘宝商品信息（selenium+phantomjs）_Python

本文实例为大家分享了python爬虫爬取淘宝商品的具体代码，供大家参考，具体内容如下

1、需求目标 ：

进入淘宝页面，搜索关键词“耐克”，抓取商品的标题、链接、价格、城市、旺旺号、付款人数；再进入第二层（商品详情页），抓取商品的销售量、款号等。

Python爬虫爬取淘宝商品信息（selenium+phantomjs）

2、结果展示

Python爬虫爬取淘宝商品信息（selenium+phantomjs）

3、源代码

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

									# encoding: utf-8
"""Scrape Taobao search results for one keyword with Selenium + PhantomJS.

For every product found on a search-result page the script records the
title, price, city, shop name, number of buyers and link, then opens the
product page to read its sales figure and model number.  The collected rows
are printed and written to an Excel workbook.

The code only uses constructs that are valid on both Python 2 and Python 3.

NOTE(review): the listing this script was recovered from had been lowercased
wholesale.  The Python API names are restored here (``webdriver.PhantomJS``,
``etree.HTML``, ``re.S``, ``pd.DataFrame``, ``pd.ExcelWriter``, ``False``).
The element ids inside the XPath / regex strings (``j_detailmeta``,
``j_sellcounter``, ``j_attrul``) are kept exactly as found; XPath id matching
is case-sensitive, so confirm their casing against a live page.
"""
import re
import sys
import time

import pandas as pd

# Percent-encoded UTF-8 form of the search keyword (Nike).
KEYWORD = "%e8%80%90%e5%85%8b"
# Number of search-result pages to fetch (the original fetched one).
PAGE_COUNT = 1
# The "s" query parameter advances by this many items per page.
ITEMS_PER_PAGE = 44
# Seconds to wait after navigation so the page can finish rendering.
PAGE_LOAD_WAIT = 5
DETAIL_LOAD_WAIT = 3
# Placeholder stored when a product page yields no value.
MISSING = "null"

PHANTOMJS_PATH = 'd:/python27/scripts/phantomjs.exe'
OUTPUT_PATH = r'c:\\taobao_spider2.xlsx'

SEARCH_URL_TEMPLATE = (
    "https://s.taobao.com/search?q={keyword}&imgfile=&js=1"
    "&stats_click=search_radio_all%3a1&initiative_id=staobaoz_20170710"
    "&ie=utf8&bcoffset=4&ntoffset=4&p4ppushleft=1%2c48&s={offset}"
)

# Search-result page selectors.
TITLE_XPATH = '//div[@class="row row-2 title"]/a'
LINK_XPATH = '//div[@class="row row-2 title"]/a/@href'
# (output column, XPath) pairs for the plain-text fields, in print order.
TEXT_FIELDS = (
    ("价格", '//div[@class="price g_price g_price-highlight"]/strong/text()'),
    ("城市", '//div[@class="location"]/text()'),
    ("付款人数", '//div[@class="deal-cnt"]/text()'),
    ("旺旺", '//div[@class="shop"]/a/span[2]/text()'),
)

# Product page selectors; the two sales XPaths are tried in this order.
SALES_XPATHS = (
    '//*[@id="j_detailmeta"]/div[1]/div[1]/div/ul/li[1]/div/span[2]/text()',
    '//strong[@id="j_sellcounter"]/text()',
)
# Unicode patterns, so they match the unicode page source directly instead of
# relying on an implicit str() round-trip.
TMALL_ATTR_LIST = re.compile(u'<ul id="j_attrul">(.*?)</ul>', re.S)
TAOBAO_ATTR_LIST = re.compile(u'<ul class="attributes-list">(.*?)</ul>', re.S)
MODEL_NUMBER = re.compile(u'>*号: (.*?)</li>', re.S)

# Column keys in the order their lengths are reported after scraping.
COLUMN_ORDER = ("标题", "城市", "价格", "付款人数", "旺旺", "链接", "销量", "款号")


def search_url(keyword, page_index):
    """Return the search-result URL for ``keyword`` at a zero-based page."""
    return SEARCH_URL_TEMPLATE.format(
        keyword=keyword, offset=page_index * ITEMS_PER_PAGE)


def absolute_url(href):
    """Return ``href`` as an absolute https URL.

    Absolute links are returned unchanged.  Anything else gets an
    ``https://`` prefix; leading slashes are dropped first so that a
    protocol-relative ``//host/path`` does not become ``https:////host/path``.
    """
    if href.startswith(("http://", "https://")):
        return href
    return "https://" + href.lstrip("/")


def extract_model_number(page_html, url):
    """Return the first model number on a product page, or ``"null"``.

    The attribute list to search is chosen from the host named in ``url``
    (Tmall first, then Taobao).  Exactly one value is returned per product
    so the model-number column stays aligned with the link column.
    """
    if "tmall" in url:
        list_pattern = TMALL_ATTR_LIST
    elif "taobao" in url:
        list_pattern = TAOBAO_ATTR_LIST
    else:
        # e.g. ad redirect ("click") links on some other host
        return MISSING
    for attr_list in list_pattern.findall(page_html):
        found = MODEL_NUMBER.findall(attr_list)
        if found:
            return found[0]
    return MISSING


def first_text(tree, xpaths):
    """Return the first result of the first XPath that matches, else "null"."""
    for path in xpaths:
        hits = tree.xpath(path)
        if hits:
            return hits[0]
    return MISSING


def echo(value):
    """Print ``value`` as progress output and return it unchanged."""
    print(value)
    return value


def scrape_product(driver, parse_html, url):
    """Open one product page and return ``(sales, model_number)``."""
    driver.get(url)
    time.sleep(DETAIL_LOAD_WAIT)
    page_html = driver.page_source
    sales = first_text(parse_html(page_html), SALES_XPATHS)
    return sales, extract_model_number(page_html, url)


def scrape_search_page(driver, parse_html, page_index, columns):
    """Scrape one search-result page and append its rows to ``columns``.

    ``parse_html`` turns page source into an object with an ``xpath`` method
    (``lxml.etree.HTML`` in ``main``).  ``columns`` maps each output column
    name to the list being filled.
    """
    driver.get(search_url(KEYWORD, page_index))
    time.sleep(PAGE_LOAD_WAIT)
    tree = parse_html(driver.page_source)

    for anchor in tree.xpath(TITLE_XPATH):
        columns["标题"].append(echo(anchor.xpath('string(.)').strip()))
    for name, path in TEXT_FIELDS:
        for text in tree.xpath(path):
            columns[name].append(echo(text))

    for href in tree.xpath(LINK_XPATH):
        url = echo(absolute_url(href))
        try:
            sales, model = scrape_product(driver, parse_html, url)
        except Exception as exc:
            # One broken product page must not abort the rest of the page
            # or leave the per-product columns with different lengths.
            print("failed to read product page %s: %r" % (url, exc))
            sales, model = MISSING, MISSING
        columns["链接"].append(url)
        columns["销量"].append(echo(sales))
        columns["款号"].append(echo(model))


def build_frame(columns):
    """Build the result DataFrame from a ``{name: list}`` mapping.

    The search-page fields come from independent XPath queries, so a listing
    without, say, a buyer count leaves that column short.  Unequal lists are
    padded with NaN (with a printed warning) instead of raising ValueError.
    """
    lengths = set(len(values) for values in columns.values())
    if len(lengths) > 1:
        print("warning: column lengths differ; short columns are padded "
              "and rows may be misaligned")
        return pd.DataFrame(
            {name: pd.Series(values, dtype=object)
             for name, values in columns.items()})
    return pd.DataFrame(columns)


def main():
    """Run the scrape, print the table and write the Excel workbook."""
    if sys.version_info[0] == 2:
        # Python 2 only: make implicit str/unicode conversions use UTF-8.
        reload(sys)  # noqa: F821 - builtin on Python 2
        sys.setdefaultencoding('utf-8')

    # Imported here so the pure helpers above can be imported without the
    # browser-automation stack installed.
    from lxml import etree
    from selenium import webdriver

    started = time.time()
    columns = {name: [] for name in COLUMN_ORDER}

    # NOTE(review): PhantomJS support is deprecated in later Selenium
    # releases; this needs a Selenium version that still ships it.
    driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH)
    try:
        for page_index in range(PAGE_COUNT):
            print("...............正在抓取第"+str(page_index)+"页...........................")
            try:
                scrape_search_page(driver, etree.HTML, page_index, columns)
            except Exception as exc:
                # Report and move on instead of silently swallowing.
                print("page %d failed: %r" % (page_index, exc))
    finally:
        # quit() ends the browser process even when scraping raised.
        driver.quit()

    print(" ".join(str(len(columns[name])) for name in COLUMN_ORDER))

    frame = build_frame(columns)
    print(frame)

    # strings_to_urls=False keeps long links as plain text cells.
    # NOTE(review): newer pandas releases take this through ``engine_kwargs``
    # rather than ``options`` - confirm against the installed version.
    writer = pd.ExcelWriter(OUTPUT_PATH, engine='xlsxwriter',
                            options={'strings_to_urls': False})
    frame.to_excel(writer, index=False)
    writer.close()

    print(u'ok,爬虫结束!')
    print(u'总共耗时：' + str(time.time() - started) + 's')


if __name__ == "__main__":
    main()