本文主要分享使用 Python 登录并爬取淘宝信息的相关代码,具有一定的参考价值,大家可以了解一下。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
|
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Crawl link texts from Taobao pages with Selenium, logging in when needed.

The script reads ``1.txt`` (one ``filename,url`` record per line), opens each
URL in Firefox and writes the text of the matching links to ``filename``.
Records whose page yields no text are appended to ``error.txt``.
"""
import codecs
import datetime
import logging
import os
import time
import traceback

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

# Placeholder credentials typed into the login form; replace with real ones.
TAOBAO_USERNAME = u"yourusername"
TAOBAO_PASSWORD = u"yourpsd"

# XPath of the links whose text is collected on each kind of page.
MARKET_LINK_XPATH = "//a[@class='J_ItemLink']"
SEARCH_LINK_XPATH = "//a[@class='J_ClickStat']"

TASK_FILE = "1.txt"
ERROR_FILE = "error.txt"


def login(driver, site):
    """Open *site* and submit the login form with the configured credentials.

    Any failure while driving the form is reported on stdout (the word
    ``failure`` plus the traceback) and otherwise ignored, so the caller can
    carry on with whatever the page shows.

    NOTE(review): the ``find_element_by_*`` helpers used in this file belong
    to the older Selenium API; confirm the installed Selenium still has them.
    """
    driver.get(site)
    time.sleep(5)
    try:
        # Click the "please log in" link.
        driver.find_element_by_class_name("h").click()
        time.sleep(5)
        # Enter the username and password.
        driver.find_element_by_id("TPL_username_1").send_keys(TAOBAO_USERNAME)
        time.sleep(5)
        driver.find_element_by_id("TPL_password_1").send_keys(TAOBAO_PASSWORD)
        time.sleep(5)
        # Click the login button.
        driver.find_element_by_id("J_SubmitStatic").click()
        time.sleep(30)
    except Exception:
        # Best effort: report the problem but let Ctrl-C / SystemExit through.
        print(u"failure")
        traceback.print_exc()


def _collect_link_texts(elements):
    """Return the non-empty texts of *elements*, one per line, echoing each."""
    lines = []
    for element in elements:
        text = element.text  # read once: every access is a WebDriver call
        if text != "":
            line = text.strip() + "\n"
            print(line)
            lines.append(line)
    return "".join(lines)


def _save_result(filename, site, text):
    """Write *text* to *filename*, or log ``filename,site`` to the error file.

    A non-empty *text* overwrites *filename* (UTF-8). An empty *text* means
    the crawl found nothing, so the record is appended to ``error.txt``.
    """
    if text != "":
        with codecs.open(filename, "w", "utf-8") as out:
            out.write(text)
        return
    record = filename + "," + site.strip()
    print(record)
    with codecs.open(ERROR_FILE, "a", "utf-8") as err:
        # Terminate each record explicitly instead of relying on a newline
        # that happened to be left at the end of the URL.
        err.write(record + "\n")


def crawlmarket(driver, filename, site):
    """Save the item-link texts of a "markets" page to *filename*.

    If the page shows no item links, log in and look again. The driver is
    left open; the caller owns it and is responsible for ``driver.quit()``.
    """
    driver.get(site)
    driver.maximize_window()
    time.sleep(10)
    driver.refresh()
    time.sleep(10)
    links = driver.find_elements_by_xpath(MARKET_LINK_XPATH)
    # No links found: try logging in.
    needs_login = len(links) == 0
    if needs_login:
        login(driver, site)
    time.sleep(30)
    if needs_login:
        # Re-query after logging in; the list fetched before login is empty.
        links = driver.find_elements_by_xpath(MARKET_LINK_XPATH)
    _save_result(filename, site, _collect_link_texts(links))


def crawltaobaosousuo(driver, filename, site):
    """Save the result-link texts of a Taobao search page to *filename*.

    The driver is left open; the caller owns it and is responsible for
    ``driver.quit()``.
    """
    driver.get(site)
    driver.maximize_window()
    time.sleep(10)
    driver.get(site)
    time.sleep(30)
    driver.refresh()
    links = driver.find_elements_by_xpath(SEARCH_LINK_XPATH)
    _save_result(filename, site, _collect_link_texts(links))


def jiexi(driver):
    """Read ``1.txt`` and crawl every ``filename,url`` record with *driver*.

    URLs containing ``markets`` go to :func:`crawlmarket`, all others to
    :func:`crawltaobaosousuo`. Blank lines are skipped and lines without a
    URL field are reported and skipped. The script waits 60 seconds before
    each crawl.
    """
    with codecs.open(TASK_FILE, "r", "utf-8") as tasks:
        for raw_line in tasks:
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue
            info = line.split(",")
            if len(info) < 2 or not info[1].strip():
                print(u"skipping malformed line: " + line)
                continue
            time.sleep(60)
            filename = info[0]
            href = info[1].strip()
            print(filename)
            if "markets" in href:
                crawlmarket(driver, filename, href)
            else:
                crawltaobaosousuo(driver, filename, href)


if __name__ == "__main__":
    driver = webdriver.Firefox()
    try:
        jiexi(driver)
    finally:
        # Close the browser once, after all records have been processed.
        driver.quit()
小结
如有改进策略,欢迎一起探讨。这段代码可以抓取淘宝的部分网页内容,大家可以根据自己的需求修改;需要注意的是,抓取时会被淘宝风控。个人觉得不登录的效果更好。
以上就是本文关于 Python 登录并爬取淘宝信息代码示例的全部内容,希望对大家有所帮助。感兴趣的朋友可以继续参阅本站其他相关专题。如有不足之处,欢迎留言指出。感谢朋友们对本站的支持!
原文链接:http://blog.csdn.net/Japan__/article/details/50821586