python Requests下载开源网站的代码(带索引数据)_Python

python Requests下载开源网站的代码(带索引数据)

2021-10-09 00:20TTMer Python

这篇文章主要介绍了python Requests下载开源网站的代码(带索引数据)，本文通过实例代码给大家介绍得非常详细，对大家的学习或工作具有一定的参考借鉴价值，需要的朋友可以参考下

环境搭建

python 3.x
requests 包
re 包
gooey包（用于可视化）

代码

				?

									import requests

									import re

									import os

									from gooey import Gooey, GooeyParser

									import time

									# Module-level HTTP session shared by getPath() and rfSearch() for all requests.
									s = requests.Session()

									def judgeTypeOfPath(name):
									    '''
									    Tell whether a listing entry names a file or a directory.

									    An entry counts as a directory only when it ends with '/'.
									    Everything else, including an empty string, is reported as a file.

									    :param name: entry name (path fragment taken from a listing page)
									    :return: True -> file; False -> directory
									    '''
									    # endswith() is safe on an empty string, where name[-1] would raise IndexError.
									    return not name.endswith('/')

									def makeDirOfPath(path):
									    '''
									    Create a directory, including any missing parents, if it is absent.

									    :param path: path of the directory to create
									    :return: True -> the directory exists after the call;
									             False -> it could not be created (e.g. the path is an
									             existing regular file, or permission was denied)
									    '''
									    try:
									        # makedirs builds missing parent directories too; exist_ok turns the
									        # call into a no-op when the directory is already there.
									        os.makedirs(path, exist_ok=True)
									    except OSError:
									        # Honour the documented contract: report failure instead of raising.
									        return False
									    return os.path.isdir(path)

									def getPath(url):
									    '''
									    Download a listing page and extract the link targets it contains.

									    The page is fetched through the shared session ``s`` and scanned for
									    ``<li><a href="...">`` anchors; the href value of each anchor is
									    collected in page order.

									    :param url: URL of the listing page to read
									    :return: list of href strings, with the first match left out
									    '''
									    # NOTE(review): verify=False switches off TLS certificate verification.
									    pageText = s.get(url=url, stream=True, verify=False).text
									    # NOTE(review): the rel="external nofollow" part of the pattern may be an
									    # artifact of where this code was published - confirm against the real page.
									    anchorTags = re.findall(r'<li><a href=".*?" rel="external nofollow" >', pageText)
									    # The href value is the text between the first pair of double quotes.
									    hrefs = [tag.split("\"")[1] for tag in anchorTags]
									    # Drop the first link (presumably the parent-directory entry - confirm).
									    return hrefs[1:]

									def rfSearch(listOfPath, url, nowPath):
									    '''
									    Recursively mirror a remote directory listing onto the local disk.

									    Directory entries are created locally and descended into; file
									    entries are downloaded unless a file with that name already exists.

									    :param listOfPath: names of the files and directories under ``url``
									    :param url: URL of the remote directory being processed; entry names
									                are appended to it directly, so it should end with '/'
									    :param nowPath: local directory matching ``url``; entry names are
									                    appended to it directly, so it should end with a separator
									    :return: None
									    '''
									    for name in listOfPath:
									        if not judgeTypeOfPath(name):
									            # Directory: create it locally, then walk its own listing.
									            subPath = nowPath + name
									            subUrl = url + name
									            makeDirOfPath(subPath)
									            rfSearch(getPath(subUrl), subUrl, subPath)
									            continue

									        print(f'开始下载{name}...')
									        t1 = time.time()
									        target = nowPath + name
									        if not os.path.exists(target):
									            # Download into a temporary ".part" file and rename it only once
									            # complete. An interrupted transfer then never leaves a truncated
									            # file that the exists() check above would skip on the next run.
									            partial = target + '.part'
									            # NOTE(review): verify=False switches off TLS certificate verification.
									            with s.get(url + name, stream=True, verify=False) as r:
									                with open(partial, "wb") as f:
									                    for chunk in r.iter_content(chunk_size=10240):
									                        if chunk:
									                            f.write(chunk)
									            os.replace(partial, target)
									        t2 = time.time()
									        # Printed for skipped files as well, matching the existing output.
									        print(f'{name}下载完成\t\t用时  {t2-t1}')

									@Gooey(
									    program_name='isric数据下载器',
									    encoding="utf-8", )
									def main():
									    '''
									    Read the source URL and target folder from the GUI and start the download.

									    Both values are normalised to end with a separator, because the
									    crawler builds child URLs and paths by plain string concatenation.
									    '''
									    parser = GooeyParser(description="isric数据下载器")
									    parser.add_argument('--url', default=r'https://files.isric.org/soilgrids/latest/data/')
									    parser.add_argument('--path', widget="DirChooser", default=r'F:/isricData/')
									    args = parser.parse_args()

									    url = args.url
									    if not url.endswith('/'):
									        url += '/'

									    nowPath = args.path
									    # A folder typed or picked without a trailing separator would otherwise
									    # be glued to the first entry name (e.g. 'F:/isricData' + 'x/').
									    if nowPath and not nowPath.endswith(('/', os.sep)):
									        nowPath += '/'
									    if nowPath:
									        # Make sure the download root exists before anything is written into it.
									        os.makedirs(nowPath, exist_ok=True)

									    rfSearch(getPath(url), url, nowPath)

									###如果不需要可视化，则不用gooey，可以将上面部分替换如下

									#@Gooey(

									#    program_name='isric数据下载器',

									#   encoding="utf-8", )

									#上面三行删除即可

									###main函数替换成下面部分：

									# def main():

									#     url=r'https://files.isric.org/soilgrids/latest/data/'#在此处修改地址链接

									#     nowPath = r'F:/isricData/'#在此处修改文件保存地址

									#     u = getPath(url)

									#     rfSearch(u, url,nowPath)

									# Run the downloader only when this file is executed as a script, not on import.
									if __name__ == "__main__":

									    main()