python自动从arxiv下载paper的示例代码_Python

				?

									#!/usr/bin/env python

									# -*- coding: utf-8 -*-

									# @Time  : 2020/02/11 21:44

									# @Author : dangxusheng

									# @Email  : dangxusheng163@163.com

									# @File  : download_by_href.py

									'''

									自动从arxiv.org 下载文献

									'''

									import os

									import os.path as osp

									import requests

									from lxml import etree

									from pprint import pprint

									import re

									import time

									import glob

									# Browser-like request headers sent with the arXiv search request.
									headers = {
									    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
									    "Host": 'arxiv.org'
									}

									# Base URL of the Chinese arXiv mirror for PDF downloads.
									HREF_CN = 'http://cn.arxiv.org/pdf/'

									# Base URL of the main arXiv site, used as the fallback source.
									# It must differ from HREF_CN, otherwise a "fallback" attempt would
									# simply request the same mirror URL a second time.
									HREF_SRC = 'https://arxiv.org/pdf/'

									# Directory that receives the downloaded PDFs; created here if missing.
									SAVE_PATH = '/media/dangxs/E/Paper/download_at_20200730'
									os.makedirs(SAVE_PATH, exist_ok=True)

									# Search results whose PDF could not be downloaded.
									FAIL_URLS = []

									# Text file that the failed links are appended to.
									FAIL_URLS_TXT = f'{SAVE_PATH}/fail_urls.txt'

									def download(url, title, timeout=(10, 60)):
									    """Download the PDF at *url* into SAVE_PATH, named after *title*.

									    Characters that are not allowed in file names are replaced by a space
									    before the title is used as the file name.

									    Args:
									        url: Address of the PDF to fetch.
									        title: Paper title; becomes the file name (plus ``.pdf``).
									        timeout: ``(connect, read)`` timeout in seconds passed to
									            ``requests.get`` so that a stalled server cannot block forever.

									    Returns:
									        True if a file larger than 50 KiB already exists for this title, or
									        if the download succeeded and produced at least 10 KiB.
									        False on any network/file error or when the result is too small.
									    """
									    pattern = r'[\\/:*?"\'<>|\r\n]+'
									    new_title = re.sub(pattern, " ", title)
									    print(f'new title: {new_title}')
									    save_filepath = '%s/%s.pdf' % (SAVE_PATH, new_title)
									    if osp.exists(save_filepath) and osp.getsize(save_filepath) > 50 * 1024:
									        print(f'this pdf is be existed.')
									        return True

									    # Write to a temporary file first so that a failed attempt neither
									    # truncates an existing file nor leaves a partial PDF behind.
									    tmp_filepath = save_filepath + '.part'
									    try:
									        # The context manager releases the streamed connection in every case.
									        with requests.get(url, stream=True, timeout=timeout) as r:
									            # Do not store an HTTP error page as if it were the PDF.
									            r.raise_for_status()
									            with open(tmp_filepath, 'wb') as file:
									                # Stream the body in small chunks.
									                for chunk in r.iter_content(2048):
									                    file.write(chunk)
									        if osp.getsize(tmp_filepath) >= 10 * 1024:
									            os.replace(tmp_filepath, save_filepath)
									            print('%s 下载成功.' % title)
									            return True
									    except (requests.RequestException, OSError) as e:
									        print(e)

									    # Failed or suspiciously small download: clean up the temporary file.
									    try:
									        if osp.exists(tmp_filepath):
									            os.remove(tmp_filepath)
									    except OSError as e:
									        print(e)
									    return False

									# Query arxiv.org for the papers to download.
									def search(start_size=0, title_keywords='Facial Expression'):
									    """Run an arXiv advanced search on paper titles and parse one result page.

									    The query is restricted to computer science (cross-lists included),
									    first announced between 2015 and 2020, 50 results per page, newest
									    first.

									    Args:
									        start_size: Offset of the first result to return (sent as the
									            ``start`` query parameter).
									        title_keywords: Text searched for in paper titles.

									    Returns:
									        A tuple ``(info_list, total)``. ``info_list`` holds one
									        ``{'title': ..., 'href': ...}`` dict per listed paper that has a
									        link; ``total`` is the result count shown in the page heading.
									        ``([], 0)`` is returned when the heading contains no number, e.g.
									        "Sorry, your query returned no results".

									    Raises:
									        requests.RequestException: If the request fails or times out.
									    """
									    # Example of the older query URL format, kept for reference:
									    # https://arxiv.org/find/grp_eess,grp_stat,grp_cs,grp_econ,grp_math/1/ti:+Face/0/1/0/past,2018,2019/0/1?skip=200&query_id=1c582e6c8afc6146&client_host=cn.arxiv.org
									    req_url = 'https://arxiv.org/search/advanced'
									    req_data = {
									        'advanced': 1,
									        'terms-0-operator': 'AND',
									        'terms-0-term': title_keywords,
									        'terms-0-field': 'title',
									        'classification-computer_science': 'y',
									        'classification-physics_archives': 'all',
									        'classification-include_cross_list': 'include',
									        'date-filter_by': 'date_range',  # date_range | specific_year
									        # 'date-year': DOWN_YEAR,
									        'date-year': '',
									        'date-from_date': '2015',
									        'date-to_date': '2020',
									        'date-date_type': 'announced_date_first',  # submitted_date | submitted_date_first | announced_date_first
									        'abstracts': 'show',
									        'size': 50,
									        'order': '-announced_date_first',
									        'start': start_size,
									    }
									    # A (connect, read) timeout keeps a stalled server from blocking forever.
									    res = requests.get(req_url, params=req_data, headers=headers, timeout=(10, 60))
									    html = etree.HTML(res.content.decode())

									    total_text = ''.join(html.xpath('//h1[@class="title is-clearfix"]/text()'))
									    total_text = total_text.replace('\n', '').strip(' ')
									    # e.g. "Showing 1–50 of 1,355 results". Thousands separators are removed
									    # first, otherwise the last number would be read as 355 instead of 1355.
									    num = re.findall(r'\d+', total_text.replace(',', ''))
									    # "Sorry, your query returned no results"
									    if not num:
									        return [], 0
									    total = int(num[-1])  # total number of matching papers

									    info_list = []
									    for p in html.xpath('//ol[@class="breathe-horizontal"]/li'):
									        title = ''.join(p.xpath('./p[@class="title is-5 mathjax"]//text()'))
									        title = title.replace('\n', '').strip(' ')
									        links = p.xpath('./div/p/a/@href')
									        if not links:
									            # An entry without a link cannot be downloaded; skip it
									            # instead of failing the whole page with an IndexError.
									            continue
									        info_list.append({'title': title, 'href': links[0]})
									    return info_list, total

									# Collect download links from one specific page.
									def search_special():
									    """Collect linked list items from the "the-gan-zoo" page on gitee.com.

									    Every ``<li>`` inside the rendered markdown body that has a direct
									    ``<a>`` child contributes one entry; list items without such a link
									    are skipped. The collected list is pretty-printed before returning.

									    Returns:
									        A list of ``{'title': ..., 'href': ...}`` dicts, where ``title`` is
									        the full text of the list item and ``href`` its first direct link.

									    Raises:
									        requests.RequestException: If the request fails or times out.
									    """
									    # A (connect, read) timeout keeps a stalled server from blocking forever.
									    res = requests.get('https://gitee.com/weberyoung/the-gan-zoo?_from=gitee_search', timeout=(10, 60))
									    html = etree.HTML(res.content.decode())

									    info_list = []
									    for p in html.xpath('//div[@class="file_content markdown-body"]//li'):
									        links = p.xpath('./a/@href')
									        if not links:
									            # Plain-text list items carry no link; indexing [0] on the
									            # empty result would raise IndexError and abort the scan.
									            continue
									        title = ''.join(p.xpath('.//text()')).replace('\n', '').strip(' ')
									        info_list.append({'title': title, 'href': links[0]})
									    pprint(info_list)
									    return info_list

									def _download_with_fallback(paper):
									    """Download one search result, trying the mirror first, then HREF_SRC.

									    Returns True when either attempt succeeds.
									    """
									    pdf_name = paper['href'].split('/')[-1] + '.pdf'
									    title = paper['title']
									    href = HREF_CN + pdf_name
									    print(href)
									    if download(href, title):
									        return True
									    print('从国内镜像下载失败，从源地址开始下载 >>>>')
									    # Second attempt against the international URL.
									    return download(HREF_SRC + pdf_name, title)


									def main(keywords='Facial Action Unit'):
									    """Download every paper whose title matches *keywords*.

									    Result pages are fetched one after another through search(); each
									    paper is downloaded with _download_with_fallback(). Papers that could
									    not be downloaded are collected in FAIL_URLS and their links are
									    appended to FAIL_URLS_TXT at the end.
									    """
									    page_size = 50  # must match the 'size' value that search() sends
									    page_idx = 0
									    total = None  # unknown until the first result page has been parsed

									    # Stop as soon as the next offset lies past the last result. The
									    # final, partially filled page is covered by this loop as well, so
									    # no separate "remaining items" pass (which previously asked for a
									    # negative offset) is needed.
									    while total is None or page_idx * page_size < total:
									        paper_list, total = search(page_idx * page_size, keywords)
									        print(f'total: {total}')
									        if total == 0:
									            print('no found .')
									            # Leave the loop instead of exiting the process so that
									            # failures gathered so far are still written out below.
									            break
									        for p in paper_list:
									            if not _download_with_fallback(p):
									                FAIL_URLS.append(p)
									            # Short pause between papers to avoid hammering the server.
									            time.sleep(1)
									        page_idx += 1

									    pprint(FAIL_URLS)
									    with open(FAIL_URLS_TXT, 'a+') as f:
									        f.writelines(item['href'] + '\n' for item in FAIL_URLS)
									    print('done.')


									if __name__ == '__main__':
									    main()