前言

基于 Python3 requests包的下载器，因为需要挂代理爬一些文件，所以写了这个下载器

而代理隧道不稳定，经常断开连接，导致文件经常无法下载完整

此下载器支持网速显示、文件大小显示、下载百分比显示、失去连接重试、网络不佳重试、断点续传，并支持通过 http/https 或 socks5 代理隧道下载

支持url列表批量下载

没有多线程

适用于下载较大且链接不稳定的文件

批量url下载时，若遇到不能下载的url，将跳过，不影响后面的url下载

跳过的url将会在下载结束时全部显示

演示

连接不稳定，失去连接后会重试，断点续传，无需重新下载

retry

连接断开，将等待2s后重试

360截图17040515195637

获取响应头失败，将等待一段时间后自动重试，每等待一次自动加时

re_connect

参数设置简单，可设置代理地址，可设置重试时间，可设置referer

360截图17700518064345

未下载完成的文件以 .downloading 为后缀保存，下载完成后才会重命名为正式文件名

360截图180601178711380

程序

import requests
import os, time




# Plain http/https proxy address, e.g. "127.0.0.1:19180". Empty string disables it.
using_proxy = ""

# SOCKS5 proxy address. It takes precedence over the http/https proxy:
# when this is set, using_proxy is ignored.
socks5_proxy = ""

# Extra request headers sent with every download request.
json_headers_download = {
  "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
  "referer": "",
}

# 0 or 1. When 1, a file that already exists is downloaded again.
re_download = 0

# Maximum number of retries when a download is interrupted.
retry_times = 15


# Status output
def show_state(state_type, inf_input):
  """Print a one-line status message and abort the program on errors.

  Args:
    state_type: 0 = fatal error, 1 = start, 2 = working, 3 = finished.
      Any other value prints an empty label.
    inf_input: Free-form detail shown in parentheses after the label.

  Raises:
    SystemExit: With exit status 1 when ``state_type`` is 0.
  """
  inf_str_type = ""
  if state_type == 0:
    inf_str_type = "Error! STOP AND EXIT"
  elif state_type == 1:
    inf_str_type = "START"
  elif state_type == 2:
    inf_str_type = "Working"
  elif state_type == 3:
    inf_str_type = "Finished"
  print("@STATE : {}({})".format(inf_str_type, inf_input))
  if state_type == 0:
    # The builtin exit() is a helper installed by the site module and is not
    # guaranteed to exist; it also exits with status 0. Raise SystemExit
    # directly with a non-zero status so callers/shells can detect the failure.
    raise SystemExit(1)
    
# Unit conversion
def file_size_trans(filesize_input_B):
  """Format a byte count as a human-readable string with two decimals.

  The largest unit whose threshold (a power of 1024) does not exceed the
  value is used. Values below 1024 (including negative ones) are reported in
  ``Byte``; values of 1024**5 and above are reported in ``PB`` instead of
  falling back to a raw byte count.

  Args:
    filesize_input_B: Size in bytes (int or float).

  Returns:
    A string such as ``"1.50 KB"``.
  """
  units = ('Byte', 'KB', 'MB', 'GB', 'TB', 'PB')
  unit_index = 0
  # Climb one unit at a time, stopping at the last unit in the table.
  while unit_index < len(units) - 1 and filesize_input_B >= 1024 ** (unit_index + 1):
    unit_index += 1
  if unit_index:
    filesize_input_B = filesize_input_B / (1024 ** unit_index)
  return "{:.2f}".format(filesize_input_B) + ' ' + units[unit_index]

def _download_proxies():
  """Build the ``proxies`` mapping for requests from the module proxy settings."""
  # A SOCKS5 proxy takes precedence over the plain http/https proxy.
  if socks5_proxy != "":
    socks_url = "socks5://" + socks5_proxy
    return {'http': socks_url, 'https': socks_url}
  if using_proxy != "":
    return {'http': using_proxy, 'https': using_proxy}
  return {}


def _report_skipped_download(img_file, reason):
  """Report a file that is being given up on.

  Uses the module-level ``print_e`` hook when one is defined; otherwise falls
  back to ``print`` so that giving up never raises ``NameError``.
  """
  reporter = globals().get("print_e")
  if callable(reporter):
    reporter(img_file, reason)
  else:
    print("Skipped: {} ({})".format(img_file, reason))


def download_file_with_speed_showing(download_url, file_dir_i, img_file, last_chunk = 0, retry_time = 0, t_size = 0, no_down = 0):
  """Download one URL to ``file_dir_i/img_file`` with progress output and resume.

  Data is streamed into ``<img_file>.downloading`` and the file is renamed to
  its final name only once at least ``total size`` bytes have been written.
  Connection failures, missing ``content-length`` headers and interrupted
  streams are retried in a loop; every retry consumes one unit of the shared
  budget ``retry_times``. An interrupted stream is resumed with an HTTP
  ``Range`` request starting at the current size of the partial file.

  Reads the module settings ``using_proxy``, ``socks5_proxy``,
  ``json_headers_download`` and ``retry_times``.

  Args:
    download_url: URL to fetch.
    file_dir_i: Directory in which the file is stored.
    img_file: Final file name inside ``file_dir_i``.
    last_chunk: Byte offset to resume from; 0 starts a fresh download and
      discards any stale ``.downloading`` file.
    retry_time: Number of retries already consumed.
    t_size: Known total size in bytes; 0 means "read it from the response".
    no_down: When 1, only open the connection and return the response object
      without writing anything to disk.

  Returns:
    The open ``requests`` response when ``no_down == 1`` and the connection
    succeeded; otherwise ``None``.

  Raises:
    SystemExit: Via ``show_state(0, ...)`` when the server answers with a
      status code other than 200 or 206.
  """
  set_proxies = _download_proxies()

  pri_addr_file = "{}/{}.downloading".format(file_dir_i, img_file)  # name used while downloading
  finish_addr_file = "{}/{}".format(file_dir_i, img_file)  # name of the completed file

  content_size = last_chunk  # bytes already on disk
  total_size = t_size
  size_known = t_size != 0
  attempt = retry_time
  failure_reason = "Retry count is finished"

  # A fresh download must not append to a stale partial file.
  if no_down == 0 and content_size == 0 and os.path.isfile(pri_addr_file):
    os.remove(pri_addr_file)

  session = requests.Session()
  keep_session = False  # set when an open response is handed back to the caller
  try:
    while True:
      dl_header = {'Range': 'bytes={}-'.format(content_size)}  # resume offset
      dl_header.update(json_headers_download)

      # Open the streaming request; a failed connection is retried after 2s.
      try:
        r = session.get(download_url, headers=dl_header, stream=True, proxies=set_proxies, timeout=60)
      except requests.exceptions.RequestException:
        if attempt >= retry_times:
          print('Retry count is finished, skip it.')
          break
        print("Wait for 2s, because connection was broken")
        time.sleep(2)
        attempt += 1
        continue

      st_code = r.status_code
      # A bad status code is fatal: stop the program.
      if st_code not in [200, 206]:
        r.close()
        show_state(0, "Error: HTTP NOT 200, state_code={}".format(st_code))

      if st_code == 200 and content_size != 0:
        # The server ignored the Range header and is sending the whole file
        # again, so the partial data must be overwritten, not appended to.
        content_size = 0

      if not size_known:
        # content-length covers the bytes still to come, so add the offset.
        try:
          total_size = content_size + int(r.headers['content-length'])
        except (KeyError, ValueError):
          r.close()  # release the connection
          if attempt >= retry_times:
            failure_reason = "Download an image error, err: content-length"
            break
          sleep_t = 10*(attempt+1)  # wait 10s longer on every retry
          print("Fail to fetch content-length, retry_count={} sleep_time={}(Sec) state_code={}".format(attempt, sleep_t, st_code))
          time.sleep(sleep_t)
          attempt += 1
          continue
        size_known = True
        if no_down == 0:
          print('Start download file:{} --- Size: {}'.format(pri_addr_file, file_size_trans(total_size)))

      if no_down == 1:
        # The caller only wants the open response; keep the session alive for it.
        keep_session = True
        return r

      # Stream the body to disk, appending when resuming.
      file_mode = 'ab' if content_size != 0 else 'wb'
      try:
        with open(pri_addr_file, file_mode) as file:
          start_time = time.time()
          temp_size = content_size  # size at the previous progress report
          for content in r.iter_content(chunk_size=1024*4):
            file.write(content)
            content_size += len(content)
            # Report progress roughly once per second.
            if time.time() - start_time > 1:
              start_time = time.time()
              process_p = (content_size / total_size) * 100 if total_size else 100.0
              content_speed = content_size - temp_size
              print('Process: {:.1f}% Speed: {}/S File_size={}'.format(process_p, file_size_trans(content_speed), file_size_trans(content_size)))
              if content_size > total_size:
                break
              else:
                temp_size = content_size
      except requests.exceptions.RequestException as e:
        # A broken stream is a lost connection: fall through to the retry logic.
        print(e)
      finally:
        r.close()

      if content_size >= total_size:
        print('100% Size: {}'.format(file_size_trans(os.path.getsize(pri_addr_file))))
        # Renaming only after completion distinguishes finished files from partial ones.
        os.replace(pri_addr_file, finish_addr_file)
        return None

      if attempt >= retry_times:
        print('Retry count is finished, skip it.')
        break
      # Resume from what is actually on disk.
      content_size = os.path.getsize(pri_addr_file)
      print('Lost connection, Retry...Now: count={}, File_size={}'.format(attempt, file_size_trans(content_size)))
      attempt += 1

    # Only reached after giving up.
    _report_skipped_download(img_file, failure_reason)
    if no_down == 0 and os.path.isfile(pri_addr_file):
      os.remove(pri_addr_file)
    return None
  finally:
    if not keep_session:
      session.close()

# The download links must be passed in as a list
def download_img(dl_img_url, file_dir_i):
  """Download every URL in ``dl_img_url`` into the directory ``file_dir_i``.

  The file name is the part of the URL after the last ``/``. An existing file
  with that name is skipped when the module setting ``re_download`` is 0 and
  deleted and downloaded again otherwise.

  Args:
    dl_img_url: List of URLs to download.
    file_dir_i: Directory the files are saved into.
  """
  for img_url in dl_img_url:
    img_file = img_url.rsplit("/", 1)[-1]
    if not img_file:
      # A URL ending in "/" carries no file name to save under.
      print("Skiped:", img_url)
      continue
    finish_addr_file = "{}/{}".format(file_dir_i, img_file)

    # A file with the same name already exists
    if os.path.isfile(finish_addr_file):
      if re_download == 0:
        # Not in re-download mode: skip it
        print("Skiped:", img_file)
        continue
      # Re-download mode: delete the previous file first
      os.remove(finish_addr_file)
      print("Deleted:", img_file)

    download_file_with_speed_showing(img_url, file_dir_i, img_file)
    # The downloader renames the file to its final name only on success.
    if os.path.isfile(finish_addr_file):
      print("Finished:", img_file)
#图片下载函数---END
    



# First argument: a list of download URLs; second argument: a string giving the directory to save into.
download_img(['https://xxx.xxx.xxx/file.xxx'], "./")