前言

基于 Python3 requests包的下载器,因为需要挂代理爬一些文件,所以写了这个下载器

而代理隧道不稳定,经常断开连接,导致文件经常无法下载完整

此下载器支持网速显示、文件大小显示、下载百分比、失去连接重试、网络不佳重试、断点续传、支持走http/s或socks5代理隧道

支持url列表批量下载

没有多线程

适用于下载较大且链接不稳定的文件

批量url下载时,若遇到不能下载的url,将跳过,不影响后面的url下载

跳过的url将会在下载结束时全部显示

演示

连接不稳定,失去连接后会重试,断点续传,无需重新下载

retry

连接断开,将等待2s后重试

360截图17040515195637

获取响应头失败,将等待一段时间后自动重试,每等待一次自动加时

re_connect

参数设置简单,可设置代理地址,可设置重试时间,可设置referer

360截图17700518064345

未下载完成的文件将重命名为.downloading后缀

360截图180601178711380

程序

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import requests
import os, time




# HTTP/HTTPS proxy address, e.g. "127.0.0.1:19180"; empty string disables it
using_proxy = ""
# SOCKS5 proxy; takes precedence over the HTTP/HTTPS proxy — when this is
# non-empty the HTTP/HTTPS proxy above has no effect
socks5_proxy = ""
# Default request headers sent with every download (referer configurable)
json_headers_download = {"user-agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", "referer" : ""}

re_download = 0 # 0 or 1; when set to 1, files that already exist are re-downloaded
retry_times = 15 # number of retries after a download is interrupted


# Status output
def show_state(state_type, inf_input):
    """Print a tagged status line; a fatal state aborts the program.

    state_type: 0 = fatal error (terminates the process), 1 = start,
        2 = working, 3 = finished; any other value prints an empty tag.
    inf_input: detail text appended to the status line.
    """
    labels = {
        0: "Error! STOP AND EXIT",
        1: "START",
        2: "Working",
        3: "Finished",
    }
    inf_str_type = labels.get(state_type, "")
    print("@STATE : {}({})".format(inf_str_type, inf_input))
    if state_type == 0:
        # Raise SystemExit directly: the builtin exit() used originally is
        # injected by the site module and not guaranteed in all environments.
        raise SystemExit()

# Unit conversion
def file_size_trans(filesize_input_B):
    """Return a human-readable size string for a byte count.

    filesize_input_B: size in bytes (number).
    Returns e.g. "500.00 Byte", "2.00 KB", "3.50 MB" — two decimals plus
    a unit. The original if/elif ladder stopped at TB, so petabyte-scale
    values fell through and were printed as raw bytes; the loop below
    saturates at TB instead.
    """
    units = ('Byte', 'KB', 'MB', 'GB', 'TB')
    size = float(filesize_input_B)
    unit_index = 0
    # Divide down by 1024 until the value fits the unit (or TB is reached).
    while size >= 1024 and unit_index < len(units) - 1:
        size = size / 1024
        unit_index += 1
    return "{:.2f} {}".format(size, units[unit_index])

def download_file_with_speed_showing(download_url, file_dir_i, img_file, last_chunk = 0, retry_time = 0, t_size = 0, no_down = 0):
    """Stream one file to disk with speed/progress output and resume support.

    Interrupted transfers resume via an HTTP Range header; retries are
    implemented as recursive calls bounded by the module-level retry_times.

    download_url: source URL.
    file_dir_i: destination directory.
    img_file: destination file name.
    last_chunk: byte offset to resume from (0 = fresh download).
    retry_time: current retry count.
    t_size: known total size in bytes (0 = read from Content-Length).
    no_down: when 1, only (re)issue the request and return the response
        object instead of writing the body to disk.
    """
    # Build the proxy mapping; SOCKS5 overrides HTTP/HTTPS when configured.
    if using_proxy == "":
        set_proxies = {}
    else:
        set_proxies = {'http': using_proxy, 'https': using_proxy}

    if socks5_proxy != "":
        set_proxies = {'http': "socks5://"+socks5_proxy, 'https': "socks5://"+socks5_proxy}

    pri_addr_file = "{}/{}.downloading".format(file_dir_i, img_file)  # in-progress file
    finish_addr_file = "{}/{}".format(file_dir_i, img_file)  # final file name
    down_finish = False  # set True once all bytes are on disk

    try:
        r = requests.Session()

        dl_header = {}
        dl_header['Range'] = 'bytes={}-'.format(last_chunk)  # resume point
        dl_header.update(json_headers_download)

        # Bytes already written to disk.
        content_size = last_chunk
        if content_size == 0:
            # Fresh download: discard any stale partial file.
            if os.path.isfile(pri_addr_file):
                os.remove(pri_addr_file)

        # Streamed request.
        try:
            r = r.get(download_url, headers=dl_header, stream=True, proxies=set_proxies, timeout=60)
        except Exception:
            # Was a bare except: — narrowed so that SystemExit raised by
            # show_state(0, ...) inside nested retries is no longer swallowed.
            print("Wait for 2s, because connection was broken")
            time.sleep(2)

            r = download_file_with_speed_showing(download_url, file_dir_i, img_file, 0, retry_time+1, 0, 1)
        st_code = r.status_code
        # A non-success status cannot be recovered from; abort.
        if st_code not in [200, 206]:
            show_state(0, "Error: HTTP NOT 200, state_code={}".format(st_code))

        total_size = 0
        if t_size == 0:
            # Total file size in bytes, from the response headers.
            try:
                total_size = int(r.headers['content-length'])
            except Exception:
                r.close()  # release the connection
                if retry_time < retry_times:
                    sleep_t = 10*(retry_time+1)  # back-off grows by 10s per retry
                    print("Fail to fetch content-length, retry_count={} sleep_time={}(Sec) state_code={}".format(retry_time, sleep_t, r.status_code))
                    time.sleep(sleep_t)

                    # Recursive re-request: on success the recursion unwinds
                    # and hands the fresh response back to this invocation.
                    r = download_file_with_speed_showing(download_url, file_dir_i, img_file, 0, retry_time+1, 0, 1)
                    if no_down == 0:
                        total_size = int(r.headers['content-length'])
                else:
                    # NOTE(review): print_e is not defined in this file —
                    # presumably a logging helper elsewhere; confirm it exists.
                    print_e(img_file, "Download an image error, err: content-length")

            if no_down == 0:
                print('Start download file:{} --- Size: {}'.format(pri_addr_file, file_size_trans(total_size)))
        else:
            total_size = t_size

        # BUGFIX: the original tested finish_addr_file here but then measured
        # pri_addr_file, raising when only the finished file existed.  Per the
        # original comment the intent is to resume a leftover ".downloading"
        # file; the no_down guard keeps nested no_down=1 calls from recursing
        # on this branch forever.
        if no_down == 0 and os.path.isfile(pri_addr_file):
            size = os.path.getsize(pri_addr_file)
            r = download_file_with_speed_showing(download_url, file_dir_i, img_file, size, retry_time+1, total_size, 1)

        if no_down == 1:
            return r  # caller only wanted the response object

        with open(pri_addr_file, 'ab+') as fh:
            # Download progress as a percentage.
            process_p = 0
            start_time = time.time()
            # Byte count as of the previous one-second speed sample.
            if last_chunk != 0:
                temp_size = last_chunk
            else:
                temp_size = 0
            # Stream the body in 4 KiB chunks.
            for content in r.iter_content(chunk_size=1024*4):
                fh.write(content)
                content_size += len(content)
                process_p = (content_size / total_size) * 100
                # Emit a progress line roughly once per second.
                if time.time() - start_time > 1:
                    start_time = time.time()
                    content_speed = content_size - temp_size
                    print('Process: {:.1f}% Speed: {}/S File_size={}'.format(process_p, file_size_trans(content_speed), file_size_trans(content_size)))

                    if content_size > total_size:
                        break
                    else:
                        temp_size = content_size

        if content_size >= total_size:
            down_finish = True
            print('100% Size: {}'.format(file_size_trans(os.path.getsize(pri_addr_file))))
        r.close()
    except requests.exceptions.RequestException as e:
        print(e)
        show_state(0, "Error")

    if down_finish:
        # Rename only after completion, so partially-downloaded files remain
        # identifiable by their ".downloading" suffix.
        os.rename(pri_addr_file, finish_addr_file)
    else:
        # Bounded retry: resume from the bytes already written.
        if content_size != 0 and retry_time < retry_times:
            size = os.path.getsize(pri_addr_file)
            print('Lost connection, Retry...Now: count={}, File_size={}'.format(retry_time, file_size_trans(size)))
            r.close()  # release the connection
            download_file_with_speed_showing(download_url, file_dir_i, img_file, size, retry_time+1, total_size, 0)
        else:
            print('Retry count is finished, skip it.')
            # NOTE(review): print_e is not defined in this file — confirm it
            # exists at runtime before relying on this path.
            print_e(img_file, "Retry count is finished")
            if os.path.isfile(pri_addr_file):
                os.remove(pri_addr_file)

# The download links must be passed in as a list.
def download_img(dl_img_url, file_dir_i):
    """Download every URL in dl_img_url into the directory file_dir_i.

    dl_img_url: list of URLs; each file name is the last path segment
        of its URL (text after the final "/").
    file_dir_i: destination directory (string).
    Honors the module-level re_download flag for already-existing files.
    """
    for img_url in dl_img_url:
        img_file = img_url[img_url.rindex("/")+1:]
        target_file = "{}/{}".format(file_dir_i, img_file)
        download_ctl = True

        # A file with the same name already exists.
        if os.path.isfile(target_file):
            if re_download == 0:
                # Keep-existing mode: skip this URL.
                download_ctl = False
                print("Skiped:", img_file)
            else:
                # Re-download mode. BUGFIX: the original called
                # os.remove(finish_addr_file), a NameError — that name is
                # local to download_file_with_speed_showing. Remove the
                # actual target path instead.
                os.remove(target_file)
                print("Deleted:", img_file)

        if download_ctl:
            download_file_with_speed_showing(img_url, file_dir_i, img_file)
            print("Finished:", img_file)
# Image download function --- END




# First argument: list of download URLs; second argument: destination directory (string).
download_img(['https://xxx.xxx.xxx/file.xxx'], "./")

结束

希望能帮到你

EOF