Recently I eyed the nearly 1 TB of free local storage I have lying around and figured I should put it to good use and fill it with something, so...
This is my first attempt at writing a crawler.
```python
import re
import socket
import time
import traceback

import requests

number = 1      # running index used to name the saved files
tab = 50        # crawl list pages 1 .. tab-1
sleepTime = 5   # pause between downloads, to go easy on the server
inUrl = []      # post links collected from the current list page
proxies = {"http": "http://127.0.0.1:10810", "https": "http://127.0.0.1:10810"}

s = requests.session()
# The next two lines are widely copied tweaks; requests effectively
# ignores both, but they are kept from the first draft.
s.keep_alive = False
requests.adapters.DEFAULT_RETRIES = 5
socket.setdefaulttimeout(5)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0'
}

for t in range(1, tab):
    print("Downloading page " + str(t))
    try:
        response = s.get('https://konachan.net/post?page=' + str(t),
                         headers=headers, proxies=proxies)
        outHtml = response.text
        # A capture group pulls the per-post link out of each thumbnail
        # anchor; this replaces the fragile fixed-offset slicing
        # (i[23:91]) of the first draft.
        tuUrl = re.findall('<a class="thumb" href="(.*?)">', outHtml)
        print(tuUrl)
        print("Images on this page: " + str(len(tuUrl)))
        for i in tuUrl:
            print("Post link: " + i)
            inUrl.append(i)
        for p in inUrl:
            try:
                response = s.get('https://konachan.net' + p,
                                 headers=headers, proxies=proxies)
                outHtml = response.text
                # The full-size image hides behind the "Download larger
                # version" anchor whose id is "highres".
                pngurl = re.findall('href="(.*?)" id="highres"', outHtml)
                if not pngurl:
                    continue  # no larger version on this post
                pngDown = pngurl[0]
                print(pngDown)
                print("Starting download")
                # Add verify=False to the get() call if the proxy raises
                # a ProxyError.
                png = s.get(pngDown, headers=headers, proxies=proxies)
                fileName = str(number) + pngDown[-4:]  # keep the extension
                print(fileName)
                with open(fileName, 'wb') as f:
                    f.write(png.content)
                print("Image " + str(number) + " saved")
                number += 1
                time.sleep(sleepTime)
            except Exception:
                print("can't download or save image " + str(number))
                traceback.print_exc()
                print(inUrl)
                time.sleep(sleepTime)
        inUrl = []
        print("Page " + str(t) + " done")
    except Exception:
        print("can't fetch page " + str(t))
```
You will need some properly working "magic" (a proxy) for this to run at all.
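If you are not sure the proxy is actually reachable, a quick sanity check like this saves some head-scratching. It uses the same proxies dict as the script; the 5-second timeout is my own choice:

```python
import requests

proxies = {"http": "http://127.0.0.1:10810", "https": "http://127.0.0.1:10810"}

try:
    # Any lightweight page works; we only care that the round trip succeeds.
    r = requests.get("https://konachan.net/post?page=1",
                     proxies=proxies, timeout=5)
    print("proxy OK, status", r.status_code)
except requests.RequestException as e:
    print("proxy not working:", e)
```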
Gentlemen can change the .net to .坑 to suit their own needs.
There are still a fair few bugs at the moment, but it works.
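Most of those bugs come from parsing HTML with regular expressions. As far as I know Konachan runs Moebooru, which exposes a JSON listing at /post.json; a sketch of that less fragile route, assuming page and limit behave as standard Moebooru parameters:

```python
import requests

proxies = {"http": "http://127.0.0.1:10810", "https": "http://127.0.0.1:10810"}

# One request per page returns structured data instead of HTML to regex over.
resp = requests.get("https://konachan.net/post.json",
                    params={"page": 1, "limit": 21},
                    proxies=proxies, timeout=10)
for post in resp.json():
    # 'file_url' points straight at the full-size image, so the per-post
    # page scrape above becomes unnecessary.
    print(post["id"], post["file_url"])
```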

Downloads act up from time to time.

But it doesn't really matter;
it's just a "tiny" bit slow.
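Most of the slowness is the per-file sleep plus downloading one image at a time. If the site tolerates it, a small thread pool would help; a minimal sketch, assuming the pngDown links have already been collected into a list (the empty urls value and the pool size of 4 are placeholders):

```python
from concurrent.futures import ThreadPoolExecutor

import requests

proxies = {"http": "http://127.0.0.1:10810", "https": "http://127.0.0.1:10810"}

def fetch(job):
    idx, url = job
    r = requests.get(url, proxies=proxies, timeout=30)
    name = str(idx) + url[-4:]  # same naming scheme as the crawler
    with open(name, 'wb') as f:
        f.write(r.content)
    return name

urls = []  # hypothetical: fill with the pngDown links collected above
with ThreadPoolExecutor(max_workers=4) as pool:
    for name in pool.map(fetch, enumerate(urls, start=1)):
        print(name, "saved")
```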
The results:

