Recently I glanced at the nearly 1 TB of free local storage on xx and figured I should put it to good use and fill it with something, so...
This is my first time writing a crawler.
```python
import requests
import re
import socket
import time
import traceback

number = 1          # running image counter
tab = 50            # number of listing pages to crawl
sleepTime = 5
inUrl = []
proxies = {"http": "http://127.0.0.1:10810", "https": "http://127.0.0.1:10810"}
s = requests.session()
s.keep_alive = False
requests.adapters.DEFAULT_RETRIES = 5
socket.setdefaulttimeout(5)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0'
}

for t in range(1, tab):
    print("Downloading page " + str(t))
    try:
        response = requests.get('https://konachan.net/post?page=' + str(t),
                                headers=headers, proxies=proxies)
        outHtml = response.text
        # print(outHtml)
        tuUrl = re.findall('<a class="thumb" href=".*?">', outHtml)
        print(tuUrl)
        print("Images on this page: " + str(len(tuUrl)))
        # collect the post addresses
        for i in tuUrl:
            print("List slice: " + i)
            inUrl.append(i[23:91])  # slice the post path out of the <a> tag
        # fetch each post page and pull out the original-file link
        for p in inUrl:
            try:
                response = requests.get('https://konachan.net/' + str(p),
                                        headers=headers, proxies=proxies)
                outHtml = response.text
                pngurl = re.findall('<a class="original-file-.*?" href=".*?" id=".*?">Download larger version .*?/a>', outHtml)
                print(pngurl)
                pngurl = re.findall('href=".*?" id="highres">', str(pngurl))
                print(pngurl)
                pngDown = str(pngurl)[8:-17]  # cut the bare image URL out of the match
                print(pngDown)
                print("Starting download")
                # add verify=False to requests.get if you hit a ProxyError
                png = requests.get(pngDown, headers=headers, proxies=proxies)
                fileName = str(number) + pngDown[-4:]
                print(fileName)
                with open(fileName, 'ab') as f:
                    f.write(png.content)
                print("Image " + str(number) + " downloaded")
                number += 1
                time.sleep(sleepTime)
            except Exception:
                print("can't download or save image " + str(number))
                traceback.print_exc()
        print(inUrl)
        time.sleep(sleepTime)
        inUrl = []
        print(t)
    except Exception:
        print("can't fetch page " + str(t))
```
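Side note: the fixed-offset slicing above (`i[23:91]`, `str(pngurl)[8:-17]`) breaks the moment the markup shifts by even one character. A less brittle variant is to let the regex capture the href value directly. This is only a sketch assuming the same markup as above; the sample HTML fragment is made up:

```python
import re

# Capture the href value itself instead of slicing the whole tag at fixed offsets.
thumb_re = re.compile(r'<a class="thumb" href="([^"]+)"')
highres_re = re.compile(r'href="([^"]+)" id="highres"')

sample = '<a class="thumb" href="/post/show/12345/some-tags">'  # made-up fragment
print(thumb_re.findall(sample))  # ['/post/show/12345/some-tags']
```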
You'll need a bit of proper "magic" (a working proxy).
Gentlemen can change the .net to .坑 to suit their own needs.
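If you'd rather not hard-code the proxy, requests also reads the standard HTTP_PROXY / HTTPS_PROXY environment variables (trust_env is on by default). A minimal sketch, assuming the same local port as the script above:

```python
import os
import requests

# requests picks these up automatically when trust_env is True (the default)
os.environ["HTTP_PROXY"] = "http://127.0.0.1:10810"
os.environ["HTTPS_PROXY"] = "http://127.0.0.1:10810"

resp = requests.get("https://konachan.net/post?page=1", timeout=10)
print(resp.status_code)
```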
There are still a fair few bugs, but it's usable.
The downloads act up from time to time,
but it doesn't really break anything;
it just gets a "tiny bit" slower (see the retry sketch below).
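If the hiccups are mostly transient network errors through the proxy, a small retry-with-backoff wrapper around each download would smooth them out. A sketch under that assumption; `fetch_with_retry` is a hypothetical helper, not part of the script above:

```python
import time
import requests

def fetch_with_retry(url, headers, proxies, tries=3, backoff=5):
    """Hypothetical helper: retry a flaky GET a few times with a growing delay."""
    for attempt in range(1, tries + 1):
        try:
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException as e:
            print("attempt %d failed: %s" % (attempt, e))
            if attempt == tries:
                raise
            time.sleep(backoff * attempt)  # back off a little more each time
```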
The results: