Recently I noticed xx has close to a terabyte of free local storage, and decided to put it to good use and fill it with something, so here we are.

This is my first time writing a crawler.

import requests
import re
import socket
import time
import traceback

number = 1
tab = 50  # listing pages to crawl
sleepTime = 5

inUrl = []
proxies = {"http": "http://127.0.0.1:10810", "https": "http://127.0.0.1:10810"}
s = requests.session()
s.keep_alive = False  # note: requests.Session has no keep_alive attribute, so this line has no effect
requests.adapters.DEFAULT_RETRIES = 5  # see the retry note after the script
socket.setdefaulttimeout(5)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0'
}

for t in range(1, tab):
    print("开始下载第"+str(t)+"页")
    try:
        response = requests.get('https://konachan.net/post?page=' + str(t), headers=headers, proxies=proxies)
        outHtml = response.text
        # print(outHtml)
        # each thumbnail anchor links to its post page; a capture group grabs the href directly
        tuUrl = re.findall('<a class="thumb" href="(.*?)">', outHtml)
        print(tuUrl)

        print("Images on this page: " + str(len(tuUrl)))

        # collect the post-page paths

        for i in tuUrl:
            print("post path " + i)
            inUrl.append(i)  # the capture group replaces the brittle i[23:91] string slicing

        # fetch each post page and pull out the full-size image link

        for p in inUrl:
            try:
                response = requests.get('https://konachan.net' + p, headers=headers, proxies=proxies)
                outHtml = response.text
                # the post page links the full-size file with id="highres"
                pngurl = re.findall('href="(.*?)" id="highres"', outHtml)
                print(pngurl)
                if not pngurl:
                    print("no highres link on " + p)
                    continue
                pngDown = pngurl[0]
                print(pngDown)
                print("Downloading")
                png = requests.get(pngDown, headers=headers, proxies=proxies)  # add verify=False here if you hit a ProxyError
                fileName = str(number) + pngDown[-4:]  # keep the original file extension
                print(fileName)

                with open(fileName, 'wb') as f:  # 'wb' writes a fresh file; the with block closes it automatically
                    f.write(png.content)
                print("Image " + str(number) + " downloaded")
                number += 1
                time.sleep(sleepTime)
                # image saved
            except Exception:
                print("can't download or save image " + str(number))
                traceback.print_exc()


        print(inUrl)
        time.sleep(sleepTime)
        inUrl = []  # reset the list for the next listing page
        print(t)
    except Exception:
        print("can't fetch listing page " + str(t))
        traceback.print_exc()

You'll need some decent magic (a working local proxy; that's what the 127.0.0.1:10810 settings are for).
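If you're not sure your magic is working, a quick sanity check like this saves a lot of confusion (a sketch; adjust the port to wherever your own proxy listens):

import requests

proxies = {"http": "http://127.0.0.1:10810", "https": "http://127.0.0.1:10810"}
try:
    # any cheap request through the proxy will do
    r = requests.get('https://konachan.net', proxies=proxies, timeout=5)
    print("proxy OK, status " + str(r.status_code))
except requests.exceptions.ProxyError:
    print("proxy is down or the port is wrong")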
Gentlemen can swap .net for .坑 to suit their own needs.
There are still quite a few bugs at the moment, but it works.

Downloads act up from time to time,

but that doesn't really matter,

it just gets slower by a "tiny" bit.
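If the slowness bothers you, one hypothetical speed-up is to fetch several images in parallel with a thread pool instead of sleeping after every single one; image_urls below is a stand-in for the highres links the script collects, not a variable from the original code, and keep max_workers small to stay polite to the site:

from concurrent.futures import ThreadPoolExecutor
import requests

headers = {'User-Agent': 'Mozilla/5.0'}
proxies = {"http": "http://127.0.0.1:10810", "https": "http://127.0.0.1:10810"}

def fetch(index, url):
    # download one image and save it under a numbered filename
    r = requests.get(url, headers=headers, proxies=proxies, timeout=30)
    with open(str(index) + url[-4:], 'wb') as f:
        f.write(r.content)

image_urls = []  # fill with the highres links scraped as in the script above
with ThreadPoolExecutor(max_workers=4) as pool:
    for i, url in enumerate(image_urls, start=1):
        pool.submit(fetch, i, url)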

The results:
