Python Crawler - Scraping Images from Tujigu (图集谷)


Introduction

I originally wrote this program for a friend; as for what it does, those who know, know. It was also a chance to learn the basics of Python web scraping, mainly the re and requests modules. A minimal sketch of that fetch-and-match pattern follows below.
To change the root download directory, search for FILE_PATH and edit it. The program creates a separate subfolder under that root for each image set.
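
A minimal sketch of the fetch-and-match pattern the whole script is built on. The domain and paths below are placeholders for illustration, not the real site:

import re
import requests

#  Fetch a listing page, then pull gallery links out of the raw HTML.
#  Placeholder URL: substitute the actual site you are scraping.
html = requests.get("https://example.com/t/1000/").text
galleries = re.findall(r"https://example\.com/a/[0-9]+/", html)
print(galleries)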

# Python batch downloader for Tujigu (图集谷) images, v1.5 by SmileSL
# ==== Parameters ====
# Root download folder: FILE_PATH, e.g. "D:/PixivDownload/自下/woMan/"
# URL examples: https://www.tuji001.com/t/1000 / https://www.tuji001.com/x/1000/
# Folder name example: 测试名称

# ==== getImage v1.3 ====
# 2020.10.23 - Improved the download feedback code (file chunks, progress bar, ...)

# ==== getImage v1.4 ====
# 2021.2.23 - Fixed the regex for direct image links
# 2021.2.23 - Added saving images into per-gallery subfolders
# 2021.2.23 - Fixed a bug where the per-gallery regex occasionally matched the wrong text

# ==== getImage v1.4 ====
# 2021.7.13 - Switched the image source to tjg.gzhuibei.com
# 2021.7.13 - Dropped the file-size calculation [content_size] (KeyError: 'content-length')
# 2021.7.13 - Added skipping files that already exist

# ==== getImage v1.5 ====
# 2021.7.21 - Added downloading a single gallery by its sub-page
# 2021.7.21 - Adjusted the crawl interval
# 2021.9.5  - Updated the site from www.tujigu.com to www.tujigu.net and fixed the regexes

# ==== getImage v1.5 ====
# 2021.12.19 - Updated the site from www.tujigu.net to www.tujigu001.com and fixed the regexes
# 2021.12.19 - Replaced plain HTTP downloads with multi-threaded aria2c downloads, and switched the save path to macOS style [PS: downloads are much faster now, why didn't I think of this before (º﹃º )]

# ==== Notice ====
# 2022.1.7 - The target site has closed public access; this script no longer works.

import requests
import os
import re

website = "https://www.tuji001.com"

headers = {
    # Referer matches the current site (the image host checks it to block hotlinking)
    "Referer": "https://www.tuji001.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/93.0.4577.63 Safari/537.36"
}

#  Image root directory - remember to change this
FILE_PATH = "/Volumes/Mac_Space/ACG/Girls/"


#  Create a directory under FILE_PATH
def mkdir(name):
    path = FILE_PATH + name

    # Create the directory only if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)
        print('[OS] ' + path + ' created')
    else:
        print('[OS] ' + path + ' already exists')


def openUrl(url):
    #  Fetch a page and return its HTML, decoded as UTF-8
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    return res.text


def getFirstUrl(url):
    #  e.g. https://www.tuji001.com/x/82/
    html = openUrl(url)

    pagingUrl = re.findall(r"https://www\.tuji001\.com/a/[0-9]*/", html)
    pagingUrl = sorted(set(pagingUrl), key=pagingUrl.index)  # de-duplicate while keeping order
    pagingUrl2 = []
    isXT = url[url.index("com/") + 4:url.index("com/") + 5]  # link type: the letter after "com/"
    if isXT == "x":
        pagingUrl2 = re.findall(r"/x/[0-9]*/index_[0-9]*\.html", html)  # pagination links on a personal homepage
    elif isXT == "t":
        pagingUrl2 = re.findall(r"/t/[0-9]*/index_[0-9]*\.html", html)
    else:
        print("[SMILE] URL type not recognized!")
        return

    #  Collect gallery links from each pagination page
    if pagingUrl2:
        pagingUrl2.pop()  # drop the "next page" link
        for i in pagingUrl2:
            html2 = openUrl(website + i)
            pagingUrl3 = re.findall(r"https://www\.tuji001\.com/a/[0-9]*/", html2)
            pagingUrl3 = sorted(set(pagingUrl3), key=pagingUrl3.index)  # de-duplicate while keeping order
            pagingUrl += pagingUrl3
    print(pagingUrl)
    print("[FIRST OK]")

    return pagingUrl  # e.g. https://www.tuji001.com/a/33532/


# Nicely optimized, ah sir
def getImage(pagingUrl):
    secondUrl = []
    x = 0

    for url in pagingUrl:  # https://www.tuji001.com/a/33532/
        secondUrl.append(url)
        html = openUrl(url)
        urlList = re.findall(r"https://www\.tuji001\.com/a/[0-9]*/[0-9]*\.html", html)
        urlList.pop()  # drop the trailing "next page" link
        lastPageUrl = urlList[len(urlList) - 1]
        #  The page number is the (at most two) characters just before ".html"
        lastPage = lastPageUrl[lastPageUrl.index(".html") - 2: lastPageUrl.index(".html")]

        if '/' in lastPage:  # single-digit page number: the slice picked up the "/" as well
            lastPage = lastPage[-1:]

        for i in range(2, int(lastPage) + 1):
            secondUrl.append(url + str(i) + ".html")  # https://www.tuji001.com/a/33532/2.html ... n.html

        x += 1
        biLi = x / len(pagingUrl) * 100
        print("\r[SMILE] Loaded %.2f%%" % biLi, end=" ")
    print("\n[SECOND OK]")
    print(secondUrl)
    return secondUrl


def downImage(urls):
    imgIndex = 1

    fileName = input("[SMILE] Folder to create: ")
    mkdir(fileName)

    path = FILE_PATH + fileName

    #  urls = urls[urls.index('https://www.tuji001.com/a/25741/'): -1]  # (leftover debug: resume from a specific gallery)

    for url in urls:
        html = openUrl(url)

        #  Keep only the content block of the page
        html2 = html[html.index('class="content"'): html.index('read2()')]
        imgUrl = re.findall(r"https://tjg\.gzhuibei\.com/a/1/[0-9]*/[0-9]*\.jpg", html2)
        #  Gallery title
        rName = re.findall("<h1>(.*?)</h1>", html)[0]
        #  Derive a per-gallery subfolder name from the title
        try:
            #  Format 1: name wrapped in 《》
            tName = re.findall("(?<=《).*?(?=》)", rName)[0]
        except IndexError:
            #  Format 2: "... - name"
            if '-' in rName:
                tName = rName.split('-')[-1].replace(' ', '')
                if '/' in tName:  # "/" would break folder creation
                    tName = tName[:tName.index('/')]
            elif '/' in rName:
                #  Format 3: "name/..."
                tName = rName[rName.index('/') + 1:]
            else:
                tName = rName.replace(' ', '')

            #  "/" would break folder creation
            if '/' in tName:
                tName = tName[:tName.index('/')]

        #  Create the gallery's own subfolder
        if tName:
            os.makedirs(path + '/' + tName, exist_ok=True)

        #  "/" would break the file name too
        if '/' in rName:
            rName = rName[:rName.index('/')]

        for link in imgUrl:
            file_name = rName + str(imgIndex)
            file_dir = path + '/' + tName
            if os.path.exists(file_dir + '/' + file_name + ".jpg"):
                print("[SMILE] File already exists, skipping")
                imgIndex += 1
                continue  # skip just this image instead of abandoning the rest of the page
            else:
                aria2Down(link, file_name, file_dir)
                imgIndex += 1


# aria2 invocation helper
def aria2Down(link, file_name, file_dir):
    exe_path = r'/usr/local/Cellar/aria2/1.36.0/bin/aria2c'
    # -s16: split the download into 16 pieces; -x10: up to 10 connections per server
    file_name = '-o "' + str(file_name) + '.jpg" '
    file_dir = '--dir="' + str(file_dir) + '" '
    order = exe_path + ' -s16 -x10 ' + file_dir + file_name + '"' + link + '"'
    os.system(order)


if __name__ == "__main__":
    #  https://www.tuji001.com/
    isEx = input("[SMILE] 1. Homepage (index) mode  2. Single-gallery (sub-page) mode : ")
    #  isEx = '1'

    if isEx == '1' or isEx == '1.':
        url = input("[SMILE] URL: https://www.tuji001.com/")

        firstUrl = getFirstUrl("https://www.tuji001.com/" + url + "/")
        urls = getImage(firstUrl)
        downImage(urls)
    elif isEx == '2' or isEx == '2.':
        url = input("[SMILE] URL: https://www.tuji001.com/a/")

        firstUrl = ["https://www.tuji001.com/a/" + url + "/"]
        urls = getImage(firstUrl)
        downImage(urls)

    print("\n[SMILE] Download complete 100%")