A small web crawler
-
Crawls all the images on the site below and saves them into per-gallery folders (one folder per page, named after the page title).
Site: https://zazhitaotu.cc
Requires: Python 3.8+
Modules: requests, bs4
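The dependencies can be installed with pip; note that the `bs4` module is typically provided by the `beautifulsoup4` package, so `pip install requests beautifulsoup4` should cover both.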
Script:

```python
import os
import random
import re

import bs4
import requests

# Build the list of paginated index URLs for the whole site (25 pages).
urls = [f"https://zazhitaotu.cc/page/{page}/" for page in range(1, 26)]


def random_ip():
    """Return a random IPv4 address, used to spoof the X-Forwarded-For header."""
    return ".".join(str(random.randint(1, 255)) for _ in range(4))


def parsing(url):
    """Fetch a page and return its <a> tags, <img> tags, and title."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/92.0.4515.107 Safari/537.36",
        "X-Forwarded-For": random_ip(),
    }
    response = requests.get(url, headers=headers).content
    soup = bs4.BeautifulSoup(response, "html.parser")
    a_tags = soup.find_all("a")
    img_tags = soup.find_all("img")
    title = soup.find("title").string
    return a_tags, img_tags, title


def download():
    for index_url in urls:
        a_tags, _, _ = parsing(index_url)
        # Gallery (article) links found on the index page.
        article_urls = re.findall(
            r"https://zazhitaotu\.cc/archives/[0-9]*\.html", str(a_tags)
        )
        for article_url in article_urls:
            _, img_tags, title = parsing(article_url)
            img_urls = re.findall(
                r"https://zazhitaotu\.cc/usr/uploads/[0-9]*/[0-9]*/[0-9]*\.jpg",
                str(img_tags),
            )
            # One folder per gallery, named after the page title; an existing
            # folder is treated as already downloaded and the gallery is skipped.
            try:
                os.mkdir(title)
            except FileExistsError:
                print("Folder already exists, skipping")
                continue
            for img_url in img_urls:
                data = requests.get(img_url).content
                filename = img_url.split("/")[-1]
                with open(f"{title}/{filename}", "wb") as f:
                    f.write(data)


if __name__ == "__main__":
    download()
```
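One design note: the script treats an existing folder as an already-downloaded gallery and skips it entirely, so an interrupted run is never resumed. A minimal alternative, sketched below with a hypothetical helper (`save_image` is not part of the original script), skips individual files instead of whole folders, which lets a re-run pick up images that were missed:

```python
import os

import requests


def save_image(img_url, folder):
    """Download img_url into folder, skipping files that already exist.

    Hypothetical helper, not part of the original script: it replaces the
    per-folder skip with a per-file skip so interrupted runs can resume.
    """
    os.makedirs(folder, exist_ok=True)  # create the gallery folder if needed
    path = os.path.join(folder, img_url.split("/")[-1])
    if os.path.exists(path):            # image already saved on a previous run
        return
    data = requests.get(img_url, timeout=30).content
    with open(path, "wb") as f:
        f.write(data)
```

With a helper like this, the `try`/`except` block and the inner loop in `download()` would collapse to a single `for img_url in img_urls: save_image(img_url, title)`.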