-
garvey说:
利用 Python 爬取豆瓣 Top250,并给出源代码
from bs4 import BeautifulSoup
from lxml import etree
import requests

# Entry URL of the Douban Top 250 movie list.
url = 'https://movie.douban.com/top250'
# Browser-like User-Agent sent with every request made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
}

# Seconds to wait for a response before giving up, so a stalled
# connection cannot hang the crawl forever.
REQUEST_TIMEOUT = 10
# NOTE(review): assumes the list is served 25 entries per page and that
# pages are selected with the ``start`` query parameter -- confirm.
PAGE_SIZE = 25
TOTAL_MOVIES = 250


def _fetch_html(page_url):
    """Download *page_url* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            non-2xx HTTP status.
    """
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error page instead of parsing it as if it were data.
    response.raise_for_status()
    return response.content.decode('utf-8')


def _text_or_none(tag):
    """Return ``tag.text``, or ``None`` when the lookup found no tag."""
    return tag.text if tag is not None else None


def get_info(url):
    """Fetch one list page and print the details of every movie linked on it.

    Args:
        url: address of a Top 250 list page.
    """
    html = etree.HTML(_fetch_html(url))
    # Each list entry links to its movie's detail page.
    movie_urls = html.xpath("//div[@class='hd']/a/@href")
    for movie_url in movie_urls:
        get_detail(movie_url)


def get_detail(movie_url):
    """Fetch a movie detail page and print the fields scraped from it.

    Single-valued fields that are absent from the page are printed as
    ``None`` rather than aborting the crawl.

    Args:
        movie_url: address of the movie's detail page.
    """
    soup = BeautifulSoup(_fetch_html(movie_url), 'lxml')

    # Movie title
    title = _text_or_none(soup.find('span', property="v:itemreviewed"))
    # Rating
    score = _text_or_none(soup.find('strong', class_="ll rating_num"))
    # Director (first match only)
    director = _text_or_none(soup.find('a', rel="v:directedBy"))
    # Leading actors
    actors = [actor.text for actor in soup.find_all('a', rel="v:starring")]
    # Genres
    movie_type = [genre.text for genre in soup.find_all('span', property="v:genre")]
    # Release date (first match only)
    date = _text_or_none(soup.find('span', property="v:initialReleaseDate"))
    # Running time
    runtime = _text_or_none(soup.find('span', property="v:runtime"))
    # Synopsis
    content = _text_or_none(soup.find('span', property="v:summary"))
    # Poster image URL
    poster_tag = soup.find('img', rel="v:image")
    poster = poster_tag.get('src') if poster_tag is not None else None

    # Print the result
    print('片名:{}'.format(title))
    print('评分:{}'.format(score))
    print('导演:{}'.format(director))
    print('主演:{}'.format(actors))
    print('类型:{}'.format(movie_type))
    print('上映时间:{}'.format(date))
    print('片长:{}'.format(runtime))
    print('简介:{}'.format(content))
    print('海报:{}'.format(poster))


if __name__ == '__main__':
    # Walk every page of the list, not just the first 25 entries.
    for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
        get_info('{}?start={}'.format(url, start))
-
garvey说:
利用 Python 爬取豆瓣 Top250 并保存到文件夹中,并给出源代码
import os

import requests
from bs4 import BeautifulSoup

# Entry URL of the Douban Top 250 movie list.
url = 'https://movie.douban.com/top250'
# Browser-like User-Agent sent with every request.
# NOTE(review): the site may reject the default python-requests
# User-Agent -- confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
}
# Directory that receives one text file per movie.
folder_name = '豆瓣电影TOP250'

# Seconds to wait for a response before giving up.
REQUEST_TIMEOUT = 10
# NOTE(review): assumes the list is served 25 entries per page and that
# pages are selected with the ``start`` query parameter -- confirm.
PAGE_SIZE = 25
TOTAL_MOVIES = 250


def _fetch_movies(page_url):
    """Download one list page and return its ``<li>`` movie entries.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            non-2xx HTTP status.
        RuntimeError: if the page has no element with class ``grid_view``.
    """
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    grid = soup.find(class_='grid_view')
    if grid is None:
        raise RuntimeError('grid_view list not found at {}'.format(page_url))
    return grid.find_all('li')


def _tag_string(tag, default=''):
    """Return ``tag.string`` as ``str``; *default* if the tag or string is missing."""
    if tag is None or tag.string is None:
        return default
    return str(tag.string)


def _save_movie(movie):
    """Write the name, rating and one-line quote of *movie* to its own file.

    Entries without a title are skipped because the title is the file
    name. A missing rating or quote is written as an empty value.
    """
    movie_name = _tag_string(movie.find(class_='title'))
    if not movie_name:
        return
    movie_score = _tag_string(movie.find(class_='rating_num'))
    movie_intro = _tag_string(movie.find(class_='inq'))

    # Path separators inside a title would otherwise point into a
    # directory that does not exist.
    safe_name = movie_name.replace('/', '_').replace('\\', '_')
    path = os.path.join(folder_name, safe_name + '.txt')
    # Explicit UTF-8 so the Chinese text is written the same way on every
    # platform, whatever the locale's default encoding is.
    with open(path, 'w', encoding='utf-8') as f:
        f.write('电影名称:' + movie_name + '\n')
        f.write('电影评分:' + movie_score + '\n')
        f.write('电影简介:' + movie_intro + '\n')


def main():
    """Scrape every page of the list and save each movie under ``folder_name``."""
    os.makedirs(folder_name, exist_ok=True)
    for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
        for movie in _fetch_movies('{}?start={}'.format(url, start)):
            _save_movie(movie)
    print('保存完毕!')


if __name__ == '__main__':
    main()