Scraping Douban Top 250 Movies and Dangdang Data
Resource description:
(2) Write the crawler and use the Urllib or Requests library to fetch the homepage data from the server URL.
(3) Parse the data with RE, BS4, or XPath, extracting each book's ID, title, author, publisher, publication date, price, synopsis, and cover-image URL; implement pagination so that every result page is crawled (an XPath sketch follows this list).
(4) Persist the data: write all parsed records to a .CSV file; save every book's cover image to a "download" folder in the current directory; and store all parsed records in a database (MySQL or MongoDB).
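
Requirement (3) leaves the choice of parser open among RE, BS4, and XPath. The full program below uses BeautifulSoup (BS4); for reference, a minimal XPath sketch with lxml might look like the following. Treat the element paths as assumptions: they mirror the class names (bigimg, search_book_author, search_now_price) used by the BS4 code, and they will break if Dangdang changes its markup.

# Minimal XPath sketch, assuming Dangdang's result list still uses the
# ul.bigimg / p.search_book_author markup seen in the BS4 version below.
from lxml import etree

def parse_with_xpath(html: str):
    tree = etree.HTML(html)
    books = []
    for li in tree.xpath('//ul[@class="bigimg"]/li'):
        title = li.xpath('./a/@title')
        author = li.xpath('.//p[@class="search_book_author"]//a/@title')
        price = li.xpath('.//span[@class="search_now_price"]/text()')
        # xpath() returns lists; take the first hit or fall back to ""
        books.append({
            "title": title[0] if title else "",
            "author": author[0] if author else "",
            "price": price[0] if price else "",
        })
    return books
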
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import csv
import os
import time
import requests
import pymysql

# Connect to the local MySQL database that will hold the parsed records.
conn = pymysql.connect(
    user="root",
    password="plmoknijbuhv123.",
    host="localhost",
    database="kaoshi",
    port=3306,
)
cursor = conn.cursor()

# Open Dangdang's homepage with Selenium and search for "Python" books.
driver = Chrome()
url = "https://www.dangdang.com/"
driver.get(url)
search_input = driver.find_element(By.XPATH, '//*[@id="key_S"]')
search_input.send_keys("Python")
search_button = driver.find_element(By.XPATH, '//*[@id="form_search_new"]/input[10]')
search_button.click()

has_next = True
fd = open("html.csv", mode="a", encoding="utf-8", newline="")
writer = csv.writer(fd)  # csv.writer quotes fields that contain commas
sname = 1  # running book number, also used as the image file name

while has_next:
    time.sleep(2)  # let the result page finish loading before reading its URL
    new_url = driver.current_url
    my_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/114.0.0.0 Safari/537.36"
    }
    # Re-fetch the current result page with Requests and parse it with BeautifulSoup.
    response = requests.get(url=new_url, headers=my_headers)
    response.encoding = "GBK"  # Dangdang pages are GBK/GB2312 encoded
    soup = BeautifulSoup(response.text, "lxml")
    ul = soup.find("ul", class_="bigimg")
    lis = ul.find_all("li")

    # Create the image folder on first use.
    if not os.path.exists("download"):
        os.mkdir("download")

    for li in lis:
        name = li.find("a")["title"]
        try:
            zuoze = li.find("p", class_="search_book_author").find("a")["title"]
        except (AttributeError, TypeError, KeyError):
            zuoze = ""  # some entries carry no author link
        money = li.find("p", class_="price").find("span", class_="search_now_price").text
        riqi = li.find("p", class_="search_book_author").find_all("span")[1].text
        chuban = li.find("p", class_="search_book_author").find_all("span")[2].text
        jianjie = li.find("p", class_="detail").text

        # Only the first cover is loaded eagerly; the rest are lazy-loaded and
        # keep their real URL in the data-original attribute.
        img = li.find("a").find("img")
        src = img.get("data-original") or img["src"]
        img_url = "http:" + src
        print(sname, name, zuoze)

        # Save the cover image under a zero-padded five-digit file name.
        resp = requests.get(img_url, headers=my_headers)
        shu = "%05d" % sname
        with open("download/" + shu + ".jpg", "wb") as fp:
            fp.write(resp.content)

        # Append one record to the CSV file.
        writer.writerow([sname, name, zuoze, chuban, riqi, money, jianjie])

        # Parameterized INSERT: pymysql escapes the values itself, so
        # pymysql.converters.escape_string() is no longer needed.
        sql = "INSERT INTO kaoshi VALUES (%s, %s, %s, %s, %s, %s);"
        cursor.execute(sql, (name, zuoze, chuban, riqi, money, jianjie))
        conn.commit()
        sname = sname + 1

    # Follow the "next page" link until it disappears on the last page.
    try:
        next_link = driver.find_element(By.CLASS_NAME, "next").find_element(By.TAG_NAME, "a")
        next_link.click()
    except Exception:
        has_next = False

fd.close()
cursor.close()
conn.close()
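
Requirement (4) also allows MongoDB in place of MySQL. A minimal pymongo sketch, assuming a local mongod instance (the database and collection names kaoshi/books are placeholders, not something the program above defines), could replace the INSERT step inside the loop:

# MongoDB variant of the persistence step: a sketch assuming a local
# mongod; "kaoshi"/"books" are placeholder database/collection names.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
collection = client["kaoshi"]["books"]

def save_book(sname, name, zuoze, chuban, riqi, money, jianjie):
    # insert_one() takes the place of cursor.execute()/conn.commit()
    collection.insert_one({
        "no": sname,
        "title": name,
        "author": zuoze,
        "publisher": chuban,
        "date": riqi,
        "price": money,
        "summary": jianjie,
    })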