当当网排行榜爬虫+可视化分析
立即下载
资源介绍:
爬取当当网排行榜并连接本地nosql数据库,对数据进行数据清洗后进行可视化分析展示
from pymongo import MongoClient
from matplotlib.font_manager import FontProperties
import pandas as pd
from pyecharts.charts import Pie, Bar, Liquid, Page
from pyecharts import options as opts
import webbrowser
import os
from pyecharts.commons.utils import JsCode
import matplotlib.pyplot as plt
from pyecharts.charts import Line
# Open a client against the local MongoDB server, then select the database
# and the collection (a collection plays the role of a table).
client = MongoClient('mongodb://localhost:27017/')
db = client.get_database('dangdang_4213')
collection = db.get_collection('dangdang_4213')
# Query every document, materialise the cursor into a list and load the
# list into a DataFrame.
documents = collection.find()
data_list = list(documents)
df = pd.DataFrame(data_list)
#---------------可视化部分代码
def tranform_price(x):
    """Map a price to the label of the price bracket it falls in.

    The value is converted with ``float()``. Each bracket's upper bound is
    inclusive: ``<= 30`` -> '0~30元', ``<= 60`` -> '31~60元',
    ``<= 100`` -> '61~100元', ``<= 500`` -> '101~500元', anything larger
    -> '500以上'.

    Args:
        x: The price, as a number or a numeric string.

    Returns:
        The bracket label, or '未知价格' when ``x`` cannot be interpreted as
        a number (non-numeric text, ``None``, other non-convertible types,
        or NaN).
    """
    import math

    try:
        num = float(x)
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: None and other types that
        # float() rejects outright.
        return '未知价格'
    if math.isnan(num):
        # NaN fails every comparison below, so without this guard it would
        # fall through to the last bracket.
        return '未知价格'
    if num <= 30.0:
        return '0~30元'
    if num <= 60.0:
        return '31~60元'
    if num <= 100.0:
        return '61~100元'
    if num <= 500.0:
        return '101~500元'
    return '500以上'
# Bucket the list price of every row, then count the rows per bucket.
df['价格分级'] = df['原价'].apply(tranform_price)
price_1 = df['价格分级'].value_counts()
# (bucket label, count) pairs, with the counts converted to built-in ints.
datas_pair_1 = [(label, int(count)) for label, count in price_1.items()]
# Same computation for the discounted price.
df['售价价格分级'] = df['折扣价'].apply(tranform_price)
price_2 = df['售价价格分级'].value_counts()
datas_pair_2 = [(label, int(count)) for label, count in price_2.items()]
# ----------------------------- Pie chart: list-price brackets
# Ring-shaped pie (inner radius 35%, outer radius 60%) fed with the
# (label, count) pairs in datas_pair_1; the title sits in the ring's centre.
pie1 = Pie(init_opts=opts.InitOpts(width='1000px', height='600px'))
pie1.add('', datas_pair_1, radius=['35%', '60%'])
pie1.set_series_opts(label_opts=opts.LabelOpts(formatter="{b} : {d} %"))
pie1.set_global_opts(
    title_opts=opts.TitleOpts(
        title="当当网书籍\n\n原价价格区间",
        pos_left='center',
        pos_top='center',
        title_textstyle_opts=opts.TextStyleOpts(
            color='#007AC2',
            font_size=20,
            font_weight='bold',
        ),
    )
)
# Alternative palette kept for reference:
# ['#EF9050', '#3B7BA9', '#6FB27C', '#FFAF34', '#D8BFD8', '#00BFFF', '#7FFFAA']
pie1.set_colors(['#BAD1EE', '#83BFEA', '#00B1A6', '#00AFCB', '#007AC2', '#004B7E', '#63C0AB'])
# pie1.render(path="pie_yuan.html")
# webbrowser.open("file://" + os.path.realpath("pie_yuan.html"))
# ------------------------- Pie chart: discounted-price brackets
# Ring-shaped pie (inner radius 35%, outer radius 60%) fed with the
# (label, count) pairs in datas_pair_2; the title sits in the ring's centre.
pie2 = Pie(init_opts=opts.InitOpts(width='1000px', height='600px'))
pie2.add('', datas_pair_2, radius=['35%', '60%'])
pie2.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{d}%"))
pie2.set_global_opts(
    title_opts=opts.TitleOpts(
        title="当当网书籍\n\n折扣价价格区间",
        pos_left='center',
        pos_top='center',
        title_textstyle_opts=opts.TextStyleOpts(
            color='#007AC2',
            font_size=20,
            font_weight='bold',
        ),
    )
)
pie2.set_colors(['#7EA2AC', '#A8C5C5', '#BED7D4', '#E3D2C8', '#D3B0A4', '#B5987A', '#7EAD99'])
# pie2.render(path="pie_zhekou.html")
# webbrowser.open("file://" + os.path.realpath("pie_zhekou.html"))
# -------------------------------- Bar chart: number of books per publisher
# Count the non-null book titles of each publisher and keep the 20 publishers
# with the highest counts, largest first.
counts = df.groupby('出版社')['书名'].count().sort_values(ascending=False).head(20)
zhu = (
    Bar(init_opts=opts.InitOpts(height='600px', width='1000px'))
    .add_xaxis(counts.index.tolist())
    .add_yaxis(
        '出版社书籍数量',
        counts.values.tolist(),
        label_opts=opts.LabelOpts(is_show=True, position='top'),
        # Vertical gradient fill, evaluated by ECharts in the browser.
        itemstyle_opts=opts.ItemStyleOpts(
            color=JsCode("""new echarts.graphic.LinearGradient(
            0, 0, 0, 1,[{offset: 0,color: '#F8C6C4'}, {offset: 1,color: '#EB6D73'}])
            """
            )
        )
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title='各个出版社书籍数量柱状图'),
        # The categories on this axis are publishers (the groupby key), so
        # the axis is named accordingly.
        xaxis_opts=opts.AxisOpts(
            name='出版社',
            type_='category',
            axislabel_opts=opts.LabelOpts(
                rotate=30,  # tilt the labels
                font_size=12  # label font size
            ),
        ),
        # NOTE(review): the y-axis maximum is fixed at 290; a publisher with
        # a larger count would presumably be drawn clipped -- confirm against
        # the data.
        yaxis_opts=opts.AxisOpts(
            name='数量',
            min_=0,
            max_=290.0,
            splitline_opts=opts.SplitLineOpts(is_show=True, linestyle_opts=opts.LineStyleOpts(type_='dash'))
        ),
        tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross')
    )
    # Mark lines for the average, maximum and minimum of the series.
    .set_series_opts(
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_='average', name='均值'),
                opts.MarkLineItem(type_='max', name='最大值'),
                opts.MarkLineItem(type_='min', name='最小值'),
            ]
        )
    )
)
# zhu.render(path="bar_chubanshe.html")
# webbrowser.open("file://" + os.path.realpath("bar_chubanshe.html"))
# #-------------------------电子书水球图
# # 电子书版本占比
# per = df['电子书价格'].value_counts()['']/len(df)
#
# # 首先,统计空字符串的数量
# count_no_ebook = (df['电子书价格'] == "").sum()
# # 然后,计算总数
# total_count = len(df)
# # 接着,计算没有电子书版本的比例
# per = count_no_ebook / total_count
# print(f"没有电子书版本的比例是:{per * 100:.2f}%")
# c = (
# Liquid()
# .add("lq", [1-per], is_outline_show=False)
# .set_global_opts(title_opts=opts.TitleOpts(title="电子书版本占比"))
# )
# c.render_notebook()
# c.render(path="c_dianzishu.html")
# webbrowser.open("file://" + os.path.realpath("c_dianzishu.html"))
# ------------------------- Stacked bar: share of books with an e-book edition
# Rows whose e-book price is the empty string are counted as having no
# e-book edition.
count_no_ebook = df['电子书价格'].eq("").sum()
# Total number of rows.
total_count = len(df)
# Fractions of rows with and without an e-book edition.
per_with_ebook = (total_count - count_no_ebook) / total_count
per_no_ebook = count_no_ebook / total_count
# Single-category bar chart whose two series share one stack; the values are
# the fractions above expressed as percentages.
bar = Bar(init_opts=opts.InitOpts(width="800px", height="500px"))
bar.add_xaxis(["电子书版本占比"])
bar.add_yaxis(
    "有电子书版本",
    [per_with_ebook * 100],
    itemstyle_opts=opts.ItemStyleOpts(color="#FBD474"),
    stack="电子书",
)
bar.add_yaxis(
    "无电子书版本",
    [per_no_ebook * 100],
    itemstyle_opts=opts.ItemStyleOpts(color="#F6CE8C"),
    stack="电子书",
)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="电子书版本占比"),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=0)),
    yaxis_opts=opts.AxisOpts(
        max_=100,
        splitline_opts=opts.SplitLineOpts(is_show=True),
    ),
    tooltip_opts=opts.TooltipOpts(
        trigger="axis",
        axis_pointer_type="cross",
    ),
)
# 创建水球图
# liquid_data = [1 - per_no_ebook]
# c = (
# Liquid()
# .add("lq", liquid_data, is_outline_show=False)
# )
# Fetch at most 30 documents and materialise the cursor into a list.
documents = collection.find().limit(30)
data_list = list(documents)
# Book title of every fetched document.
titles = [doc['书名'] for doc in data_list]
# Recommendation values as floats; values containing '%' or not made up
# solely of digits are skipped.
# NOTE(review): because of this filter, `recommendations` can be shorter than
# `titles`, so the two lists are not guaranteed to line up index by index --
# confirm how they are consumed.
recommendations = [
    float(raw)
    for raw in (doc['推荐值'] for doc in data_list)
    if '%' not in raw and raw.isdigit()
]
# # 创建一个Line对象,即折线图对象
# line = Line(init_opts=opts.InitOpts(width="1200