Python TMDB电影数据集多维度关联规则分析(python商务大数据分析)
立即下载
资源介绍:
TMDB,全称为The Movie Database(电影数据库),是一个在线的电影数据库网站,它提供了关于电影、电视节目以及演员、导演等电影制作人员的详细信息。TMDB不仅包含了大量的电影数据,还提供了丰富的电影资讯、影评以及用户评分等功能,是电影爱好者和专业人士获取电影信息的重要渠道之一。
我们做的事情是:不局限于市面上对这个数据集的简单处理,而是深度解构了数据集:①对电影的不同属性进行关联规则分析;②根据电影分类、盈利与收益率的关联规则分析整体利润收益;③根据电影分类和收益率的关联规则分析低投入高收益的电影;④模拟随机观众对电影分类的评价,进行关联规则分析与聚类分析。
适用于python商务大数据需要属性构造和深度分析的场景。
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 18 17:56:46 2024
@author: 62561
"""
# 导入必要的库
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
# --- Data loading ---
movies = pd.read_csv('tmdb_5000_movies.csv')
movies.info()
movies.isnull().sum()
movies.describe()

# --- Preprocessing ---
# Peek at rows with a zero budget before deciding how to handle them.
movies.query('budget==0').head(2)
# Drop identifier / free-text columns not used by this analysis.
movies.drop(['homepage', 'id', 'keywords', 'tagline', 'overview',
             'spoken_languages', 'status'], axis=1, inplace=True)
movies.shape
movies.dropna(inplace=True)
movies.shape
movies.drop_duplicates(keep='first', inplace=True)
# Count zero-valued entries per column.
# Fixed message: original printed "零售数目" ("retail count"), a typo for
# "零值数目" (number of zero values), which is what is actually computed.
for col in movies.columns:
    zero = len(movies[movies[col] == 0])
    print('{},零值数目:{}'.format(col, zero))
# Rows with zero budget or revenue would corrupt the ReturnRate attribute
# constructed later, so they are removed here.
movies = movies.drop(index=movies.query('budget==0 or revenue==0').index)
movies.shape
movies.describe()
# Attribute construction 1: ReturnRate = 1 - budget / revenue
# (the fraction of revenue left after recouping the budget).
movies['ReturnRate'] = 1 - movies['budget'] / movies['revenue']
movies.describe()
# Summary statistics: pairwise correlation matrix of the numeric variables.
# numeric_only=True is required on pandas >= 2.0, where corr() no longer
# silently drops non-numeric (string) columns and raises instead.
alld = movies.corr(numeric_only=True)
# Correlation of revenue with the other numeric fields.
# Vectorised .dt.year accessor replaces the original per-row
# list comprehension over pd.to_datetime results.
movies['release_year'] = pd.to_datetime(movies['release_date']).dt.year
a = movies[['budget', 'popularity', 'vote_average', 'vote_count',
            'runtime', 'release_year', 'revenue']].corr()
# Top ten movies by revenue (bare expressions display in a notebook session).
top10=movies.sort_values('revenue',ascending = False).head(10)
top10[['budget','genres','original_title','release_date','revenue','runtime','vote_average','vote_count','popularity']]
# Distribution of release years.
movies['release_year'].value_counts()
movies['release_year'].hist(bins=25)
# Distribution of runtimes.
movies['runtime'].value_counts()
movies['runtime'].hist(bins=30)
# Distribution of budgets.
movies['budget'].hist(bins=15)
# Distribution of average vote scores.
movies['vote_average'].hist(bins=20)
# Distribution of vote counts.
movies['vote_count'].hist(bins=20)
# --- Factors influencing revenue ---
# Scatter plot of revenue against each candidate factor.
plt.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font for Chinese labels
list1 = ['budget', 'popularity', 'vote_count', 'vote_average', 'runtime', 'release_year']
fig, axes = plt.subplots(2, 3, figsize=(12, 8), dpi=70)
for i in range(2):
    for j in range(3):
        axes[i, j].scatter(x=list1[3*i+j], y='revenue', data=movies, s=3)
        axes[i, j].set_title(list1[3*i+j])
# Fixed typo in the displayed title: "要房" -> "票房" (box office).
fig.suptitle('影响电影票房的因素')
fig.show()
# --- Grouped analysis: revenue vs. factors across five-year periods ---
movies = movies[(movies['release_year'] > 2000) & (movies['release_year'] <= 2015)]
movies.shape
# Bin release years into three five-year periods.
edge = [2000, 2005, 2010, 2015]
moviesclass = ['2001-2005年', '2006-2010年', '2011-2015年']
movies['fiveyearclass'] = pd.cut(movies.loc[:, 'release_year'], edge,
                                 labels=moviesclass, include_lowest=True)
movies.head()
# Bin revenue into quartile-based classes.
edge = movies['revenue'].quantile([0, 0.25, 0.5, 0.75, 1]).values
moviesclass = ['Low', 'Medium', 'Moderately High', 'High']
movies['revenueclass'] = pd.cut(movies.loc[:, 'revenue'], edge,
                                labels=moviesclass, include_lowest=True)
movies.head()
# Median of each factor per (period, revenue-class) cell.
# numeric_only=True: pandas >= 2.0 raises a TypeError when median() meets
# non-numeric columns (genres, titles, ...) instead of dropping them.
d_summary = movies.groupby(['fiveyearclass', 'revenueclass']).median(numeric_only=True)
d_summary
list1 = ['budget', 'popularity', 'vote_count', 'vote_average', 'runtime', 'release_year']
# One bar position per revenue class (length of the first period's slice).
pos = list(range(len(d_summary[:'2001-2005年'])))
width = 0.2
fig, ax = plt.subplots(2, 3, figsize=(10, 5))
plt.subplots_adjust(wspace=0.4, hspace=0.5)
for i in range(2):
    for j in range(3):
        # Three side-by-side bars (one per period) for each revenue class.
        ax[i, j].bar(pos, d_summary.loc['2001-2005年'][list1[3*i+j]], width, label='2001-2005年')
        ax[i, j].bar([p + width for p in pos], d_summary.loc['2006-2010年'][list1[3*i+j]], width, label='2006-2010年')
        ax[i, j].bar([p + width*2 for p in pos], d_summary.loc['2011-2015年'][list1[3*i+j]], width, label='2011-2015年')
        ax[i, j].set_ylabel(list1[3*i+j])
        ax[i, j].set_xlabel('revenue')
        ax[i, j].set_title('revenue&' + list1[3*i+j])
        ax[i, j].set_xticks([p + 1.5 * width for p in pos])
        ax[i, j].set_xticklabels(['低', '中下', '中上', '高'])
        ax[i, j].legend()
        ax[i, j].grid()
# Analyse how movie genres relate to revenue.
def counttype(movies):
    """Count how often each genre name occurs in ``movies.genres``.

    Accepts genre cells either as raw JSON strings (as read from the TMDB
    CSV) or as already-parsed lists of ``{'name': ...}`` dicts, so it keeps
    working after the later ``json.loads`` conversion of the column.

    Returns a dict mapping genre name -> number of movies with that genre.
    """
    import json  # local import: the file-level json import appears later
    moviesgeners = {}
    for cell in movies.genres:
        # json.loads replaces the original eval(): identical result on
        # TMDB's JSON-formatted genre strings, without executing code.
        genre_list = json.loads(cell) if isinstance(cell, str) else cell
        for genre in genre_list:
            name = genre['name']
            moviesgeners[name] = moviesgeners.get(name, 0) + 1
    return moviesgeners
# Bar chart of how many movies fall into each genre, most frequent first.
plt.figure(figsize=(10,5))
genre_counts = sorted(counttype(movies).items(), key=lambda kv: kv[1], reverse=True)
names = [name for name, _ in genre_counts]
counts = [count for _, count in genre_counts]
plt.bar(names, counts)
plt.xticks(rotation=45)
plt.xlabel("电影类型")
plt.ylabel("电影数量")
plt.title("电影类型分布条形图")
plt.show()
# Genre counts within the 100 most popular movies.
top100 = movies.sort_values('popularity', ascending=False).head(100)
moviesgeners = counttype(top100)
moviesgeners
# Top-5 genres within each revenue class, one subplot per class.
moviesclass = ['Low', 'Medium', 'Moderately High', 'High']
groupmovies = movies.groupby('revenueclass')
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
plt.subplots_adjust(wspace=0.2, hspace=0.2)
# Fill panels row by row: class 0 -> (0,0), 1 -> (0,1), 2 -> (1,0), 3 -> (1,1).
for idx, label in enumerate(moviesclass):
    panel = axes[idx // 2, idx % 2]
    class_movies = groupmovies.get_group(label)
    top5 = sorted(counttype(class_movies).items(), key=lambda kv: kv[1], reverse=True)[:5]
    panel.bar([name for name, _ in top5], [count for _, count in top5], width=0.5)
    panel.title.set_text(label)
import pandas as pd
import json
import random
from efficient_apriori import apriori
# Convert the JSON-encoded genres column into Python lists of dicts.
# NOTE(review): assumes genres are still raw JSON strings at this point;
# re-running this line would raise, since parsed lists are not valid JSON input.
movies['genres'] = movies['genres'].apply(json.loads)
# Simulate per-user viewing transactions, weighting picks by popularity.
def generate_transactions(movies, num_users=200):
    """Build one genre-set "transaction" per simulated user.

    Each user watches a popularity-weighted random sample of 1-10 movies;
    the transaction is the list of distinct genre names they saw.
    """
    transactions = []
    for _ in range(num_users):
        n_watched = random.randint(1, 10)
        picks = movies.sample(frac=1, weights='popularity').head(n_watched)
        seen = set()
        for genre_list in picks['genres']:
            seen.update(genre['name'] for genre in genre_list)
        transactions.append(list(seen))
    return transactions
# Seed both RNGs: random drives randint(), while pandas' sample() draws from
# numpy's global RNG — seeding numpy too makes the simulation reproducible
# (the original seeded only `random`, leaving sample() nondeterministic).
random.seed(42)
np.random.seed(42)
transactions = generate_transactions(movies)
# Association-rule mining over the simulated transactions (efficient-apriori).
min_support = 0.5
min_confidence = 0.9
itemsets, rules = apriori(transactions, min_support=min_support, min_confidence=min_confidence)
# Report each rule "LHS -> RHS" with its confidence.
for rule in rules:
    lhs = ', '.join(rule.lhs)  # antecedent itemset
    rhs = ', '.join(rule.rhs)  # consequent itemset
    print(f"If a user watches {lhs}, they are likely to also watch {rhs} (confidence: {rule.confidence:.2f})")
import networkx as nx
import matplotlib.pyplot as plt
from efficient_apriori import apriori
# 假设 transactions 是之前代码中准备好的交易数据列表
# itemsets, rules = apriori(transactions, min_support=0.05, min_confidence=0.7)
# 函数来绘制关联规则网络图
def plot_rules(rules):
G = nx.DiGraph()
# 添加节点和边
for rule in rules:
antecedents = rule.lhs # 使用 lhs (left-hand side) 获取前件
consequents = rule.rhs # 使用 rhs (right-hand side) 获取后件
for antecedent in antecedents:
G.add_node(antecedent, label=antecedent)
for consequent in consequents:
G.add_node(consequent, label=consequent)
for antecedent in antecedents:
for consequent in consequents:
G.add_edge(antecedent, consequent, weight=rule.confidence)
#