这个基于Python的个性化电影推荐系统是我在指导计算机专业学生毕业设计时经常采用的经典案例。它完美融合了爬虫技术、数据处理、推荐算法和Web开发等多个热门技术方向,特别适合作为本科或研究生阶段的综合实践项目。
系统采用B/S架构,前端使用Vue.js构建响应式界面,后端基于Django框架开发,数据存储选用MySQL关系型数据库。整个系统的技术栈选择综合考虑了文档完善度、社区活跃度与开发效率等关键因素。
提示:对于毕业设计项目,建议选择技术文档丰富、社区活跃的技术栈,这样在开发过程中遇到问题更容易找到解决方案。
系统的核心创新点在于将基于内容的推荐与协同过滤进行混合,并结合带时间衰减的用户行为权重实时调整推荐结果。
系统采用经典的三层架构设计,各层职责明确:
<!-- MovieCard: shows a movie's poster, title and rating, plus a "like"
     button. The card is stateless — it emits `on-like` with the movie id
     and lets the parent own the state change. -->
<template>
  <div class="movie-card">
    <el-image :src="movie.poster" fit="cover"></el-image>
    <div class="movie-info">
      <h3>{{ movie.title }}</h3>
      <!-- NOTE(review): el-rate renders a 0-5 star widget, while scraped
           douban ratings are on a 1-10 scale (see validate_data) —
           confirm the value is normalized before reaching this card. -->
      <el-rate v-model="movie.rating" disabled></el-rate>
      <el-button @click="handleLike">收藏</el-button>
    </div>
  </div>
</template>

<script>
export default {
  props: ['movie'],
  methods: {
    // Bubble the action up; the parent decides what "like" means.
    handleLike() {
      this.$emit('on-like', this.movie.id)
    }
  }
}
</script>
import time

from rest_framework.views import APIView
from rest_framework.response import Response


class RecommendAPI(APIView):
    """GET endpoint returning hybrid recommendations for the logged-in user."""

    def get(self, request):
        user_id = request.user.id
        # BUG FIX: the original called content_based_recommend(user_id)
        # although that function's first parameter is a *movie* id, and
        # passed the two partial results into hybrid_recommend, whose
        # signature is (user_id, top_n). hybrid_recommend already combines
        # both strategies internally, so call it directly.
        hybrid = hybrid_recommend(user_id)
        return Response({
            'recommendations': hybrid,
            'timestamp': time.time()
        })
from django.db import models


class Movie(models.Model):
    """Movie metadata scraped from douban, keyed by the douban page id."""

    douban_id = models.CharField(max_length=20, unique=True)
    title = models.CharField(max_length=200)
    directors = models.CharField(max_length=300)
    casts = models.TextField()
    # Stored as JSON — typically a list of genre strings.
    genres = models.JSONField()
    rating = models.FloatField()

    class Meta:
        db_table = 'movie'
        indexes = [
            models.Index(fields=['rating']),
            # BUG FIX: the original declared models.Index on `genres`, but
            # MySQL cannot build a plain B-tree index on a JSON column, so
            # the migration would fail. Filter genres through a generated
            # column or a separate Genre relation instead.
        ]
系统数据流依次经过爬虫采集、数据清洗、推荐算法计算和接口返回等环节。
注意:在实际部署时,建议将爬虫服务与Web服务分离,避免爬虫任务影响用户请求响应时间。
爬虫项目采用Scrapy框架,关键配置如下:
# settings.py — Scrapy crawler configuration.

BOT_NAME = 'movie_spider'

# Identify the bot honestly in the User-Agent string.
USER_AGENT = 'Mozilla/5.0 (compatible; MovieRecBot/1.0)'

# NOTE(review): ignoring robots.txt may violate the target site's terms of
# service — confirm this is acceptable before deploying the crawler.
ROBOTSTXT_OBEY = False

# Throttling: 2 s between requests per site, at most 16 requests in flight.
DOWNLOAD_DELAY = 2
CONCURRENT_REQUESTS = 16

# NOTE(review): the pipeline is named MongoDBPipeline although the system
# stores data in MySQL (see the Movie model) — verify the class name/path.
ITEM_PIPELINES = {
    'pipelines.MongoDBPipeline': 300,
}
为提高爬取效率,我们使用Scrapy-Redis构建分布式爬虫:
# Requires: pip install scrapy-redis
from scrapy_redis.spiders import RedisSpider


class DoubanSpider(RedisSpider):
    """Distributed douban spider; workers pop start URLs from Redis."""

    name = 'douban'
    # Redis list key that seeds the crawl (push URLs here to start).
    redis_key = 'douban:start_urls'

    def parse(self, response):
        # BUG FIX: the original yielded an undefined name `item` (NameError).
        # Build the item from the response before yielding.
        # NOTE(review): selectors below are placeholders — align them with
        # the real douban page structure and the pipeline's expected fields.
        item = {
            'douban_id': response.url.rstrip('/').rsplit('/', 1)[-1],
            'title': response.css('h1 span::text').get(),
        }
        yield item
针对不同类型字段采用差异化处理:
import pandas as pd


def clean_movie_data(df):
    """Fill missing values in a scraped movie DataFrame, per column type.

    Numeric columns are KNN-imputed, text columns get a sentinel value,
    and categorical columns fall back to the mode. Returns the (mutated)
    DataFrame.
    """
    # Numeric fields: impute from the 5 nearest neighbours.
    from sklearn.impute import KNNImputer
    num_cols = ['rating', 'duration']
    imputer = KNNImputer(n_neighbors=5)
    df[num_cols] = imputer.fit_transform(df[num_cols])

    # Text fields: explicit sentinel so downstream code never sees NaN.
    df['directors'] = df['directors'].fillna('未知导演')

    # Categorical fields: fill with the mode.
    # BUG FIX: mode() returns an empty Series when the whole column is
    # NaN — the original indexed [0] unconditionally (IndexError). Also,
    # genres is JSON and may hold lists; pd.isna() on a list yields an
    # array, making the original lambda's `if` ambiguous. Only treat
    # scalar NaN as missing.
    modes = df['genres'].mode()
    if not modes.empty:
        fill_value = modes[0]
        df['genres'] = df['genres'].apply(
            lambda v: fill_value
            if (not isinstance(v, (list, dict)) and pd.isna(v)) else v)
    return df
清洗完成后进行数据质量检查:
from datetime import datetime

import pandas as pd


def validate_data(df):
    """Post-cleaning sanity checks; raises ValueError on invalid data.

    BUG FIX: the original used bare `assert`, which is silently stripped
    when Python runs with -O — explicit raises survive optimization and
    carry a diagnostic message.
    """
    # Douban ratings are on a 1-10 scale.
    if not df['rating'].between(1, 10).all():
        raise ValueError('rating out of range [1, 10]')
    # Release years must be plausible (cinema begins around 1900).
    current_year = datetime.now().year
    if not df['year'].between(1900, current_year).all():
        raise ValueError('year out of range [1900, current year]')
    # douban_id is the natural key; duplicates indicate a crawler bug.
    if df['douban_id'].nunique() != len(df):
        raise ValueError('duplicate douban_id values')
系统采用三种推荐算法组合:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def content_based_recommend(movie_id, top_n=5):
    """Return the top_n movies most similar to `movie_id`.

    Similarity is cosine similarity over a TF-IDF matrix built from each
    movie's genres + directors text. Returns the matching DataFrame rows,
    most similar first; an empty frame if the movie id is unknown.
    """
    movies = Movie.objects.all().values()
    df = pd.DataFrame.from_records(movies)

    # BUG FIX: genres is a JSONField (typically a list of strings), so the
    # original `df['genres'] + ' ' + df['directors']` raised on list
    # values. Flatten genres to plain text first.
    genres_text = df['genres'].apply(
        lambda g: ' '.join(g) if isinstance(g, (list, tuple)) else str(g))
    corpus = genres_text + ' ' + df['directors']

    # NOTE(review): stop_words='english' has no effect on Chinese text —
    # a Chinese tokenizer / stop list would be more appropriate here.
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(corpus)
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Locate the anchor movie; bail out gracefully instead of crashing on
    # an unknown id (the original indexed [0] unconditionally).
    matches = df.index[df['id'] == movie_id]
    if len(matches) == 0:
        return df.iloc[[]]
    idx = matches[0]

    # Rank by similarity, skipping index 0 (the movie itself).
    sim_scores = sorted(enumerate(cosine_sim[idx]),
                        key=lambda x: x[1], reverse=True)[1:top_n + 1]
    movie_indices = [i for i, _ in sim_scores]
    return df.iloc[movie_indices]
# BUG FIX: Reader was used below but never imported in the original.
from surprise import Dataset, KNNBasic, Reader


def collaborative_filtering(user_id, top_n=5):
    """Item-based CF: top_n movie ids ranked by predicted rating for user_id."""
    ratings = Rating.objects.all().values()
    df = pd.DataFrame.from_records(ratings)

    # Site ratings use a 1-5 scale (distinct from the 1-10 douban scale
    # on scraped metadata).
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'movie_id', 'rating']], reader)
    # Train on all ratings. The original split off a test set and then
    # immediately overwrote it, so the split was dead code.
    trainset = data.build_full_trainset()

    # Item-based cosine similarity.
    algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    algo.fit(trainset)

    # Score only movies this user has NOT rated — recommending titles the
    # user already rated is pointless (the original scored every movie).
    seen = set(df.loc[df['user_id'] == user_id, 'movie_id'])
    candidates = [[user_id, movie.id, 4.]
                  for movie in Movie.objects.all() if movie.id not in seen]
    predictions = algo.test(candidates)
    predictions.sort(key=lambda x: x.est, reverse=True)
    return [pred.iid for pred in predictions[:top_n]]
def hybrid_recommend(user_id, top_n=10):
    """Hybrid recommendation: content-based + collaborative filtering.

    Falls back to globally top-rated movies for users with no rating
    history. Always returns a list of movie ids (the original returned a
    QuerySet on the cold-start path and mixed types on the warm path).
    """
    last_rated = Rating.objects.filter(
        user_id=user_id
    ).order_by('-timestamp').first()
    if last_rated is None:
        # Cold start: most popular titles, materialized to ids so the
        # return type matches the warm path.
        return [m.id for m in Movie.objects.order_by('-rating')[:top_n]]

    # BUG FIX: content_based_recommend returns DataFrame rows; the
    # original `content_rec + cf_rec` added a DataFrame to a list, which
    # raises. Extract the ids first.
    content_rec = list(
        content_based_recommend(last_rated.movie_id, top_n // 2)['id'])
    cf_rec = collaborative_filtering(user_id, top_n // 2)

    # De-duplicate while preserving ranking order (set() scrambled it).
    return list(dict.fromkeys(content_rec + cf_rec))
为提高推荐实时性,系统采用以下策略:
def get_user_interest(user_id):
    """Aggregate a user's action weights per movie, with time decay.

    Returns a plain dict mapping movie_id -> accumulated interest score,
    where recent actions contribute more than old ones.
    """
    reference = time.time()
    scores = defaultdict(float)
    for record in UserAction.objects.filter(user_id=user_id):
        # Decay factor shrinks as the action ages:
        # weight / (1 + log1p(age_in_seconds)).
        decay = 1 / (1 + math.log1p(reference - record.timestamp))
        scores[record.movie_id] += record.weight * decay
    return dict(scores)
from django.core.cache import cache

# Sentinel distinguishing "not cached" from a cached falsy value.
_CACHE_MISS = object()


def get_recommendations(user_id):
    """Per-user recommendation list, cached for one hour."""
    cache_key = f'rec_{user_id}'
    # BUG FIX: the original `if not result` treated a legitimately cached
    # empty list as a miss, recomputing the recommendations on every
    # request. Use a sentinel so only true cache misses recompute.
    result = cache.get(cache_key, _CACHE_MISS)
    if result is _CACHE_MISS:
        result = hybrid_recommend(user_id)
        cache.set(cache_key, result, timeout=3600)  # cache for 1 hour
    return result
推荐使用Docker Compose部署整套系统:
# docker-compose.yml — web (Django via gunicorn) + Redis + MySQL.
version: '3'
services:
  web:
    build: .
    command: gunicorn movie_rec.wsgi:application --bind 0.0.0.0:8000
    volumes:
      - .:/code
    ports:
      - "8000:8000"
    depends_on:
      - redis
      - db
  redis:
    image: redis:alpine
    ports:
      - "6379:6379"
  db:
    image: mysql:8.0
    environment:
      # NOTE(review): a hard-coded root password belongs in an env file or
      # secret store, not in version control.
      MYSQL_ROOT_PASSWORD: password
      MYSQL_DATABASE: movie_rec
    volumes:
      - db_data:/var/lib/mysql
    ports:
      # NOTE(review): publishing 3306 exposes MySQL on the host network;
      # drop this mapping in production — the web service reaches the DB
      # over the internal compose network by service name.
      - "3306:3306"
volumes:
  db_data:
# Query optimization examples.

# select_related() performs a SQL join so the related row is fetched in
# the same query instead of one query per movie.
# NOTE(review): select_related only works on relations (ForeignKey /
# OneToOne); the Movie model above stores `directors` as a CharField, so
# this example presumes a `director = models.ForeignKey(...)` field —
# confirm against the actual schema.
movies = Movie.objects.select_related('director').filter(rating__gt=8)


# Index frequently-filtered columns (fragment of a model's Meta class).
class Meta:
    indexes = [
        models.Index(fields=['rating']),
        models.Index(fields=['release_date']),
    ]
<!-- Virtual scrolling (vue-virtual-scroller's RecycleScroller) keeps long
     movie lists responsive: only the rows inside the viewport are
     rendered, and row components are recycled as the user scrolls. -->
<template>
  <RecycleScroller
    class="movie-list"
    :items="movies"
    :item-size="200"
    key-field="id"
  >
    <template v-slot="{ item }">
      <MovieCard :movie="item" />
    </template>
  </RecycleScroller>
</template>
@app.task
def run_spider():
    """Celery task that launches the douban Scrapy crawler."""
    import subprocess
    # BUG FIX: os.system() with a shell string silently ignores failures
    # and invites shell injection; run the command as an argv list and
    # raise on a non-zero exit so Celery records the failure.
    subprocess.run(['scrapy', 'crawl', 'douban'], check=True)


# Beat schedule: crawl nightly at 03:00 (low-traffic window, see the
# deployment note about isolating crawler load from web traffic).
CELERY_BEAT_SCHEDULE = {
    'run-spider-every-night': {
        'task': 'tasks.run_spider',
        'schedule': crontab(hour=3, minute=0),
    },
}
这个基础系统可以进一步扩展为:
# Visual feature extraction with a pretrained CNN (transfer learning).
# BUG FIX: the original snippet used np, load_img, img_to_array and
# preprocess_input without importing any of them — it would fail with
# NameError on first call. Import everything explicitly.
import numpy as np
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Loaded once at import time. include_top=False drops the classifier head,
# so predict() yields convolutional feature maps instead of class scores.
model = VGG16(weights='imagenet', include_top=False)


def extract_features(image_path):
    """Return a flat VGG16 feature vector for the image at `image_path`."""
    # VGG16 expects 224x224 RGB input.
    img = load_img(image_path, target_size=(224, 224))
    batch = np.expand_dims(img_to_array(img), axis=0)
    features = model.predict(preprocess_input(batch))
    return features.flatten()
知识图谱整合:
构建电影-演员-导演关系图谱,实现更智能的推荐
A/B测试框架:
def recommend_with_abtest(user_id):
    """Route users into A/B buckets by id parity.

    Even ids (bucket A) get the traditional hybrid recommender; odd ids
    (bucket B) get the experimental deep-learning recommender.
    """
    in_control_group = user_id % 2 == 0
    return (hybrid_recommend(user_id) if in_control_group
            else deep_learning_recommend(user_id))
这个项目从技术选型到算法实现都体现了现代Web开发的典型模式,既适合作为学习项目掌握全栈开发技能,也具备商业应用的潜力。在实际开发过程中,我建议学生重点关注数据质量管理和推荐算法优化这两个核心环节,它们往往决定了推荐系统的最终效果。