movieLens推荐电影(二)

数据集与上篇文章相同。本篇文章使用的是基于用户的推荐算法。

思路也与上篇大致一样。

只是先求出每个用户的相似用户,再根据相似用户评价过的电影,对评分进行排序即得推荐的电影。

与基于物品推荐的不同点

过程:

  1. 遍历相似的用户
  2. 得到每个相似用户所评价过的电影
  3. 以相似用户的相关系数为权重,计算每部电影的加权平均评分
  4. 取评分前k个
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    # 遍历相似的用户 {用户:相关系数}
    for u, upear in matchNear[user]:
    # {电影:评分}
    for mov, mpear in data[u].items():
    if mov in userRating.keys():
    continue
    if mov not in scores.keys():
    scores[mov] = upear * mpear
    totalSco[mov] = upear
    scores[mov] += upear * mpear
    totalSco[mov] += upear

输出结果

输入的用户id与上篇文章一致。可以看到,不同的推荐算法推荐出的电影也不相同。

1
2
3
4
5
6
7
8
9
➜ movieLens git:(master) ✗ python movielens2.py
input userid:1
near: ["Dante's Peak (1997)", 'Thousand Acres, A (1997)', 'Murder at 1600 (1997)', 'Spice World (1997)', 'Scream 2 (1997)']
input userid:12
near: ['Usual Suspects, The (1995)', 'Once Were Warriors (1994)', 'Star Maps (1997)', 'Chinatown (1974)', 'Fallen (1998)']
input userid:13
near: ['Laura (1944)', 'Reservoir Dogs (1992)', 'Mrs. Dalloway (1997)', 'Four Days in September (1997)', 'Third Man, The (1949)']
input userid:exit
➜ movieLens git:(master) ✗

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
# !/usr/bin/env python
import sys
import numpy as np
import pandas as pd
def loadTrainSet(item_path="data/ml-100k/u.item", data_path="data/ml-100k/u.data"):
    """Load the movie titles and per-user ratings from two text files.

    Args:
        item_path: Path to a '|'-separated file whose first two fields are
            the movie id and the movie title. Read as ISO-8859-1.
        data_path: Path to a tab-separated file whose first three fields are
            the user id, the movie id and the integer rating.

    Returns:
        A tuple ``(movieList, userInfo)`` where ``movieList`` maps movie id
        (str) to title and ``userInfo`` maps user id (str) to a dict of
        ``{movie title: rating (int)}``.

    Raises:
        KeyError: if a rating refers to a movie id missing from ``item_path``.
    """
    movieList = {}
    # `with` guarantees the handle is closed; iterating the file object
    # streams it line by line instead of materializing it via readlines().
    with open(item_path, encoding='ISO-8859-1') as item_file:
        for line in item_file:
            if not line.strip():
                # A blank (e.g. trailing) line would break the unpacking below.
                continue
            movieId, title = line.split('|')[0:2]
            movieList[movieId] = title

    userInfo = {}
    with open(data_path) as rating_file:
        for line in rating_file:
            if not line.strip():
                continue
            uid, mid, rating = line.split('\t')[0:3]
            # Ratings are keyed by title rather than by movie id.
            userInfo.setdefault(uid, {})[movieList[mid]] = int(rating)
    return movieList, userInfo
def pearson(data, user1, user2):
    """Return the Pearson correlation between two users' ratings.

    Only movies rated by both users are taken into account.

    Args:
        data: Mapping ``{user: {movie: rating}}``.
        user1: Key of the first user in ``data``.
        user2: Key of the second user in ``data``.

    Returns:
        The correlation coefficient, or 0 when the users share no movie or
        when either user's ratings over the shared movies have no variance
        (the coefficient is undefined in that case).

    Raises:
        KeyError: if ``user1`` or ``user2`` is not in ``data``.
    """
    ratings1 = data[user1]
    ratings2 = data[user2]
    common = [mov for mov in ratings1 if mov in ratings2]
    n = len(common)
    if n == 0:
        return 0

    xs = [ratings1[mov] for mov in common]
    ys = [ratings2[mov] for mov in common]

    # Sums, sums of squares and sum of products over the shared movies.
    sum_x = sum(xs)
    sum_y = sum(ys)
    sum_x2 = sum(x ** 2 for x in xs)
    sum_y2 = sum(y ** 2 for y in ys)
    sum_xy = sum(x * y for x, y in zip(xs, ys))

    num = sum_xy - (sum_x * sum_y) / n
    var_x = sum_x2 - sum_x ** 2 / n
    var_y = sum_y2 - sum_y ** 2 / n
    # Both terms are >= 0 mathematically, but float rounding can push one of
    # them slightly below zero, which would make the square root NaN (or give
    # a meaningless tiny denominator). Treat that like zero variance.
    if var_x <= 0 or var_y <= 0:
        return 0
    return num / np.sqrt(var_x * var_y)
def topRating(data, user, k = 5):
    """Return the ``k`` users most similar to ``user``.

    Similarity is the value returned by ``pearson`` for the pair.

    Args:
        data: Mapping ``{user: {movie: rating}}``.
        user: The user whose neighbours are wanted; excluded from the result.
        k: Maximum number of neighbours to return.

    Returns:
        A list of ``(other_user, similarity)`` tuples ordered by descending
        similarity, at most ``k`` long. Ties keep the iteration order of
        ``data`` because the sort is stable.
    """
    similarities = {
        other: pearson(data, user, other)
        for other in data
        if other != user
    }
    ranked = sorted(similarities.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:k]
def NearUserList(data, k=5):
    """Build the nearest-neighbour table for every user in ``data``.

    Args:
        data: Mapping ``{user: {movie: rating}}``.
        k: Number of neighbours to keep per user (defaults to 5, the value
            that was previously hard-coded).

    Returns:
        A dict mapping each user to the list returned by ``topRating`` for
        that user.
    """
    return {u: topRating(data, u, k) for u in data}
def getRecommendMov(data, matchNear, user, k=5):
    """Recommend up to ``k`` movies for ``user`` from similar users' ratings.

    Each movie the user has not rated yet is scored by the similarity-weighted
    average of the ratings given by the user's neighbours.

    Args:
        data: Mapping ``{user: {movie: rating}}``.
        matchNear: Mapping ``{user: [(similar_user, similarity), ...]}``.
        user: The user to recommend for.
        k: Maximum number of movies to return.

    Returns:
        A list of movie names ordered by descending predicted rating. It may
        be shorter than ``k`` when fewer candidates exist. If ``user`` is not
        in ``data``, prints 'No User' and returns 0.
    """
    try:
        userRating = data[user]
    except KeyError:
        print('No User')
        return 0

    scores = {}    # movie -> sum of similarity * rating
    totalSco = {}  # movie -> sum of similarity

    for u, upear in matchNear[user]:
        # Neighbours with zero/negative (or NaN) similarity carry no useful
        # signal and would push the weighted average outside the rating range.
        if not upear > 0:
            continue
        for mov, rating in data[u].items():
            if mov in userRating:
                # Only recommend movies the user has not rated yet.
                continue
            # Accumulate exactly once per neighbour; initialising and then
            # adding again would count the first neighbour twice.
            scores[mov] = scores.get(mov, 0) + upear * rating
            totalSco[mov] = totalSco.get(mov, 0) + upear

    rankings = [(scores[mov] / totalSco[mov], mov) for mov in scores]
    rankings.sort(key=lambda x: x[0], reverse=True)
    # Slice instead of indexing range(k): fewer than k candidates is fine.
    return [mov for _, mov in rankings[:max(k, 0)]]
def movielensClass():
    """Load the data set and precompute every user's nearest neighbours.

    Returns:
        A tuple ``(matchNear, userInfo)``: the neighbour table produced by
        ``NearUserList`` and the per-user ratings produced by
        ``loadTrainSet``.
    """
    # The movie-id -> title table is not needed after loading.
    _, userInfo = loadTrainSet()
    matchNear = NearUserList(userInfo)
    return matchNear, userInfo
if __name__ == '__main__':
    # Interactive loop: read a user id, print the recommended movies, and
    # stop on 'exit' or end of input.
    matchNear, userInfo = movielensClass()
    while True:
        try:
            userid = input("input userid:").strip()
        except EOFError:
            # stdin was closed (e.g. piped input ran out); leave quietly
            # instead of crashing with a traceback.
            break
        if userid == 'exit':
            break
        near = getRecommendMov(userInfo, matchNear, userid)
        print('near:', near)