基于用户的协同过滤算法

推荐电影。

读取数据

文件格式: 每行一条记录,以逗号分隔,依次为 用户,评分,电影…

1
2
3
4
5
6
7
# Load the ratings file into a nested mapping:
#     users[user_id][item_id] = score
# Each record is one comma-separated line: user id, score, item id.
users = {}
with open("uid_score_bid", "r") as fp:  # the context manager closes the file
    for line in fp:
        record = line.strip()
        if not record:
            # A blank line (e.g. a trailing newline) carries no rating; skip it
            # instead of failing with IndexError on the missing fields.
            continue
        fields = record.split(",")
        users.setdefault(fields[0], {})[fields[2]] = float(fields[1])

算法实现

利用皮尔逊相关系数计算公式 计算用户间的相似距离

  1. 计算出user与其他所有用户的相似度
  2. 将与user最相近的k个人中user没有看过的书推荐给user,分数排名
  3. 第i个人与user的相似度,转换到[0,1]之间
  4. 第i个用户看过的书和相应的打分
  5. 排序
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    class recommender:
        """User-based collaborative-filtering recommender.

        Attributes:
            data: nested dict of ratings, ``data[user][item] = score``.
            k: number of nearest neighbours consulted when recommending.
            n: maximum number of recommendations returned.
            metric: name of the similarity measure (only ``'pearson'``).
            fn: the similarity function selected by ``metric``.
        """

        def __init__(self, data, k=3, metric='pearson', n=5):
            """Store the configuration and select the similarity function.

            Args:
                data: ratings as ``{user: {item: score}}``.
                k: number of nearest neighbours to use.
                metric: similarity measure; only ``'pearson'`` is supported.
                n: number of items to recommend.

            Raises:
                ValueError: if ``metric`` is not a supported measure.
                TypeError: if ``data`` is not a dict.
            """
            self.k = k
            self.n = n
            self.metric = metric
            if self.metric == "pearson":
                self.fn = self.pearson
            else:
                # Fail here rather than with an AttributeError on self.fn later.
                raise ValueError("unsupported metric: %r" % (metric,))
            if not isinstance(data, dict):
                # Fail here rather than with an AttributeError on self.data later.
                raise TypeError("data must be a dict of {user: {item: score}}")
            self.data = data

        def pearson(self, rating1, rating2):
            """Return the Pearson correlation of two users' ratings.

            Only items present in both mappings are compared.

            Args:
                rating1: ``{item: score}`` for the first user.
                rating2: ``{item: score}`` for the second user.

            Returns:
                The correlation coefficient, or 0 when the users share no item
                or when either user's shared ratings have zero variance.
            """
            # Local import: no import of sqrt is visible elsewhere in the file.
            from math import sqrt

            sum_xy = 0
            sum_x = 0
            sum_y = 0
            sum_x2 = 0
            sum_y2 = 0
            n = 0
            for key, x in rating1.items():
                if key not in rating2:
                    continue
                y = rating2[key]
                n += 1
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += x * x
                sum_y2 += y * y
            if n == 0:
                return 0
            # Rounding can push a zero variance slightly below zero, which would
            # make sqrt() raise "math domain error"; clamp at zero.
            spread_x = max(0.0, sum_x2 - sum_x * sum_x / n)
            spread_y = max(0.0, sum_y2 - sum_y * sum_y / n)
            denominator = sqrt(spread_x) * sqrt(spread_y)
            if denominator == 0:
                return 0
            return (sum_xy - (sum_x * sum_y) / n) / denominator

        def computeNearestNeighbor(self, username):
            """Rank every other user by similarity to ``username``.

            Args:
                username: key of the target user in ``self.data``.

            Returns:
                A list of ``(user, similarity)`` tuples, most similar first.

            Raises:
                KeyError: if ``username`` is not in ``self.data``.
            """
            target = self.data[username]
            distances = [
                (other, self.fn(target, ratings))
                for other, ratings in self.data.items()
                if other != username
            ]
            distances.sort(key=lambda pair: pair[1], reverse=True)
            return distances

        def recommend(self, user):
            """Recommend items that ``user`` has not rated yet.

            Each of the (up to) ``k`` most similar users contributes its
            ratings, weighted by its share of the neighbours' total similarity.

            Args:
                user: key of the target user in ``self.data``.

            Returns:
                A tuple ``(recommendations, nearest)``: ``recommendations`` is a
                list of at most ``n`` ``(item, score)`` tuples, best first, and
                ``nearest`` is the full ranked list of ``(user, similarity)``.

            Raises:
                KeyError: if ``user`` is not in ``self.data``.
            """
            nearest = self.computeNearestNeighbor(user)
            userRatings = self.data[user]
            # Slice instead of indexing 0..k-1 so that having fewer than k
            # other users does not raise IndexError.
            neighbors = nearest[:self.k]
            totalDistance = sum((score for _, score in neighbors), 0.0)
            if totalDistance == 0.0:
                # Avoid dividing by zero when the similarities sum to nothing.
                totalDistance = 1.0
            recommendations = {}
            for name, similarity in neighbors:
                # NOTE: weights fall in [0, 1] only when the neighbours'
                # similarities are all non-negative.
                weight = similarity / totalDistance
                for item, rating in self.data[name].items():
                    if item not in userRatings:
                        recommendations[item] = (
                            recommendations.get(item, 0.0) + rating * weight
                        )
            ranked = sorted(
                recommendations.items(),
                key=lambda pair: pair[1],
                reverse=True,
            )
            return ranked[:self.n], nearest

主函数

1
2
3
4
5
6
7
8
9
10
11
12
def adjustrecommend(id):
    """Return recommended item ids and the closest users for one user.

    Builds a ``recommender`` over the module-level ``users`` ratings with its
    default settings and queries it for the given user.

    Args:
        id: identifier of the target user; it is converted to ``str`` before
            the lookup. (The name shadows the builtin but is kept because it
            is part of the function's signature.)

    Returns:
        A tuple ``(bookid_list, nearuser)``: the ids of the recommended items,
        best first, and the 15 most similar ``(user, similarity)`` pairs.
    """
    r = recommender(users)
    # str() rather than "%s" % id, which misbehaves when id is a tuple.
    top_items, nearuser = r.recommend(str(id))
    bookid_list = [item for item, _score in top_items]
    return bookid_list, nearuser[:15]
if __name__ == '__main__':
    # Demo run: recommend for one sample user, then show both results.
    bookid_list, near_list = adjustrecommend("luhangdedouban")
    for label, value in (("bookid_list:", bookid_list),
                         ("near_list:", near_list)):
        print (label, value)

运行结果

bookid_list:给用户 luhangdedouban 推荐5部电影。

near_list:与用户 luhangdedouban 相似的其他用户及其相似度

1
2
('bookid_list:', ['1427831', '1041007', '1007305', '26276279', '2180619'])
('near_list:', [('sleepinmorning', 0.8100925873009825), ('124871246', 0.8017837257372784), ('suriding', 0.7826237921249284), ('73760473', 0.6686478498357363), ('144919585', 0.6499999999999979), ('54323940', 0.6255432421712243), ('64645627', 0.6141922686399912), ('sue19870424', 0.6109902663648334), ('67149957', 0.6060630657171625), ('101831219', 0.604917398119787), ('135426426', 0.5887252381134163), ('32034514', 0.5883484054145521), ('4126514', 0.5833333333333344), ('48568576', 0.5738737198305703), ('121532925', 0.5707894706761077)])