# Item-Item Collaborative Filtering

## Importing libraries

In [303]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

from surprise import Dataset
from surprise import KNNWithMeans, SVD
from surprise import Reader
from surprise.model_selection import train_test_split

## Reading and preprocessing data

Data source: https://www.kaggle.com/code/rounakbanik/movie-recommender-systems/notebook

We have selected a subset of the original dataset, keeping users who rated more than 690 movies and movies that were rated by more than 180 users. Additionally, we have added the titles of the rated movies. The resulting dataset is the intersection of these user and movie subsets.

In [458]:
# load the ratings table (one row per rating, with the movie title attached) from the local workbook
data = pd.read_excel(io='movies.xlsx')

In [459]:
# lookup table mapping each movie id to its title: one row per distinct (movieId, original_title) pair
movie_title = (
    data.loc[:, ['movieId', 'original_title']]
    .drop_duplicates()
)

In [460]:
# Build the user x movie rating matrix (rows: userId, columns: movieId).
# Pairs without a rating come out of the pivot as NaN; they are replaced with 0 so that
# these user-movie pairs can later be picked out as the ones to predict a rating for.
rating_matrix = data.pivot_table(index='userId', columns='movieId', values='rating')
full_data = rating_matrix.fillna(0)
full_data

movieId,1,32,47,50,110,150,260,296,318,356,...,1210,1270,1580,2028,2571,2762,2858,2959,4993,5952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15,2.0,4.0,5.0,5.0,3.0,3.0,5.0,5.0,2.0,1.0,...,5.0,5.0,4.0,3.0,5.0,1.0,4.0,5.0,5.0,5.0
23,3.0,4.0,4.5,4.0,3.5,3.5,4.5,4.5,5.0,4.5,...,4.0,4.5,3.5,4.0,4.0,4.0,3.5,3.5,4.0,4.0
30,4.0,2.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,...,4.0,5.0,4.0,5.0,3.0,5.0,5.0,4.0,3.0,0.0
73,5.0,5.0,5.0,5.0,4.0,3.5,4.5,5.0,5.0,5.0,...,5.0,5.0,3.0,4.5,4.5,4.0,4.5,5.0,5.0,5.0
212,3.0,3.5,3.5,3.5,5.0,4.0,4.0,4.0,4.5,4.0,...,0.0,3.0,1.5,4.0,5.0,3.5,4.0,5.0,5.0,5.0
213,3.0,1.5,2.5,0.0,2.5,1.5,5.0,0.0,0.0,2.0,...,5.0,3.0,4.0,0.0,4.0,2.5,0.0,0.0,4.5,4.0
294,4.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,3.0,4.0,...,4.0,4.0,4.5,0.0,4.5,0.0,0.0,0.0,4.0,4.0
311,3.0,0.0,0.5,3.0,3.0,5.0,4.0,3.0,4.5,5.0,...,3.5,4.5,3.0,5.0,4.0,4.0,0.0,0.0,0.0,0.0
380,4.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,5.0,...,4.0,3.0,3.0,5.0,5.0,5.0,5.0,5.0,4.5,4.0
388,0.0,4.0,5.0,5.0,5.0,4.5,4.5,5.0,5.0,4.0,...,5.0,4.0,3.0,5.0,5.0,4.0,4.0,3.0,5.0,4.5


In [461]:
# Reshape the wide user x movie matrix back into long (userId, movieId, rating) rows.
# The result looks like the original data plus the extra user-movie pairs whose rating
# is 0 (meaning the movie is unrated by that user).
#
# Guard: after the reshape `full_data` has a 'rating' column. Melting the long frame a
# second time would not raise but would silently produce a malformed table, so re-running
# this cell on an already reshaped frame is a no-op.
if 'rating' not in full_data.columns:
    full_data = full_data.reset_index().melt(id_vars='userId', var_name='movieId', value_name='rating')

In [462]:
# Split by rating availability rather than at random:
#   train - every user-movie pair that has a rating
#   test  - every user-movie pair carrying the 0 placeholder (unrated), i.e. the pairs to predict
is_rated = full_data['rating'] != 0
train = full_data[is_rated]
test = full_data[~is_rated]

## Building a recommender system

### scikit-surprise

In [463]:
# Convert the split into the structures the scikit-surprise framework works with.
# rating_scale is the range of real ratings (0.5 to 5); 0 is only the "unrated" placeholder.
reader = Reader(rating_scale=(0.5, 5))
rating_columns = ['userId', 'movieId', 'rating']

# The Dataset wrapper gets its own name so that `train` stays a DataFrame; rebinding
# `train` / `test` to Dataset objects made this cell raise when it was run a second time.
train_dataset = Dataset.load_from_df(train[rating_columns], reader)
trainset = train_dataset.build_full_trainset()

# algo.test() iterates over (user id, movie id, rating) triples, so the unrated pairs are
# emitted as plain tuples in one pass instead of being looked up row by row with .loc.
testset = list(test[rating_columns].itertuples(index=False, name=None))

In [464]:
# Alternative split: a random 80/20 hold-out over the ratings in `data`
# (train_test_split is the scikit-surprise helper from surprise.model_selection).
# The Dataset gets its own name so that the raw `data` DataFrame is not overwritten;
# overwriting it made this cell fail on a second run and lost the original table.
# NOTE: this rebinds `train` / `test`; the models in the following cells are fitted and
# evaluated on `trainset` / `testset`, not on this hold-out split.
reader = Reader(rating_scale=(0.5, 5))
ratings_dataset = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)
train, test = train_test_split(ratings_dataset, test_size=.2, random_state=42)

In [465]:
# Collaborative filtering via Singular Value Decomposition (SVD), fitted on the rated
# user-movie pairs; the random state is pinned so the run is repeatable.
algo = SVD(random_state=42)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2570cb0f430>

In [466]:
# Estimate a rating for every user-movie pair in the test set.
pred = algo.test(testset)

# Collect (user, movie, estimated rating) into a table in a single construction.
# Growing the frame with pd.concat inside a loop re-copies it on every iteration and
# leaves object-typed columns with a repeated 0 index.
predictions = pd.DataFrame(
    [(p.uid, p.iid, p.est) for p in pred],
    columns=['userId', 'movieId', 'rating'],
)

In [468]:
# attach the movie titles to the predicted ratings (movieId is the only shared column)
predictions.merge(movie_title, on='movieId', how='inner')

Unnamed: 0,userId,movieId,rating,original_title
0,388,1,4.178314,Toy Story
1,294,32,3.845652,Twelve Monkeys
2,311,32,3.56767,Twelve Monkeys
3,452,32,3.679041,Twelve Monkeys
4,461,32,3.703987,Twelve Monkeys
...,...,...,...,...
94,30,5952,4.410817,The Lord of the Rings: The Two Towers
95,311,5952,3.608098,The Lord of the Rings: The Two Towers
96,518,5952,4.36384,The Lord of the Rings: The Two Towers
97,547,5952,3.356063,The Lord of the Rings: The Two Towers


In [469]:
# A basic nearest-neighbours alternative: KNNWithMeans, which takes mean ratings into
# account. The similarity options select the cosine measure, and 'user_based': False
# makes the similarities be computed between items (movies) rather than between users.
sim_options = dict(name='cosine', user_based=False)
algo = KNNWithMeans(sim_options=sim_options)
algo.fit(trainset)

# estimate ratings for the same unrated user-movie pairs as before
pred = algo.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [470]:
# Collect (user, movie, estimated rating) from the nearest-neighbours predictions into a
# table in a single construction. Growing the frame with pd.concat inside a loop
# re-copies it on every iteration and leaves object-typed columns with a repeated 0 index.
predictions = pd.DataFrame(
    [(p.uid, p.iid, p.est) for p in pred],
    columns=['userId', 'movieId', 'rating'],
)

In [471]:
# attach the movie titles to the predicted ratings (movieId is the only shared column)
predictions.merge(movie_title, on='movieId', how='inner')

Unnamed: 0,userId,movieId,rating,original_title
0,388,1,4.037184,Toy Story
1,294,32,3.728931,Twelve Monkeys
2,311,32,3.426616,Twelve Monkeys
3,452,32,3.554053,Twelve Monkeys
4,461,32,3.630588,Twelve Monkeys
...,...,...,...,...
94,30,5952,4.299921,The Lord of the Rings: The Two Towers
95,311,5952,3.709705,The Lord of the Rings: The Two Towers
96,518,5952,4.398882,The Lord of the Rings: The Two Towers
97,547,5952,3.399342,The Lord of the Rings: The Two Towers
