Module tfrec.utils.model_utils
Source code
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
def preprocess_and_split(df, shuffle=True, split=True, test_size=0.2, random_state=None):
    """Preprocesses the dataset and optionally splits it.

    **Encodes** users and items to contiguous integer ids from `0` to
    `len(users) - 1` and `len(items) - 1`, and builds the corresponding
    **decoding** maps.
    Shuffles (if `shuffle = True`) and splits into **train** and **test**
    sets (if `split = True`).
    Returns the dataset as numpy arrays together with the encoding and
    decoding maps for users and items.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Dataframe of shape `(m, 3)` with columns in the order `user`, `item`, `rating`.
    shuffle : boolean, optional
        Whether the dataset should be shuffled after encoding (default is `True`).
    split : boolean, optional
        Whether the dataset should be split into train and test sets (default is `True`).
    test_size : float, optional
        Fraction of the data used for the test split (default is `0.2`).
    random_state : integer, optional
        Seed, useful for reproducing results (default is `None`).

    Returns
    -------
    dataset, user_item_encodings : tuple
        `dataset` is a tuple of train and test arrays if `split = True`,
        otherwise a single `(x, y)` tuple.
        **dataset**
            (x_train, y_train), (x_test, y_test).
        **user_item_encodings**
            (user_to_encoded, encoded_to_user, item_to_encoded, encoded_to_item).
    """
    # Get unique ids
    user_ids = df.iloc[:, 0].unique().tolist()
    item_ids = df.iloc[:, 1].unique().tolist()
    # Create encoding and decoding maps for users
    user_to_encoded = {user_id: encoded_id
                       for encoded_id, user_id in enumerate(user_ids)}
    encoded_to_user = {encoded_id: user_id
                       for encoded_id, user_id in enumerate(user_ids)}
    # Create encoding and decoding maps for items
    item_to_encoded = {item_id: encoded_id
                       for encoded_id, item_id in enumerate(item_ids)}
    encoded_to_item = {encoded_id: item_id
                       for encoded_id, item_id in enumerate(item_ids)}
    user_item_encodings = (user_to_encoded, encoded_to_user,
                           item_to_encoded, encoded_to_item)
    # Encode the dataset
    df_encoded = pd.DataFrame([], columns=['user', 'item', 'rating'])
    df_encoded['user'] = df.iloc[:, 0].map(user_to_encoded)
    df_encoded['item'] = df.iloc[:, 1].map(item_to_encoded)
    df_encoded['rating'] = df.iloc[:, 2].values.astype(np.float32)
    # Shuffle if shuffle = True, honoring random_state for reproducibility
    if shuffle:
        df_encoded = df_encoded.sample(frac=1, random_state=random_state)
    # Dataframe -> numpy arrays
    x = df_encoded[['user', 'item']].values
    y = df_encoded[['rating']].values
    dataset = x, y
    # Split if split = True
    if split:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=random_state
        )
        dataset = (x_train, y_train), (x_test, y_test)
    return dataset, user_item_encodings
def cross_validate(model, X, y, folds=5, epochs=5, batch_size=32, callbacks=None, shuffle=False, random_state=None):
    """Performs K-Fold cross validation on data.

    Parameters
    ----------
    model : tf.keras.Model
        A compiled model with appropriate metrics to keep track of.
    X : numpy.ndarray
        Numpy array of shape `(m, 2)` of user-item pairs.
    y : numpy.ndarray
        Numpy array of shape `(m,)` of target ratings.
    folds : integer, optional
        Number of splits for K-Fold (default is `5`).
    epochs : integer, optional
        Epochs to fit the model for on each fold (default is `5`).
    batch_size : integer, optional
        Batch size to be passed in while fitting (default is `32`).
    callbacks : list, optional
        List of callbacks to be passed in while fitting (default is `None`).
    shuffle : boolean, optional
        Whether to shuffle before splitting into folds (default is `False`).
    random_state : integer, optional
        Seed, only used if `shuffle = True` (default is `None`).

    Returns
    -------
    all_metrics, model : tuple
        `all_metrics` is a numpy array of shape `(folds, 1 + no_of_metrics)`
        (column 0 is the loss) containing the validation scores evaluated on
        each fold; `model` is the model fitted on the final fold.
    """
    # Initialize KFold
    kfolds = KFold(n_splits=folds, random_state=random_state, shuffle=shuffle)
    all_metrics = []
    # Call the model once on a small batch to build its weights
    if type(model).__name__ == 'SVDpp':
        model.implicit_feedback(X[:10, :])
    model(X[:10, :])
    # Snapshot the initial weights so every fold starts from the same state
    weights = model.get_weights()
    i = 1
    for train, val in kfolds.split(X, y):
        # Gather implicit feedback if the model is SVD++
        if type(model).__name__ == 'SVDpp':
            model.implicit_feedback(X[train])
        print(f'\nFitting on Fold {i}')
        # Train and evaluate metrics
        model.fit(X[train], y[train], batch_size=batch_size,
                  epochs=epochs, callbacks=callbacks)
        print(f'\nEvaluating on Fold {i}')
        fold_score = model.evaluate(X[val], y[val])
        all_metrics.append(fold_score)
        # Reset to the initial weights before fitting the next fold
        if folds != i:
            model.set_weights(weights)
        i += 1
    all_metrics = np.array(all_metrics)
    for i, metric in enumerate(model.metrics_names):
        print(f'Mean {metric.capitalize()} : {np.mean(all_metrics.T[i])}')
    return all_metrics, model
Functions
def cross_validate(model, X, y, folds=5, epochs=5, batch_size=32, callbacks=None, shuffle=False, random_state=None)

Performs K-Fold cross validation on data.

Parameters
----------
model : tf.keras.Model
    A compiled model with appropriate metrics to keep track of.
X : numpy.ndarray
    Numpy array of shape `(m, 2)` of user-item pairs.
y : numpy.ndarray
    Numpy array of shape `(m,)` of target ratings.
folds : integer, optional
    Number of splits for K-Fold (default is `5`).
epochs : integer, optional
    Epochs to fit the model for on each fold (default is `5`).
batch_size : integer, optional
    Batch size to be passed in while fitting (default is `32`).
callbacks : list, optional
    List of callbacks to be passed in while fitting (default is `None`).
shuffle : boolean, optional
    Whether to shuffle before splitting into folds (default is `False`).
random_state : integer, optional
    Seed, only used if `shuffle = True` (default is `None`).

Returns
-------
all_metrics, model : tuple
    `all_metrics` is a numpy array of shape `(folds, 1 + no_of_metrics)`
    (column 0 is the loss) containing the validation scores evaluated on
    each fold; `model` is the model fitted on the final fold.
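For illustration, a minimal usage sketch. The `DotModel` class below is a hypothetical stand-in for any compiled `tf.keras.Model` (such as this package's SVD-style recommenders); the data shapes follow the docstring above.

import numpy as np
import tensorflow as tf
from tfrec.utils.model_utils import cross_validate

# Toy user-item pairs of shape (m, 2) and ratings of shape (m,)
X = np.array([[0, 1], [1, 0], [2, 3], [3, 2], [0, 2], [1, 3]])
y = np.array([4.0, 3.5, 5.0, 2.0, 4.5, 3.0], dtype=np.float32)

# Hypothetical stand-in model: embeds user/item ids and predicts the
# rating as a dot product; any compiled tf.keras.Model works the same way.
class DotModel(tf.keras.Model):
    def __init__(self, n_users, n_items, dim=8):
        super().__init__()
        self.user_emb = tf.keras.layers.Embedding(n_users, dim)
        self.item_emb = tf.keras.layers.Embedding(n_items, dim)

    def call(self, inputs):
        u = self.user_emb(inputs[:, 0])
        v = self.item_emb(inputs[:, 1])
        return tf.reduce_sum(u * v, axis=1)

model = DotModel(n_users=4, n_items=4)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

scores, fitted = cross_validate(model, X, y, folds=3, epochs=2,
                                batch_size=2, shuffle=True, random_state=42)
print(scores.shape)  # (3, 2): loss and mae for each of the 3 folds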
def preprocess_and_split(df, shuffle=True, split=True, test_size=0.2, random_state=None)

Preprocesses the dataset and optionally splits it.
Encodes users and items to contiguous integer ids from `0` to `len(users) - 1` and `len(items) - 1`, and builds the corresponding decoding maps.
Shuffles (if `shuffle = True`) and splits into train and test sets (if `split = True`).
Returns the dataset as numpy arrays together with the encoding and decoding maps for users and items.

Parameters
----------
df : pandas.core.frame.DataFrame
    Dataframe of shape `(m, 3)` with columns in the order `user`, `item`, `rating`.
shuffle : boolean, optional
    Whether the dataset should be shuffled after encoding (default is `True`).
split : boolean, optional
    Whether the dataset should be split into train and test sets (default is `True`).
test_size : float, optional
    Fraction of the data used for the test split (default is `0.2`).
random_state : integer, optional
    Seed, useful for reproducing results (default is `None`).

Returns
-------
dataset, user_item_encodings : tuple
    `dataset` is a tuple of train and test arrays if `split = True`, otherwise a single `(x, y)` tuple.
    **dataset**
        (x_train, y_train), (x_test, y_test).
    **user_item_encodings**
        (user_to_encoded, encoded_to_user, item_to_encoded, encoded_to_item).
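For illustration, a minimal usage sketch assuming a small ratings DataFrame; the column names are arbitrary, since the function selects columns by position.

import pandas as pd
from tfrec.utils.model_utils import preprocess_and_split

# Columns must be ordered user, item, rating; the names themselves don't matter.
ratings = pd.DataFrame({
    'user_id':  ['u1', 'u2', 'u1', 'u3'],
    'movie_id': ['m1', 'm1', 'm2', 'm3'],
    'rating':   [4.0, 3.0, 5.0, 2.5],
})

(train, test), encodings = preprocess_and_split(
    ratings, test_size=0.25, random_state=42)
x_train, y_train = train
x_test, y_test = test

# Decode an encoded id back to the original user id
user_to_encoded, encoded_to_user, item_to_encoded, encoded_to_item = encodings
print(encoded_to_user[0])  # e.g. 'u1'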