Members: Sharon Yin, Kyra Zhu
Webpage: https://zzzyx21.github.io
As the gaming industry continues to flourish, it stands as a vibrant and influential force in today's cultural landscape. Recognizing that a video game's success goes beyond mere entertainment, we set out to understand the factors that contribute to it.
Among these factors, sales and reviews are pivotal metrics. Sales represent more than revenue; they attest to a game's commercial viability and its resonance with consumer preferences. Positive reviews, in turn, bolster a game's reputation and shape its future success. Striking a balance between sales performance and critical acclaim matters to developers and publishers alike.
From a data science perspective, the dynamics of video game success make a compelling and practical subject. With datasets containing variables such as sales figures, critic scores, and genre classifications, we can uncover patterns and correlations, and build predictive models, using statistical analysis, data visualization, and modeling techniques.
We therefore undertake this analysis to better understand the interplay between sales and reviews, and what drives a game's success in today's digital age.
We're using a dataset from Kaggle containing video game sales data updated as of 2024. It includes game titles, consoles, genres, critic scores, and sales figures across several regions. We've also integrated the Steam Store 2024 dataset to examine pricing and review ratings. By combining the two, we aim to analyze how reviews and other factors relate to the high sales of popular games, and to identify patterns in game sales, pricing, and reviews over time.
These datasets suit our analysis well, offering comprehensive insight into video game sales and associated factors. By exploring them, we aim to uncover trends, and eventually predictions, in video game popularity, sales, and critical reception over time.
Therefore, we'll investigate three main questions:
Data source
Bayne Brannen, and Asaniczka. (2024). Video Game Sales 2024 [Data set]. Kaggle. https://doi.org/10.34740/KAGGLE/DSV/7507070
Kanchana1990. (2024, February 6). Steam Store 2024: Hot Picks & Reviews [Data set]. Kaggle. https://www.kaggle.com/datasets/kanchana1990/steam-store-2024-hot-picks-and-reviews
%cd /content
!git clone https://github.com/zzzyx21/zzzyx21.github.io.git
%cd /content/zzzyx21.github.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
/content
Cloning into 'zzzyx21.github.io'...
remote: Enumerating objects: 34, done.
remote: Counting objects: 100% (34/34), done.
remote: Compressing objects: 100% (22/22), done.
remote: Total 34 (delta 16), reused 25 (delta 10), pack-reused 0
Receiving objects: 100% (34/34), 2.96 MiB | 8.30 MiB/s, done.
Resolving deltas: 100% (16/16), done.
/content/zzzyx21.github.io
We imported the first dataset and labeled it games. We then kept only the columns needed for our analysis, as shown in the code, dropping irrelevant columns such as image, publisher, and last update date.
games = pd.read_csv("./vgchartz-2024.csv", encoding="ISO-8859-1")
games = games[["title", "console", "genre", "critic_score", "total_sales", "na_sales", "jp_sales", "pal_sales", "other_sales", "release_date"]]
games
title | console | genre | critic_score | total_sales | na_sales | jp_sales | pal_sales | other_sales | release_date | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 140 | PC | Platform | NaN | NaN | NaN | NaN | NaN | NaN | 2013/10/16 |
1 | 140 | WiiU | Platform | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 757 | PC | Simulation | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 1849 | OSX | Misc | NaN | NaN | NaN | NaN | NaN | NaN | 2014/07/01 |
4 | 1849 | PC | Misc | NaN | NaN | NaN | NaN | NaN | NaN | 2014/07/01 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
64011 | Zwei!! | PSP | Role-Playing | NaN | 0.02 | NaN | 0.02 | NaN | NaN | 2008/12/11 |
64012 | Zwei!! | PS2 | Role-Playing | NaN | NaN | NaN | NaN | NaN | NaN | 2004/08/26 |
64013 | Zwei!! | PC | Role-Playing | NaN | NaN | NaN | NaN | NaN | NaN | 2001/01/01 |
64014 | Zyklus | PC | Adventure | NaN | NaN | NaN | NaN | NaN | NaN | 2003/04/30 |
64015 | Zyuden Sentai Kyoryuger: Game on Gaburincho | 3DS | Action | NaN | 0.05 | NaN | 0.05 | NaN | NaN | 2013/08/08 |
64016 rows × 10 columns
For preprocessing, we clean the data by removing rows with missing values to reduce noise. Using .dropna(), we first drop rows where either the critic score or total sales is NaN, then drop rows where all four regional sales figures are NaN.
# Dropping rows where all specified columns are NaN
columns_to_check = ['critic_score','total_sales']
games = games.dropna(subset=columns_to_check)
regions_to_check = ['na_sales', 'jp_sales', 'pal_sales', 'other_sales']
games = games.dropna(subset=regions_to_check, how="all")
games
title | console | genre | critic_score | total_sales | na_sales | jp_sales | pal_sales | other_sales | release_date | |
---|---|---|---|---|---|---|---|---|---|---|
33 | .hack//G.U. Vol.2//Reminisce | PS2 | Role-Playing | 6.2 | 0.23 | 0.11 | NaN | 0.09 | 0.03 | 2007/05/08 |
35 | .hack//G.U. Vol.3//Redemption | PS2 | Role-Playing | 5.7 | 0.17 | NaN | 0.17 | NaN | NaN | 2007/09/10 |
36 | .hack//Infection Part 1 | PS2 | Role-Playing | 7.7 | 1.27 | 0.49 | 0.26 | 0.38 | 0.13 | 2003/02/11 |
38 | .hack//Mutation Part 2 | PS2 | Role-Playing | 7.5 | 0.68 | 0.23 | 0.20 | 0.18 | 0.06 | 2003/05/07 |
39 | .hack//Outbreak Part 3 | PS2 | Role-Playing | 7.1 | 0.46 | 0.14 | 0.17 | 0.11 | 0.04 | 2003/09/09 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
63919 | Zoo Tycoon DS | DS | Strategy | 4.6 | 0.98 | 0.86 | 0.01 | 0.03 | 0.07 | 2005/10/11 |
63934 | ZooCube | GBA | Puzzle | 8.6 | 0.05 | 0.03 | NaN | 0.01 | 0.00 | 2002/05/14 |
63935 | ZooCube | GC | Puzzle | 6.9 | 0.02 | 0.02 | NaN | 0.00 | 0.00 | 2002/05/05 |
63977 | Zubo | DS | Misc | 7.5 | 0.11 | 0.08 | NaN | 0.02 | 0.01 | 2009/03/10 |
63988 | Zuma's Revenge! | PC | Puzzle | 8.3 | 0.01 | 0.01 | NaN | NaN | 0.00 | 2009/09/16 |
4126 rows × 10 columns
Here is the summary statistics for the numerical columns after preprocessing:
summary_statistics = games.describe()
summary_statistics
critic_score | total_sales | na_sales | jp_sales | pal_sales | other_sales | |
---|---|---|---|---|---|---|
count | 4126.000000 | 4126.000000 | 3738.000000 | 1402.000000 | 3779.000000 | 4003.000000 |
mean | 7.101890 | 0.737230 | 0.416581 | 0.108959 | 0.263697 | 0.083560 |
std | 1.439307 | 1.408497 | 0.734706 | 0.162062 | 0.612218 | 0.199425 |
min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 6.300000 | 0.110000 | 0.080000 | 0.020000 | 0.020000 | 0.010000 |
50% | 7.300000 | 0.300000 | 0.180000 | 0.050000 | 0.070000 | 0.020000 |
75% | 8.100000 | 0.750000 | 0.430000 | 0.130000 | 0.250000 | 0.080000 |
max | 10.000000 | 20.320000 | 9.760000 | 1.870000 | 9.850000 | 3.120000 |
We proceed to generate the figure depicting the distribution of total sales. It's evident from the graph that the distribution is highly right-skewed.
plt.figure(figsize=(7,3))
plt.hist(games['total_sales'], bins=30)
plt.title('Distribution of Total Sales (in millions)')
plt.xlabel('Total Sales (in millions)')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.75)
In this histogram, we observe that the majority of games have total sales between 0.0 and 2.5 million units. This distribution is typical of sales data, where many products sell modestly while a select few achieve outsized success. To enhance clarity, we replot with a logarithmic frequency axis.
plt.figure(figsize=(7,3))
plt.hist(games['total_sales'], bins=30, log = True)
plt.title('Distribution of Total Sales (in millions)')
plt.xlabel('Total Sales (in millions)')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.75)
The log-scaled plot gives a clearer view of the distribution of game sales: the majority of games achieve moderate sales, alongside a notable few that are highly successful.
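The skew can also be quantified numerically. Below is a minimal sketch on synthetic lognormal data (a stand-in for total_sales; the values are made up) showing how pandas' .skew() captures the asymmetry and how a log1p transform reduces it:

```python
import numpy as np
import pandas as pd

# Synthetic right-skewed "sales" figures (lognormal), standing in for total_sales
rng = np.random.default_rng(0)
sales = pd.Series(rng.lognormal(mean=-1.0, sigma=1.0, size=4000))

raw_skew = sales.skew()            # large and positive for right-skewed data
log_skew = np.log1p(sales).skew()  # smaller after compressing the right tail

print(f"raw skew: {raw_skew:.2f}, log1p skew: {log_skew:.2f}")
```

A skewness near zero would indicate symmetry; the raw series is far from it, while the log1p-transformed series is much closer.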
Next, we'll calculate the correlation between the critic score and total sales to determine their relationship.
games["total_sales"].corr(games["critic_score"])
# a correlation coefficient of 0.28 is considered to be a weak correlation.
0.2811658100265469
games.plot.scatter(x="critic_score", y="total_sales", alpha=.5, color = 'green')
# A coefficient of 0.28 indicates a weak positive correlation between critic scores and total sales.
<Axes: xlabel='critic_score', ylabel='total_sales'>
A correlation coefficient of 0.28 indicates only a weak positive relationship between critic score and total sales.
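Because total sales are so right-skewed, Pearson's r can understate a monotonic relationship; a rank-based Spearman correlation is a useful cross-check. A minimal sketch on synthetic data (column names mirror the dataset, but the values are invented):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
n = 500
critic = rng.uniform(1, 10, n)
# Sales grow multiplicatively with score plus noise, giving a skewed target
sales = np.exp(0.3 * critic + rng.normal(0, 1, n)) / 100

df = pd.DataFrame({"critic_score": critic, "total_sales": sales})
pearson = df["total_sales"].corr(df["critic_score"])                      # linear association
spearman = df["total_sales"].corr(df["critic_score"], method="spearman")  # rank-based, robust to skew

print(f"Pearson: {pearson:.2f}, Spearman: {spearman:.2f}")
```

On the real data, a Spearman coefficient noticeably above the Pearson one would suggest the score-sales link is monotonic but distorted by a few blockbuster outliers.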
To determine which genre has the highest sales, we'll aggregate the sales figures for each genre and then create a plot illustrating the total sales for each genre. This visualization will provide a clear indication of which genre is the most successful in terms of sales.
# Sum the sales figures for each genre
genre_sales = games.groupby('genre')[['na_sales', 'jp_sales', 'pal_sales', 'other_sales']].sum()
# Add a total sales column for each genre
genre_sales['total_sales'] = genre_sales.sum(axis=1)
# Sort the genres by total sales
genre_sales_sorted = genre_sales.sort_values('total_sales', ascending=False)
# Plotting the total sales for each genre
plt.figure(figsize=(7,3))
genre_sales_sorted['total_sales'].plot(kind='bar')
plt.title('Total Sales by Genre')
plt.xlabel('Genre')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()
genre_sales_sorted['total_sales']
genre
Shooter             609.93
Action              574.34
Sports              484.90
Role-Playing        254.23
Racing              252.24
Misc                164.90
Platform            155.96
Fighting            138.59
Adventure           131.52
Simulation           99.43
Action-Adventure     76.36
Strategy             46.81
Puzzle               31.36
Music                13.37
Party                 2.99
Sandbox               1.89
MMO                   1.17
Education             0.61
Board Game            0.31
Visual Novel          0.03
Name: total_sales, dtype: float64
The bar chart visualizes total sales by genre: the top three genres (Shooter, Action, and Sports) account for over half of all recorded sales, while niche genres such as Party, MMO, and Visual Novel barely register.
Similarly, to determine which consoles have the highest sales, we sum the sales figures for each console and visualize the totals for comparison.
# Combine the steps for grouping PlayStation and Xbox consoles into a single cell
# Group all PlayStation consoles under a single 'PS' category and all Xbox consoles under 'Xbox'
games['console_grouped'] = games['console'].replace(
{
r'PS.*': 'PlayStation', # Use regex to match any PlayStation variation
r'X.*': 'Xbox' # Use regex to match any Xbox variation
},
regex=True
)
# Recalculate the total sales figures for each console group
console_grouped_sales = games.groupby('console_grouped')[['na_sales', 'jp_sales', 'pal_sales', 'other_sales']].sum()
console_grouped_sales['total_sales'] = console_grouped_sales.sum(axis=1)
# Sort the console groups by total sales
console_grouped_sales_sorted = console_grouped_sales.sort_values('total_sales', ascending=False)
# Plotting the total sales for each console group in a single chart
plt.figure(figsize=(10, 6))
console_grouped_sales_sorted['total_sales'].plot(kind='bar')
plt.title('Total Sales by Console Group')
plt.xlabel('Console Group')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=90)
plt.show()
# Return the sorted total sales for further inspection if needed
console_grouped_sales_sorted['total_sales']
console_grouped
PlayStation    1546.15
Xbox            772.41
Wii             190.09
DS              133.28
PC               99.98
GC               80.58
GBA              75.08
3DS              43.92
N64              27.67
NS               25.20
WiiU             20.12
DC               10.68
NES               4.17
SAT               3.80
GBC               3.78
GB                3.31
SNES              0.53
GEN               0.19
VC                0.00
Name: total_sales, dtype: float64
When grouped together, PlayStation consoles (across all generations) have the highest cumulative sales, underscoring the brand's strong presence and enduring popularity in the gaming market. The Xbox group ranks second, showcasing significant market penetration and success, particularly in western markets.
Next, we identify which games have the highest critic scores and the highest sales by sorting the cleaned data.
# Select games with highest critic scores
top_games = games.sort_values(by='critic_score', ascending=False).head(20)
# Plot bar graph
plt.figure(figsize=(10, 6))
plt.bar(top_games['title'], top_games['critic_score'])
plt.title('Top Games with Highest Critic Scores')
plt.xlabel('Game Title')
plt.ylabel('Critic Score')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Sort by total sales in descending order
highest_sales_games = games.sort_values(by='total_sales', ascending=False).head(20)
# Plot horizontal bar graph
plt.figure(figsize=(10, 10))
plt.barh(highest_sales_games['title'], highest_sales_games['total_sales'])
plt.title('Top Games by Total Sales')
plt.xlabel('Total Sales')
plt.ylabel('Game Title')
plt.gca().invert_yaxis() # Invert y-axis to display highest sales at the top
plt.tight_layout()
plt.show()
These graphs indicate that Grand Theft Auto IV receives the highest critic score, while Grand Theft Auto V achieves the highest sales across all regions. It's apparent that both the Grand Theft Auto series and the Red Dead Redemption series are consistently associated with high critic scores and sales figures, followed by the Call of Duty series.
We then explore how various factors impact total sales. Specifically, we'll examine the influence of factors such as console, genre, critic score, and release date on total sales. By analyzing their relationships, we can gain insights into the drivers behind total sales in the video game industry.
# Create a pairplot with total sales against console, genre, critic score, and release date
sns.pairplot(games, x_vars=["console", "genre", "critic_score", "release_date"], y_vars=["total_sales"], kind='scatter', height=4)
plt.show()
These scatter panels summarize how console, genre, critic score, and release date each relate to total sales.
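Scatter panels against categorical axes like console and genre are hard to read; a box plot per category with a log-scaled y-axis is often clearer for skewed sales. A sketch with synthetic stand-in data (so it runs on its own; real use would pass the cleaned games frame):

```python
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # headless backend so the sketch runs anywhere
import matplotlib.pyplot as plt
import seaborn as sns

rng = np.random.default_rng(1)
# Stand-in for the cleaned games frame: skewed sales within each genre
demo = pd.DataFrame({
    "genre": rng.choice(["Shooter", "Sports", "Puzzle"], size=300),
    "total_sales": rng.lognormal(mean=-1, sigma=1, size=300),
})

plt.figure(figsize=(7, 3))
ax = sns.boxplot(data=demo, x="genre", y="total_sales")
ax.set_yscale("log")  # log scale tames the right skew seen earlier
ax.set_title("Total Sales by Genre (log scale)")
plt.tight_layout()
```

Each box then shows the median and spread of sales within a genre, which the scatter version obscures when points pile up near zero.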
We now read in the second table and name it as steam.
steam = pd.read_csv("./steam_store_data_2024.csv")
steam = steam[["title", "price", "allReviews"]]
steam
title | price | allReviews | |
---|---|---|---|
0 | 100 Ninja Cats | NaN | NaN |
1 | And the Hero Was Never Seen Again | NaN | NaN |
2 | Tycoon Collection | NaN | NaN |
3 | Arzette: The Jewel of Faramore | NaN | NaN |
4 | Atomic Heart | $29.99 | Mostly Positive |
... | ... | ... | ... |
81 | ULTROS | $22.49 | NaN |
82 | Uncharted: Legacy of Thieves Collection | $24.99 | Very Positive |
83 | Undying | $13.99 | Mostly Positive |
84 | Vampire: The Masquerade - Bloodlines 2 | NaN | NaN |
85 | WitchHand | NaN | NaN |
86 rows × 3 columns
We convert the allReviews labels to numeric scores for easier analysis: 'Overwhelmingly Positive' maps to 9, 'Very Positive' to 8, and so on down to 'Overwhelmingly Negative' at 1.
ratings_map = {
'Overwhelmingly Positive': 9,
'Very Positive': 8,
'Positive': 7,
'Mostly Positive': 6,
'Mixed': 5,
'Mostly Negative': 4,
'Negative': 3,
'Very Negative': 2,
'Overwhelmingly Negative': 1
}
steam['allReviews'] = steam['allReviews'].replace(ratings_map)
steam
title | price | allReviews | |
---|---|---|---|
0 | 100 Ninja Cats | NaN | NaN |
1 | And the Hero Was Never Seen Again | NaN | NaN |
2 | Tycoon Collection | NaN | NaN |
3 | Arzette: The Jewel of Faramore | NaN | NaN |
4 | Atomic Heart | $29.99 | 6.0 |
... | ... | ... | ... |
81 | ULTROS | $22.49 | NaN |
82 | Uncharted: Legacy of Thieves Collection | $24.99 | 8.0 |
83 | Undying | $13.99 | 6.0 |
84 | Vampire: The Masquerade - Bloodlines 2 | NaN | NaN |
85 | WitchHand | NaN | NaN |
86 rows × 3 columns
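One subtlety of this conversion: .replace leaves labels missing from the map untouched, whereas .map turns them into NaN. Which behavior you want depends on whether unexpected labels should surface as missing values. A quick standalone contrast (with made-up labels):

```python
import pandas as pd

ratings = pd.Series(["Very Positive", "Mixed", "Somewhat Positive"])  # last label is not in the map
ratings_map = {"Very Positive": 8, "Mixed": 5}

replaced = ratings.replace(ratings_map)  # unmapped label passes through unchanged
mapped = ratings.map(ratings_map)        # unmapped label becomes NaN

print(replaced.tolist())
print(mapped.tolist())
```

With .replace, a typo in a rating label would silently survive as a string in an otherwise numeric column; with .map it would show up as NaN and be caught by a later isna() check.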
We then strip the $ sign from the price column and convert the values to floats.
steam['price'] = steam['price'].astype(str)
steam['price'] = steam['price'].str.replace('$', '', regex = False)
steam['price'] = steam['price'].astype('float64')
steam
title | price | allReviews | |
---|---|---|---|
0 | 100 Ninja Cats | NaN | NaN |
1 | And the Hero Was Never Seen Again | NaN | NaN |
2 | Tycoon Collection | NaN | NaN |
3 | Arzette: The Jewel of Faramore | NaN | NaN |
4 | Atomic Heart | 29.99 | 6.0 |
... | ... | ... | ... |
81 | ULTROS | 22.49 | NaN |
82 | Uncharted: Legacy of Thieves Collection | 24.99 | 8.0 |
83 | Undying | 13.99 | 6.0 |
84 | Vampire: The Masquerade - Bloodlines 2 | NaN | NaN |
85 | WitchHand | NaN | NaN |
86 rows × 3 columns
For preprocessing, we count the missing values, then drop incomplete and duplicate rows.
steam.isna().sum()
title          0
price         25
allReviews    29
dtype: int64
steam.dropna(inplace=True)  # Drop rows with any NaN value
steam = steam.drop_duplicates(subset='title')  # Assign back so duplicate titles are actually removed
steam
title | price | allReviews | |
---|---|---|---|
4 | Atomic Heart | 29.99 | 6.0 |
6 | Bendy and the Dark Revival | 5.99 | 8.0 |
7 | Bendy and the Dark Revival | 5.99 | 8.0 |
8 | BlazBlue | 17.15 | 6.0 |
9 | Boxes: Lost Fragments | 13.49 | 8.0 |
10 | CARRION | 5.99 | 8.0 |
11 | CARRION | 5.99 | 8.0 |
15 | Crisis Core: Final Fantasy VII Reunion | 29.99 | 8.0 |
16 | Days Gone | 12.49 | 8.0 |
17 | Dead by Daylight | 7.99 | 8.0 |
18 | Dead by Daylight | 7.99 | 8.0 |
19 | Dead Space | 23.99 | 8.0 |
21 | Destiny 2 | 14.99 | 6.0 |
22 | Destiny 2: Lightfall + Annual Pass | 33.00 | 4.0 |
23 | Dragon Quest XI: Echoes of an Elusive Age | 23.99 | 8.0 |
24 | Dragon Quest XI: Echoes of an Elusive Age | 23.99 | 8.0 |
25 | Fallout 76 | 9.99 | 6.0 |
27 | Final Fantasy VII Remake Intergrade | 34.99 | 8.0 |
28 | Final Fantasy VII Remake Intergrade | 34.99 | 8.0 |
29 | Final Fantasy X / X-2 HD Remaster | 11.99 | 8.0 |
30 | Final Fantasy XII: The Zodiac Age | 19.99 | 8.0 |
31 | Final Fantasy XV: Windows Edition | 13.99 | 8.0 |
32 | Flashing Lights - Police, Firefighting, Emerge... | 8.49 | 8.0 |
33 | Flashing Lights - Police, Firefighting, Emerge... | 8.49 | 8.0 |
35 | God of War | 24.99 | 9.0 |
36 | Grounded | 26.79 | 8.0 |
38 | Halo Infinite | 23.99 | 6.0 |
39 | Halo Infinite (Campaign) | 23.99 | 5.0 |
40 | Headbangers: Rhythm Royale | 13.39 | 8.0 |
41 | Hell Let Loose | 42.24 | 8.0 |
42 | HELLCARD | 15.79 | 8.0 |
45 | Hogwarts Legacy | 35.99 | 8.0 |
46 | Hogwarts Legacy | 35.99 | 8.0 |
49 | Last Train Home | 26.39 | 8.0 |
50 | LEGO Star Wars: The Complete Saga | 4.99 | 9.0 |
54 | Marvel’s Spider-Man Remastered | 35.99 | 8.0 |
55 | Marvel’s Spider-Man: Miles Morales | 29.99 | 8.0 |
56 | Moonbreaker | 22.49 | 8.0 |
58 | Ori and the Will of the Wisps | 9.89 | 9.0 |
59 | Ori and the Will of the Wisps | 9.89 | 9.0 |
60 | Overcooked! 2 | 6.24 | 8.0 |
61 | Poppy Playtime - Chapter 3 | 9.89 | 8.0 |
62 | Ratchet & Clank: Rift Apart | 40.19 | 8.0 |
63 | Ready or Not | 37.49 | 6.0 |
64 | Ready or Not | 37.49 | 6.0 |
65 | Returnal | 40.19 | 8.0 |
69 | Star Wars Battlefront II | 3.49 | 8.0 |
70 | Star Wars: The Force Unleashed - Ultimate Sith... | 6.99 | 8.0 |
71 | Star Wars: Empire at War - Gold Pack | 6.99 | 9.0 |
72 | Stranger of Paradise: Final Fantasy Origin | 23.99 | 8.0 |
73 | The Elder Scrolls V: Skyrim Special Edition | 9.99 | 8.0 |
75 | Thronefall | 5.24 | 9.0 |
76 | Thronefall | 5.24 | 9.0 |
77 | Thymesia | 14.99 | 8.0 |
78 | Thymesia | 14.99 | 8.0 |
82 | Uncharted: Legacy of Thieves Collection | 24.99 | 8.0 |
83 | Undying | 13.99 | 6.0 |
# Price is already numeric after the cleaning above; coerce defensively before plotting
steam['price'] = pd.to_numeric(steam['price'], errors='coerce')
# Plotting the histogram of prices
steam['price'].hist(bins=20)
plt.title('Distribution of Game Prices on Steam')
plt.xlabel('Price ($)')
plt.ylabel('Number of Games')
plt.show()
After cleaning the data, we have this plot of the distribution of game prices on Steam.
Next, we'll narrow our focus to PC games. We filter the first dataset (games) down to PC titles and merge the result with the steam data; since Steam is a PC storefront, this merge lets us analyze PC games across both sources.
games = pd.read_csv("./vgchartz-2024.csv", encoding="ISO-8859-1")
games = games[["title", "console", "genre", "critic_score", "total_sales", "na_sales", "jp_sales", "pal_sales", "other_sales"]]
steam = pd.read_csv("./steam_store_data_2024.csv")
steam = steam[["title", "price", "allReviews"]]
steam
ratings_map = {
'Overwhelmingly Positive': 9,
'Very Positive': 8,
'Positive': 7,
'Mostly Positive': 6,
'Mixed': 5,
'Mostly Negative': 4,
'Negative': 3,
'Very Negative': 2,
'Overwhelmingly Negative': 1
}
steam['allReviews'] = steam['allReviews'].replace(ratings_map)
steam['price'] = steam['price'].astype(str)
steam['price'] = steam['price'].str.replace('$', '', regex = False)
steam['price'] = steam['price'].astype('float64')
steam
games = games[["title", "console", "genre"]]
pc_games_vgchartz = games[games['console'] == 'PC']
# Merge the filtered PC games dataset with the Steam data
pc_merged_data = pd.merge(steam, pc_games_vgchartz, on='title', how='inner')
pc_merged_data = pc_merged_data.drop_duplicates(subset='title')
pc_merged_data['price'] = pc_merged_data['price'].fillna(pc_merged_data['price'].mean())
pc_merged_data['allReviews'] = pc_merged_data['allReviews'].fillna(pc_merged_data['allReviews'].mean())
pc_merged_data
title | price | allReviews | console | genre | |
---|---|---|---|---|---|
0 | Tycoon Collection | 22.224375 | 7.714286 | PC | Strategy |
1 | Atomic Heart | 29.990000 | 6.000000 | PC | Shooter |
2 | Banishers: Ghosts of New Eden | 49.990000 | 7.714286 | PC | Role-Playing |
3 | Crisis Core: Final Fantasy VII Reunion | 29.990000 | 8.000000 | PC | Role-Playing |
4 | Days Gone | 12.490000 | 8.000000 | PC | Action-Adventure |
5 | Dead by Daylight | 7.990000 | 8.000000 | PC | Action |
7 | Dead Space | 23.990000 | 8.000000 | PC | Shooter |
8 | Deep Rock Galactic | 22.224375 | 7.714286 | PC | Shooter |
9 | Destiny 2 | 14.990000 | 6.000000 | PC | Shooter |
10 | Fallout 76 | 9.990000 | 6.000000 | PC | Role-Playing |
11 | Fight Crab | 22.224375 | 7.714286 | PC | Action |
12 | Final Fantasy X / X-2 HD Remaster | 11.990000 | 8.000000 | PC | Role-Playing |
13 | Final Fantasy XII: The Zodiac Age | 19.990000 | 8.000000 | PC | Role-Playing |
14 | Final Fantasy XV: Windows Edition | 13.990000 | 8.000000 | PC | Role-Playing |
15 | Goat Simulator | 22.224375 | 7.714286 | PC | Misc |
16 | God of War | 24.990000 | 9.000000 | PC | Action-Adventure |
17 | Grounded | 26.790000 | 8.000000 | PC | Action-Adventure |
18 | Gunvolt Records Cychronicle | 22.224375 | 7.714286 | PC | Music |
19 | Halo Infinite | 23.990000 | 6.000000 | PC | Shooter |
20 | Helldivers 2 | 59.990000 | 7.714286 | PC | Shooter |
21 | Hogwarts Legacy | 35.990000 | 8.000000 | PC | Role-Playing |
23 | Last Train Home | 26.390000 | 8.000000 | PC | Strategy |
24 | LEGO Star Wars: The Complete Saga | 4.990000 | 9.000000 | PC | Misc |
25 | Ori and the Will of the Wisps | 9.890000 | 9.000000 | PC | Action-Adventure |
27 | Ratchet & Clank: Rift Apart | 40.190000 | 8.000000 | PC | Action-Adventure |
28 | Ready or Not | 37.490000 | 6.000000 | PC | Shooter |
30 | Returnal | 40.190000 | 8.000000 | PC | Shooter |
31 | Star Wars Battlefront II | 3.490000 | 8.000000 | PC | Shooter |
32 | Star Wars: The Force Unleashed - Ultimate Sith... | 6.990000 | 8.000000 | PC | Action |
33 | Star Wars: Empire at War - Gold Pack | 6.990000 | 9.000000 | PC | Strategy |
34 | Stranger of Paradise: Final Fantasy Origin | 23.990000 | 8.000000 | PC | Action |
35 | The Elder Scrolls V: Skyrim Special Edition | 9.990000 | 8.000000 | PC | Role-Playing |
36 | Nicolas Eymerich - The Inquisitor | 22.224375 | 7.714286 | PC | Misc |
37 | Thymesia | 14.990000 | 8.000000 | PC | Role-Playing |
39 | Tomb Raider I-III Remastered | 26.990000 | 7.714286 | PC | Action-Adventure |
40 | ULTROS | 22.490000 | 7.714286 | PC | Action-Adventure |
41 | Uncharted: Legacy of Thieves Collection | 24.990000 | 8.000000 | PC | Action-Adventure |
42 | Undying | 13.990000 | 6.000000 | PC | Adventure |
43 | Vampire: The Masquerade - Bloodlines 2 | 22.224375 | 7.714286 | PC | Action |
pc_merged_data.shape
(39, 5)
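Only 39 of the 86 Steam titles survive the inner join, so it is worth auditing what an exact-title merge drops. A minimal sketch (with made-up rows) using pd.merge's indicator flag to count matched and unmatched titles:

```python
import pandas as pd

# Hypothetical mini versions of the two tables; real titles often differ in punctuation or edition
steam = pd.DataFrame({"title": ["God of War", "Returnal", "Undying", "WitchHand"],
                      "price": [24.99, 40.19, 13.99, None]})
vg = pd.DataFrame({"title": ["God of War", "Returnal", "Halo Infinite"],
                   "console": ["PC", "PC", "PC"]})

# how="outer" plus indicator labels each row by which table(s) its title appeared in
audit = steam.merge(vg, on="title", how="outer", indicator=True)
print(audit["_merge"].value_counts())

# left_only rows are Steam titles an inner join would silently discard
unmatched = audit.loc[audit["_merge"] == "left_only", "title"].tolist()
print(unmatched)
```

Inspecting the left_only titles against the other table can reveal near-misses (punctuation, subtitles, trademark symbols) that fuzzy matching or manual renaming could recover.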
First, we'll visualize the distribution of game prices to understand the pricing landscape for PC games on Steam.
plt.figure(figsize=(7, 3))
sns.histplot(pc_merged_data['price'], bins=30)
plt.title('Distribution of Game Prices')
plt.xlabel('Price ($)')
plt.ylabel('Number of Games')
plt.grid(True)
plt.show()
We observe that most games cluster between $20 and $25; note that much of this spike reflects the mean-imputed price (about $22.22) assigned to games whose price was missing.
Moving forward, we'll analyze the correlation between review scores and prices to determine if higher-priced games generally receive better reviews.
plt.figure(figsize=(6, 3))
sns.scatterplot(data=pc_merged_data, x='price', y='allReviews')
plt.title('Price vs. Review Scores')
plt.xlabel('Price ($)')
plt.ylabel('Review Score')
plt.grid(True)
plt.show()
reg_model = LinearRegression()
reg_model.fit(pc_merged_data[['price']], pc_merged_data['allReviews'])
# Predict review scores and report the fitted slope to quantify the (weak) trend
predicted_reviews = reg_model.predict(pc_merged_data[['price']])
print(f"Slope: {reg_model.coef_[0]:.4f}")
According to this figure, the review scores appear to be generally high across all price ranges, indicating that price is not a direct indicator of higher satisfaction or quality perception among players.
Also, many games, irrespective of price, have received high review scores (8-9), suggesting that quality experiences are available across various price points.
However, the graph shows no clear trend or correlation between price and review scores. The points are spread fairly evenly across the price range, with most review scores clustered in the same region regardless of price, suggesting little to no linear relationship between a Steam game's price and its review score.
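This "no clear trend" impression can be quantified: the fitted slope and R² of a LinearRegression should both be near zero when price carries no information about reviews. A standalone sketch on invented price/review pairs (the real analysis would use pc_merged_data):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(7)
# Invented prices and review scores with no built-in relationship
X = rng.uniform(3, 60, size=(40, 1)).round(2)  # prices in dollars
y = rng.choice([5.0, 6.0, 8.0, 9.0], size=40)  # mapped review scores

reg = LinearRegression().fit(X, y)
print(f"slope: {reg.coef_[0]:.4f}, R^2: {reg.score(X, y):.3f}")
```

An R² close to zero confirms numerically what the scatter plot suggests visually.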
We now explore which genres are most common and how they vary in terms of pricing and reviews. This will give us insights into market trends and consumer preferences.
# Group by genre and calculate average price and average review score
genre_analysis = pc_merged_data.groupby('genre').agg(
Average_Price=('price', 'mean'),
Average_Review_Score=('allReviews', 'mean'),
Count=('title', 'count')
).sort_values(by='Count', ascending=False)
# Plotting average price and review score by genre
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
genre_analysis['Average_Price'].plot(kind='bar', ax=axes[0], color='skyblue')
axes[0].set_title('Average Price by Genre')
axes[0].set_xlabel('Genre')
axes[0].set_ylabel('Average Price ($)')
axes[0].tick_params(axis='x', rotation=45)
genre_analysis['Average_Review_Score'].plot(kind='bar', ax=axes[1], color='lightgreen')
axes[1].set_title('Average Review Score by Genre')
axes[1].set_xlabel('Genre')
axes[1].set_ylabel('Average Review Score')
axes[1].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# Display the genre analysis for number of games per genre
genre_analysis[['Count']]
Count | |
---|---|
genre | |
Role-Playing | 9 |
Shooter | 9 |
Action-Adventure | 8 |
Action | 5 |
Misc | 3 |
Strategy | 3 |
Adventure | 1 |
Music | 1 |
Some key insights include:
Genre Popularity and Count: Role-Playing and Shooter are the most common genres among these PC games, with 9 titles each, followed closely by Action-Adventure with 8.
Average Price by Genre: Shooter games tend to have higher prices on average compared to other genres, which might reflect higher production values or market expectations for this genre. Other genres like Action-Adventure and Role-Playing show lower average prices, indicating more price variability or possibly different pricing strategies within these genres.
The success of any predictive modeling project hinges on robust data preparation. For this analysis, we began by cleaning the video game sales data sourced from Kaggle and the Steam Store, ensuring completeness by removing entries with missing values in critical fields such as console, genre, critic score, release date, and total sales. We then extracted relevant features for our prediction models: game titles, consoles, genres, critic scores, release dates, and sales figures.
To enhance the predictive power of our models, we transformed categorical data (e.g., console and genre) using one-hot encoding, allowing models to better capture the impact of different categories on sales. Numerical data (e.g., critic scores) were standardized to bring them onto a comparable scale, mitigating any bias towards variables with larger ranges.
We selected two distinct types of regression models to predict video game sales: RandomForest and K-Nearest Neighbors (KNN), each with its strengths and computational considerations.
Random Forest Regressor. Rationale: capable of handling complex nonlinear relationships and feature interactions without extensive hyperparameter tuning, and robust against overfitting when the forest has many trees. Configuration: 100 trees, with random state 42 for reproducibility. Strengths: captures complex patterns, provides feature importance metrics, and is generally less sensitive to outliers. Limitations: computationally intensive and less interpretable than simpler models.
K-Nearest Neighbors (KNN) Regressor. Rationale: a simple, instance-based learner that predicts new instances from the closest historical examples in the feature space. Configuration: K=5 for our baseline, so each prediction averages the five nearest points. Strengths: intuitive and straightforward; performs well on smaller datasets. Limitations: sensitive to the local structure of the data, easily affected by noisy or irrelevant features, and requires feature scaling so that all dimensions contribute equally to distances.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import make_column_selector as selector
# Load the data
games = pd.read_csv("./vgchartz-2024.csv", encoding="ISO-8859-1")
games = games[["title", "console", "genre", "critic_score", "total_sales", "release_date"]]
# Drop rows missing any field needed for modeling (reassign instead of inplace to avoid chained-assignment issues)
games = games.dropna(subset=["console", "genre", "critic_score", "release_date", "total_sales"])
# Split features and target variable
X = games.drop(columns=["total_sales"])
y = games["total_sales"]
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Preprocessing
# Define transformers for numerical and categorical features
numerical_features = selector(dtype_exclude="object")(X)
categorical_features = selector(dtype_include="object")(X)
# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler())
])
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_features),
('cat', categorical_transformer, categorical_features)
])
# Define model
model = Pipeline(steps=[('preprocessor', preprocessor),
('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])
# Train the model
model.fit(X_train, y_train)
# Predictions
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 1.6234001455878784
# Log-transform the actual and predicted total sales
y_test_log = np.log1p(y_test)
y_pred_log = np.log1p(y_pred)
# Fit a linear regression line
slope, intercept = np.polyfit(y_test_log, y_pred_log, 1)
line = slope * y_test_log + intercept
# Plotting actual vs predicted total sales with a trend line on a log scale
plt.figure(figsize=(10, 6))
plt.scatter(y_test_log, y_pred_log, color='blue', alpha=0.5)
plt.plot(y_test_log, line, color='red', linewidth=2)
plt.xlabel('Log(Actual Total Sales)')
plt.ylabel('Log(Predicted Total Sales)')
plt.title('Log(Actual) vs Log(Predicted) Total Sales with Trend Line (Random Forest Regressor)')
plt.grid(True)
plt.show()
# Calculate R-squared value
r_squared = r2_score(y_test_log, y_pred_log)
print(f"R-squared value: {r_squared}")
R-squared value: 0.2792974010449363
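A brief note on the log transform used above: `np.log1p` (i.e. log(1 + x)) is chosen rather than a plain `np.log` because many titles in the dataset have sales at or near zero.

```python
import numpy as np

# log1p(x) = log(1 + x) stays finite at x = 0, so zero-sales titles do
# not blow up to -inf the way a plain np.log(0) would.
sales = np.array([0.0, 0.02, 0.5, 2.0, 20.0])
logged = np.log1p(sales)

# expm1 inverts the transform exactly, which is what would be needed to
# report model predictions back in millions of units.
restored = np.expm1(logged)
print(np.allclose(restored, sales))  # True
```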
from sklearn.neighbors import KNeighborsRegressor
# Define model
knn_model = Pipeline(steps=[('preprocessor', preprocessor),
('regressor', KNeighborsRegressor(n_neighbors=5))])
# Train the model
knn_model.fit(X_train, y_train)
# Predictions
y_pred = knn_model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
Mean Squared Error: 1.5529656727272727
# Log-transform the actual and predicted total sales
y_test_log = np.log1p(y_test)
y_pred_log = np.log1p(y_pred)
# Fit a linear regression line
slope, intercept = np.polyfit(y_test_log, y_pred_log, 1)
line = slope * y_test_log + intercept
# Plotting actual vs predicted total sales with a trend line on a log scale
plt.figure(figsize=(10, 6))
plt.scatter(y_test_log, y_pred_log, color='blue', alpha=0.5)
plt.plot(y_test_log, line, color='red', linewidth=2)
plt.xlabel('Log(Actual Total Sales)')
plt.ylabel('Log(Predicted Total Sales)')
plt.title('Log(Actual) vs Log(Predicted) Total Sales with Trend Line (KNN)')
plt.grid(True)
plt.show()
# Calculate R-squared value
r_squared = r2_score(y_test_log, y_pred_log)
print(f"R-squared value: {r_squared}")
R-squared value: 0.27131038170548916
Random Forest Regressor: The scatter plot shows a dense clustering of points around the lower sales values with predictions generally following the actual sales trend as indicated by the linear trend line. This model demonstrates a stronger linear relationship between predicted and actual sales, especially for lower and mid-range sales values, indicated by the closer fit of points along the trend line.
K-Nearest Neighbors (KNN): Similar to the Random Forest model, there is a dense clustering around lower sales values, but the spread of points away from the trend line is slightly more pronounced, especially at higher sales values. The trend line suggests that while KNN follows the overall trend well, it might struggle with accuracy at the extremes, potentially due to its sensitivity to local variations in data.
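Since squared errors are hard to read in sales units, the held-out MSE values printed above convert to RMSE (back in millions of copies sold) as follows:

```python
import math

# Held-out test MSEs printed by the two model cells above.
mse_rf = 1.6234001455878784
mse_knn = 1.5529656727272727

# RMSE is in the target's own units (millions of copies sold), so a
# typical prediction is off by roughly 1.2-1.3 million units.
rmse_rf = math.sqrt(mse_rf)    # ≈ 1.274
rmse_knn = math.sqrt(mse_knn)  # ≈ 1.246
print(f"RMSE RF: {rmse_rf:.3f}, RMSE KNN: {rmse_knn:.3f}")
```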
from sklearn.model_selection import cross_val_score
# Define the RandomForestRegressor model
rf_model = Pipeline(steps=[('preprocessor', preprocessor),
('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])
# Perform cross-validation
cv_scores_rf = cross_val_score(rf_model, X, y, cv=5, scoring='neg_mean_squared_error')
# Convert the negative MSE scores to positive and calculate RMSE
cv_rmse_scores_rf = np.sqrt(-cv_scores_rf)
# Print the RMSE scores and mean RMSE
print("Cross-Validation RMSE Scores:", cv_rmse_scores_rf)
print("Mean RMSE:", cv_rmse_scores_rf.mean())
# Plotting cross-validation RMSE scores
plt.figure(figsize=(7, 3))
plt.bar(range(1, 6), cv_rmse_scores_rf)
plt.xlabel('Fold')
plt.ylabel('RMSE')
plt.title('Cross-Validation RMSE Scores for Random Forest Regressor')
plt.grid(axis='y')
plt.show()
Cross-Validation RMSE Scores: [1.82545462 1.64839904 1.01734427 1.04961326 0.96906819]
Mean RMSE: 1.3019758763516938
from sklearn.neighbors import KNeighborsRegressor
# Define the KNeighborsRegressor model
knn_model = Pipeline(steps=[('preprocessor', preprocessor),
('regressor', KNeighborsRegressor(n_neighbors=5))])
# Perform cross-validation on the KNN model
cv_scores_knn = cross_val_score(knn_model, X, y, cv=5, scoring='neg_mean_squared_error')
# Convert the negative MSE scores to positive and calculate RMSE for KNN
cv_rmse_scores_knn = np.sqrt(-cv_scores_knn)
# Print the RMSE scores and mean RMSE for KNN
print("Cross-Validation RMSE Scores for KNN:", cv_rmse_scores_knn)
print("Mean RMSE for KNN:", cv_rmse_scores_knn.mean())
# Plotting cross-validation RMSE scores for KNN
plt.figure(figsize=(7, 3))
plt.bar(range(1, 6), cv_rmse_scores_knn)
plt.xlabel('Fold')
plt.ylabel('RMSE')
plt.title('Cross-Validation RMSE Scores for KNN')
plt.grid(axis='y')
plt.show()
Cross-Validation RMSE Scores for KNN: [1.7273006  1.62028745 1.00503727 1.07073941 1.02809062]
Mean RMSE for KNN: 1.2902910677318675
Random Forest Regressor: The RMSE scores across the five folds show variability, with some folds performing significantly better than others. The variation suggests that while the model generally performs well, its predictions can vary depending on the specific subset of data used, indicating potential overfitting or sensitivity to data partitioning.
K-Nearest Neighbors (KNN): KNN shows less variability in RMSE scores across folds than Random Forest, suggesting more consistent performance across different data subsets. Notably, its mean RMSE (1.290) is marginally lower than Random Forest's (1.302), so on this cross-validation comparison the two models are effectively tied. KNN's reliance on local data characteristics makes its per-fold behavior steadier here, while Random Forest's advantage shows up instead in the slightly better held-out log-scale fit reported earlier.
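The fold-to-fold variability described above can be quantified directly from the cross-validation RMSE arrays printed by the two runs:

```python
import numpy as np

# Per-fold RMSE scores as printed by the cross-validation cells above.
cv_rmse_rf = np.array([1.82545462, 1.64839904, 1.01734427, 1.04961326, 0.96906819])
cv_rmse_knn = np.array([1.7273006, 1.62028745, 1.00503727, 1.07073941, 1.02809062])

# The standard deviation across folds summarizes stability: a smaller
# value means performance depends less on which subset was held out.
print(f"RF:  mean={cv_rmse_rf.mean():.3f}, std={cv_rmse_rf.std():.3f}")
print(f"KNN: mean={cv_rmse_knn.mean():.3f}, std={cv_rmse_knn.std():.3f}")
```

The KNN scores have a smaller spread across folds (std ≈ 0.32 vs ≈ 0.36 for Random Forest), which is what the comparison above is describing.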
from sklearn.model_selection import learning_curve
# Define the learning curve parameters
train_sizes, train_scores, test_scores = learning_curve(
knn_model, X, y, cv=5, scoring='neg_mean_squared_error',
train_sizes=np.linspace(0.1, 1.0, 5))
# Convert the negative MSE scores to per-fold RMSE, then aggregate
train_rmse = np.sqrt(-train_scores)
test_rmse = np.sqrt(-test_scores)
train_rmse_scores = train_rmse.mean(axis=1)
test_rmse_scores = test_rmse.mean(axis=1)
# Std of the per-fold RMSEs (note: sqrt of the MSE std is not a valid std)
train_rmse_std = train_rmse.std(axis=1)
test_rmse_std = test_rmse.std(axis=1)
# Plot learning curve
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_rmse_scores, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_rmse_scores, 'o-', color="g", label="Cross-validation score")
plt.fill_between(train_sizes, train_rmse_scores - train_rmse_std,
train_rmse_scores + train_rmse_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_rmse_scores - test_rmse_std,
test_rmse_scores + test_rmse_std, alpha=0.1, color="g")
plt.title('Learning Curves for KNN Model')
plt.xlabel('Training examples')
plt.ylabel('RMSE')
plt.legend(loc="best")
plt.grid(True)
plt.show()
# Output RMSE scores from cross-validation
cv_rmse_scores_knn
array([1.7273006 , 1.62028745, 1.00503727, 1.07073941, 1.02809062])
# Define the learning curve parameters for RandomForest
train_sizes_rf, train_scores_rf, test_scores_rf = learning_curve(
rf_model, X, y, cv=5, scoring='neg_mean_squared_error',
train_sizes=np.linspace(0.1, 1.0, 5))
# Convert the negative MSE scores to per-fold RMSE for RandomForest, then aggregate
train_rmse_rf = np.sqrt(-train_scores_rf)
test_rmse_rf = np.sqrt(-test_scores_rf)
train_rmse_scores_rf = train_rmse_rf.mean(axis=1)
test_rmse_scores_rf = test_rmse_rf.mean(axis=1)
train_rmse_std_rf = train_rmse_rf.std(axis=1)
test_rmse_std_rf = test_rmse_rf.std(axis=1)
# Plot learning curve for RandomForest
plt.figure(figsize=(10, 6))
plt.plot(train_sizes_rf, train_rmse_scores_rf, 'o-', color="r", label="Training score")
plt.plot(train_sizes_rf, test_rmse_scores_rf, 'o-', color="g", label="Cross-validation score")
plt.fill_between(train_sizes_rf, train_rmse_scores_rf - train_rmse_std_rf,
train_rmse_scores_rf + train_rmse_std_rf, alpha=0.1, color="r")
plt.fill_between(train_sizes_rf, test_rmse_scores_rf - test_rmse_std_rf,
test_rmse_scores_rf + test_rmse_std_rf, alpha=0.1, color="g")
plt.title('Learning Curves for RandomForest Model')
plt.xlabel('Training examples')
plt.ylabel('RMSE')
plt.legend(loc="best")
plt.grid(True)
plt.show()
# Output RMSE scores from cross-validation for RandomForest
cv_rmse_scores_rf
array([1.82545462, 1.64839904, 1.01734427, 1.04961326, 0.96906819])
KNN Model Learning Curve: The training score starts at a higher RMSE and decreases as more training data is used. This suggests that the model initially overfits to smaller samples but generalizes better with more data. The cross-validation score decreases and begins to plateau, indicating that adding more data beyond a certain point might not significantly improve the model's performance on unseen data.
RandomForest Model Learning Curve: The training score starts at a very low RMSE, showing that the RandomForest model fits the training data very well right from the start. The cross-validation score decreases significantly as the number of training examples increases, suggesting that the model is learning effectively. The relatively flat trend at larger training sizes suggests that the model might not gain much from more data in terms of generalization.
Given the Kaggle sales dataset (updated as of 2024) and the 2023 Steam Store dataset, our analysis aimed to understand the factors influencing video game sales, using models to predict sales from features such as critic scores, consoles, and genres. Through careful data preprocessing and machine learning models, namely K-Nearest Neighbors (KNN) and Random Forest, we evaluated model performance via cross-validation and learning curves.
Our findings reveal that the Random Forest model generally provided better performance and robustness than KNN, as indicated by its slightly higher log-scale R² on the held-out set and smoother learning curves, although the cross-validation RMSE values of the two models were nearly identical. This suggests that complex models capable of handling nonlinear relationships and interactions between features are more effective at capturing the dynamics of video game sales.
In conclusion, by applying these predictive models, stakeholders in the gaming industry can better anticipate sales outcomes, aiding in strategic decisions such as marketing and development. Future work may explore more sophisticated models or incorporate additional data such as user engagement metrics to further enhance predictive accuracy.
We also want to explore the relationship between just the critic scores and total sales of video games using the K-Nearest Neighbors (KNN) regression model. The KNN model is chosen for its simplicity and effectiveness in capturing non-linear relationships without the need for complex parameter tuning. By focusing solely on critic scores as predictors, this analysis seeks to quantify how well these scores can predict sales outcomes and to what extent they reflect market success.
In addition to the KNN model, a simple linear regression trend line is also plotted to provide a baseline visualization of the relationship between critic scores and sales. This approach not only complements the KNN model with a traditional regression analysis but also highlights the potential variability and prediction accuracy when using critic scores as a standalone metric.
# Selecting features and target
X = games[['critic_score']] # Using only critic_score as feature for simplicity
y = games['total_sales']
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Defining a pipeline for preprocessing and modeling
pipeline = Pipeline([
('scaler', StandardScaler()), # Feature scaling
('knn', KNeighborsRegressor(n_neighbors=5)) # KNN regressor
])
# Train the KNN model
pipeline.fit(X_train, y_train)
# Creating an example with a specific critic score to predict sales
example = pd.DataFrame({'critic_score': [8.5]})  # Example critic score of 8.5
# Predicting sales for the example
predicted_sales = pipeline.predict(example)
predicted_sales
array([2.916])
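The single prediction above is simply the mean of the sales of the five nearest training titles by critic score. A toy sketch (invented numbers, not the project data) makes the mechanics concrete; scaling a lone feature does not change neighbor ordering, so the unscaled version behaves the same way as the pipeline.

```python
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

# Toy critic scores and sales in millions (illustrative values only).
scores = np.array([[6.0], [7.0], [8.0], [8.4], [8.6], [9.0], [9.5]])
sales = np.array([0.2, 0.5, 1.0, 2.0, 3.0, 4.0, 6.0])

knn = KNeighborsRegressor(n_neighbors=5).fit(scores, sales)

# The five scores closest to 8.5 are 8.4, 8.6, 8.0, 9.0 and 9.5, so the
# prediction is the plain average of their sales: (2+3+1+4+6)/5 = 3.2.
pred = knn.predict([[8.5]])
print(pred[0])  # 3.2
```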
# Creating a scatter plot of critic_score vs total_sales
plt.figure(figsize=(10, 6))
plt.scatter(X_train['critic_score'], y_train, color='blue', alpha=0.5, label='Training data')
# Fitting a simple linear regression to add a trend line
slope, intercept = np.polyfit(X_train['critic_score'], y_train, 1)
trend_line = slope * X_train['critic_score'] + intercept
# Plot the trend line
plt.plot(X_train['critic_score'], trend_line, color='red', linewidth=2, label='Trend line')
# Adding the example point with predicted sales
plt.scatter([8.5], predicted_sales, color='red', s=100, label='Example prediction')
# Setting up the plot
plt.xlabel('Critic Score')
plt.ylabel('Total Sales (millions)')
plt.title('Critic Score vs. Total Sales with Trend Line')
plt.legend()
plt.grid(True)
plt.show()
The scatter plot of Critic Score vs. Total Sales with the trend line shows a general relationship between the critic scores of video games and their sales in millions. The data points indicate that while there's a positive correlation, the relationship isn't very strong, as evidenced by the spread of data points around the trend line. Most games, irrespective of high or moderate critic scores, tend to cluster towards the lower sales range, suggesting other factors might be influencing sales more significantly.
The trend line, plotted as a simple linear regression, suggests a modest upward trend, indicating that games with higher critic scores tend to achieve slightly higher sales. However, the vast spread of data points, especially at higher critic scores, highlights the variability in sales outcomes.
The example prediction, marked for a game with a critic score of 8.5, aligns close to the trend line but on the lower end of the sales spectrum compared to other games with similar scores. This suggests that while critic scores can provide a guide, they are not definitive predictors of sales success.
In this analysis, we explored the relationship between video game sales and critic scores using machine learning models, specifically K-Nearest Neighbors (KNN). We preprocessed the data to handle missing values and scale numerical inputs appropriately, ensuring our models could interpret the data effectively.
Our findings indicate that while there is a positive correlation between critic scores and game sales, the relationship is weak and highly variable. This suggests that other factors such as game genre, platform exclusivity, marketing, and external economic conditions might also play significant roles in influencing game sales.
The models used, including KNN and exploratory analysis via regression plotting, provided valuable insights but also highlighted the limitations of using critic scores alone to predict sales. Future analyses could benefit from integrating additional data points such as user reviews, social media sentiment, and detailed market conditions to create more robust predictive models.
This project underscores the complexity of the video game market and the multifaceted influences on sales, suggesting a need for comprehensive analytics approaches to fully understand and predict market behaviors.