import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
import joblib

# Load the venue booking records from the CSV export into a DataFrame
DATA_FILE = 'unilag_venue_booking_data.csv'
df = pd.read_csv(DATA_FILE)

# (rows, columns) of the loaded frame
df.shape

(1500, 9)

# Preview the first rows of the dataset
df.head()

# Column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   event_type      1500 non-null   object
 1   hall_name       1500 non-null   object
 2   guest_count     1500 non-null   int64 
 3   cost_of_hall    1500 non-null   int64 
 4   rating          1500 non-null   int64 
 5   booking_status  1500 non-null   object
 6   date            1500 non-null   object
 7   time            1500 non-null   object
 8   amenities       1500 non-null   object
dtypes: int64(3), object(6)
memory usage: 105.6+ KB

# Checking for NaN values: count of missing entries per column
df.isna().sum()

event_type        0
hall_name         0
guest_count       0
cost_of_hall      0
rating            0
booking_status    0
date              0
time              0
amenities         0
dtype: int64

# Checking for duplicates: number of rows that exactly repeat an earlier row
df.duplicated().sum()

np.int64(0)

# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

# Keep only the numeric columns so that pairwise correlations can be computed
numeric_fields_df = df.select_dtypes(include = ["number"])

# Pairwise correlation between the numeric columns
numeric_fields_df.corr()

# Number of rows per event type
df["event_type"].value_counts()

event_type
Seminar            321
Workshop           204
Concert            197
Religious Event    195
Stage Play         178
Tech Talk          178
Conference         177
Birthday            50
Name: count, dtype: int64

# Number of rows per booking status
df["booking_status"].value_counts()

booking_status
Success        1335
Rescheduled     138
Cancelled        27
Name: count, dtype: int64

# Pie chart of how the bookings are distributed across status values
status_counts = df["booking_status"].value_counts()
status_counts.plot(kind="pie")
plt.title("Booking status")
plt.ylabel("")  # blank out the y-axis label
plt.show()

# Number of rows per hall
df["hall_name"].value_counts()

hall_name
Jelili Omotola Hall B                        148
Tolu Odugbemi                                146
Faculty of Social Science Lecture Theatre    146
Julius Berger Hall                           138
Jelili Omotola Hall A                        138
Faculty of Arts Theatre                      137
Rahaman Bello Lecture Theatre                135
Jelili Omotola Hall C                        132
Afe Babalola                                 129
Tayo Aderinokun                              129
J.F. Ade Ajayi Auditorium                    122
Name: count, dtype: int64

# Pie chart of how the bookings are distributed across halls
hall_counts = df["hall_name"].value_counts()
hall_counts.plot(kind="pie")
plt.title("Booked Halls")
plt.ylabel("")  # blank out the y-axis label
plt.show()

# Mean rating for each hall
df.groupby("hall_name")["rating"].mean()

hall_name
Afe Babalola                                 3.124031
Faculty of Arts Theatre                      3.321168
Faculty of Social Science Lecture Theatre    3.630137
J.F. Ade Ajayi Auditorium                    4.000000
Jelili Omotola Hall A                        3.521739
Jelili Omotola Hall B                        3.391892
Jelili Omotola Hall C                        3.590909
Julius Berger Hall                           3.318841
Rahaman Bello Lecture Theatre                3.407407
Tayo Aderinokun                              3.503876
Tolu Odugbemi                                3.253425
Name: rating, dtype: float64

# Encode categorical variables.
# Event type and hall name each get their own LabelEncoder so the two
# mappings can be saved and inverted independently.
le_event = LabelEncoder()
le_hall = LabelEncoder()
mlb_amenities = MultiLabelBinarizer()

df['event_type_encoded'] = le_event.fit_transform(df['event_type'])
df['hall_name_encoded'] = le_hall.fit_transform(df['hall_name'])

# Amenities encoding. The raw values are comma-separated with a space after
# each comma (e.g. "AC, Generator"), so every token is stripped; without the
# strip, "AC" and " AC" would be learned as two different amenities and the
# feature matrix would contain duplicate, inconsistent columns. Empty tokens
# (e.g. from a trailing comma) are dropped.
df['amenities_list'] = df['amenities'].apply(
    lambda x: [item.strip() for item in x.split(',') if item.strip()]
)
amenities_encoded = mlb_amenities.fit_transform(df['amenities_list'])

# Add the amenities binary columns to the dataframe. Reusing df.index keeps
# the concat row-aligned even when df does not carry a default RangeIndex
# (e.g. after rows have been filtered out).
amenities_df = pd.DataFrame(
    amenities_encoded, columns=mlb_amenities.classes_, index=df.index
)
df = pd.concat([df, amenities_df], axis=1)

# Features (X) and Target (y): the model predicts the encoded hall from the
# event type, guest count, cost, rating and one binary column per amenity.
# NOTE(review): cost_of_hall looks constant per hall in the sample rows
# (every "Jelili Omotola Hall B" row shows 211963), which would leak the
# target into the features — confirm before trusting the reported accuracy.
X = df[['event_type_encoded', 'guest_count', 'cost_of_hall', 'rating'] + list(mlb_amenities.classes_)]
y = df['hall_name_encoded']

# Train-test split: 20% of the rows are held out, with a fixed seed so the
# split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Model: random forest with 100 trees and a fixed seed
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

RandomForestClassifier(random_state=42)

# Predict the encoded hall for every held-out row
y_pred = model.predict(X_test)

# Accuracy: fraction of held-out rows whose hall was predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.81

# Classification Report
# Pass the full, explicit label set taken from the fitted encoder. Deriving
# the names from y_test alone breaks when the model predicts a hall that is
# absent from the test split: scikit-learn then infers more labels than there
# are target names and raises ValueError. LabelEncoder maps classes_[i] to
# the integer i, so the positions below line up with the names.
report_labels = list(range(len(le_hall.classes_)))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=report_labels, target_names=le_hall.classes_))

Classification Report:
                                           precision    recall  f1-score   support

                             Afe Babalola       0.33      0.44      0.38        25
                  Faculty of Arts Theatre       0.36      0.40      0.38        25
Faculty of Social Science Lecture Theatre       1.00      1.00      1.00        22
                J.F. Ade Ajayi Auditorium       0.90      1.00      0.95        28
                    Jelili Omotola Hall A       1.00      0.94      0.97        32
                    Jelili Omotola Hall B       1.00      1.00      1.00        22
                    Jelili Omotola Hall C       1.00      1.00      1.00        28
                       Julius Berger Hall       0.42      0.29      0.34        35
            Rahaman Bello Lecture Theatre       1.00      1.00      1.00        26
                          Tayo Aderinokun       1.00      0.96      0.98        27
                            Tolu Odugbemi       1.00      1.00      1.00        30

                                 accuracy                           0.81       300
                                macro avg       0.82      0.82      0.82       300
                             weighted avg       0.81      0.81      0.81       300

# Confusion Matrix
# Fix the row/column order to the encoder's full class list so the tick labels
# always match the matrix. Labels taken from y_test alone would be misaligned
# whenever the model predicts a hall that does not occur in the test split.
# LabelEncoder maps classes_[i] to the integer i.
hall_tick_labels = le_hall.classes_
conf_matrix = confusion_matrix(
    y_test, y_pred, labels=list(range(len(hall_tick_labels)))
)
plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=hall_tick_labels,
            yticklabels=hall_tick_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# Save the trained random forest model to disk with joblib
joblib.dump(model, 'hall_recommender.pkl')

['hall_recommender.pkl']

# Save the label encoder that was fitted on the event_type column
joblib.dump(le_event, 'le_event.pkl')

['le_event.pkl']

# Save the label encoder that was fitted on the hall_name column (the target)
joblib.dump(le_hall, 'le_hall.pkl')

['le_hall.pkl']

# Save the multi-label binarizer that was fitted on the amenities lists
joblib.dump(mlb_amenities, 'mlb_amenities.pkl')

['mlb_amenities.pkl']

	event_type	hall_name	guest_count	cost_of_hall	rating	booking_status	date	time	amenities
0	Stage Play	Jelili Omotola Hall B	691	211963	4	Success	3/4/2023	19:00	Sound System, Stage, Generator, AC
1	Workshop	J.F. Ade Ajayi Auditorium	766	201250	5	Success	7/16/2024	9:00	AC, Generator
2	Workshop	Jelili Omotola Hall B	112	211963	3	Success	1/31/2021	8:00	Sound System, AC, Stage
3	Religious Event	Jelili Omotola Hall B	158	211963	4	Success	1/5/2022	8:00	Parking, Lighting, Projector
4	Religious Event	Faculty of Arts Theatre	916	93000	5	Success	10/8/2024	16:00	WiFi, AC, Lighting, Parking

	guest_count	cost_of_hall	rating
count	1500.000000	1500.000000	1500.000000
mean	511.988000	161321.065333	3.350000
std	285.329251	49154.441639	1.470014
min	21.000000	93000.000000	1.000000
25%	261.000000	93000.000000	2.000000
50%	505.000000	176250.000000	4.000000
75%	767.000000	210000.000000	5.000000
max	1000.000000	215375.000000	5.000000

	guest_count	cost_of_hall	rating
guest_count	1.000000	-0.005847	0.020118
cost_of_hall	-0.005847	1.000000	0.088026
rating	0.020118	0.088026	1.000000

AI-Powered Hall Recommender

Train Model