# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LinearRegression 
import numpy as np

# Load the dataset from the CSV file into a DataFrame
csv_path = "ransomware_data_file.csv"
df = pd.read_csv(csv_path)

# Report the dataset dimensions as (rows, columns)
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)

Dataset Shape: (62485, 18)

# Inspect column names, non-null counts and dtypes.
# df.info() writes its report to stdout and returns None, so the trailing
# print emits a literal "None" line after the report.
print("Dataset Info:")
info_result = df.info()
print(info_result)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62485 entries, 0 to 62484
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   FileName            62485 non-null  object
 1   md5Hash             62485 non-null  object
 2   Machine             62485 non-null  int64 
 3   DebugSize           62485 non-null  int64 
 4   DebugRVA            62485 non-null  int64 
 5   MajorImageVersion   62485 non-null  int64 
 6   MajorOSVersion      62485 non-null  int64 
 7   ExportRVA           62485 non-null  int64 
 8   ExportSize          62485 non-null  int64 
 9   IatVRA              62485 non-null  int64 
 10  MajorLinkerVersion  62485 non-null  int64 
 11  MinorLinkerVersion  62485 non-null  int64 
 12  NumberOfSections    62485 non-null  int64 
 13  SizeOfStackReserve  62485 non-null  int64 
 14  DllCharacteristics  62485 non-null  int64 
 15  ResourceSize        62485 non-null  int64 
 16  BitcoinAddresses    62485 non-null  int64 
 17  Benign              62485 non-null  int64 
dtypes: int64(16), object(2)
memory usage: 8.6+ MB
None

# Preview five randomly chosen rows of the dataset
df.sample(n=5)

# Count rows that are exact repeats of an earlier row
df.duplicated(keep="first").sum()

np.int64(0)

# Count the missing entries in every column
missing_per_column = df.isna().sum()
print("\nMissing Values in the Dataset:")
print(missing_per_column)

Missing Values in the Dataset:
FileName              0
md5Hash               0
Machine               0
DebugSize             0
DebugRVA              0
MajorImageVersion     0
MajorOSVersion        0
ExportRVA             0
ExportSize            0
IatVRA                0
MajorLinkerVersion    0
MinorLinkerVersion    0
NumberOfSections      0
SizeOfStackReserve    0
DllCharacteristics    0
ResourceSize          0
BitcoinAddresses      0
Benign                0
dtype: int64

# Discard the columns that are not used in the rest of the analysis.
# The drop happens in place, so `df` keeps referring to the same DataFrame.
unused_columns = [
    "DebugRVA",
    "MajorImageVersion",
    "MajorOSVersion",
    "ExportRVA",
    "ExportSize",
    "IatVRA",
    "MajorLinkerVersion",
    "MinorLinkerVersion",
    "NumberOfSections",
    "SizeOfStackReserve",
    "ResourceSize",
    "BitcoinAddresses",
]
df.drop(columns=unused_columns, inplace=True)

# Preview five random rows of the reduced dataset
df.sample(n=5)

# Number of samples in each class of the target column
df["Benign"].value_counts()

Benign
0    35367
1    27118
Name: count, dtype: int64

# Frequency of each distinct DebugSize value, most common first
df["DebugSize"].value_counts(sort=True, ascending=False)

DebugSize
0             36521
28            12719
84             8974
56             4087
112              89
1                55
536               9
257               6
32                3
30                3
512               3
140               2
1056              1
276               1
17                1
4                 1
135               1
10                1
1615155235        1
16                1
256               1
109               1
1236              1
102               1
49                1
1072              1
Name: count, dtype: int64

# Largest value in every remaining column (computed down the rows)
df.max(axis=0)

FileName                           ztrace_maps (3).dll
md5Hash               ffffad120581ecad433eb5a3b403b6ac
Machine                                          43620
DebugSize                                   1615155235
DllCharacteristics                               58632
Benign                                               1
dtype: object

# Pie chart of the class balance.
# value_counts() orders its result by frequency, so the wedge labels are
# looked up from the class codes (0 = ransomware, 1 = benign) instead of being
# hard-coded by position; a positional list would silently mislabel the wedges
# if the class balance ever flipped.
class_counts = df['Benign'].value_counts()
class_names = {0: 'Ransomware', 1: 'Benign'}
wedge_labels = [class_names.get(code, str(code)) for code in class_counts.index]
plt.pie(class_counts, labels=wedge_labels, autopct='%0.2f')

([<matplotlib.patches.Wedge at 0x24514e29460>,
  <matplotlib.patches.Wedge at 0x24514e320f0>],
 [Text(-0.22647549309674733, 1.0764333936786732, 'Ransomware'),
  Text(0.22647545888068688, -1.076433400877538, 'Benign')],
 [Text(-0.12353208714368034, 0.5871454874610944, '56.60'),
  Text(0.12353206848037465, -0.587145491387748, '43.40')])

# Bar chart of how many samples fall into each class of the target column
fig_size = (8, 6)
plt.figure(figsize=fig_size)
sns.countplot(data=df, x='Benign')
plt.xlabel('Label (0: Ransomware, 1: Benign)')
plt.ylabel('Count')
plt.title('Distribution of Ransomware vs Benign Samples')
plt.show()

# Relationship between DebugSize and the class label, drawn on a
# reproducible random half of the data
df_portion = df.sample(frac=0.5, random_state=50)
sns.jointplot(x="DebugSize", y="Benign", data=df_portion, kind="scatter", alpha=0.3)
plt.show()

# Relationship between DllCharacteristics and the class label, drawn on a
# second reproducible random half of the data (different seed)
df_portion = df.sample(frac=0.5, random_state=45)
sns.jointplot(x="DllCharacteristics", y="Benign", data=df_portion, kind="scatter")
plt.show()

# Restrict the frame to its numeric columns before computing correlations
numeric_df = df.select_dtypes(include="number")

# Heatmap of the pairwise correlations between the numeric columns
correlation_matrix = numeric_df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False, linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.show()

# Pairwise plots of the columns, coloured by the class label
sns.pairplot(data=df, hue='Benign')
plt.show()

# Separate the target column from the predictor columns
y = df['Benign']
X = df.drop(columns='Benign')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize the numeric features. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
train_numeric = X_train.select_dtypes(include=["number"])
test_numeric = X_test.select_dtypes(include=["number"])
scaler = StandardScaler()
scaler.fit(train_numeric)
X_train_scaled = scaler.transform(train_numeric)
X_test_scaled = scaler.transform(test_numeric)

# Project the standardized features onto two principal components for plotting
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Plain boolean arrays for selecting the rows of each class in the PCA output
is_ransomware = (y_train == 0).to_numpy()
is_benign = (y_train == 1).to_numpy()

plt.figure(figsize=(10, 8))
# The target column encodes 0 as ransomware and 1 as benign, so the legend
# entries must follow that mapping (they were previously swapped).
plt.scatter(X_train_pca[is_ransomware, 0], X_train_pca[is_ransomware, 1], label='Ransomware', alpha=0.6)
plt.scatter(X_train_pca[is_benign, 0], X_train_pca[is_benign, 1], label='Benign', alpha=0.6)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Ransomware Dataset')
plt.legend()
plt.show()

# Fit a random forest (fixed seed, otherwise library defaults) on the
# standardized training features
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X=X_train_scaled, y=y_train)

RandomForestClassifier(random_state=42)

RandomForestClassifier(random_state=42)

# Predict class labels for the standardized test features
RF_y_predictions = rf_model.predict(X_test_scaled)

# Plot the predicted labels against the true test labels
sns.scatterplot(x=RF_y_predictions, y=y_test)
plt.xlabel("Predictions")
plt.ylabel("Actual Test Values")
plt.title("Comparison of Predictions with True Values")
plt.show()

# Evaluate the RF classifier model.
# The reports use RF_y_predictions: no module-level `y_pred` exists at this
# point, so referring to it raised a NameError.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, RF_y_predictions))
print("\nClassification Report:")
print(classification_report(y_test, RF_y_predictions))

Confusion Matrix:
[[10446   232]
 [  191  7877]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10678
           1       0.97      0.98      0.97      8068

    accuracy                           0.98     18746
   macro avg       0.98      0.98      0.98     18746
weighted avg       0.98      0.98      0.98     18746

# Rank the numeric input columns by the importance scores of the fitted forest
features = X.select_dtypes(include=["number"]).columns
feature_importance = rf_model.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart, most important feature first
plt.figure(figsize=(12, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importance in Random Forest Classifier')
plt.show()

# Helper that fits a classifier object on the training data and scores it on the test data
def train_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Precision, recall and F1 use scikit-learn's default binary averaging, so
    the labels are expected to be binary.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``. It is fitted in place.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: True labels for the test features.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc)`` computed on the test
        split.
    """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # ROC AUC measures how well the model ranks samples, so it needs
    # continuous scores. Feeding it hard 0/1 predictions collapses the curve
    # to a single threshold. Prefer positive-class probabilities, then
    # decision values, and only fall back to the hard labels when the
    # estimator offers neither.
    if hasattr(clf, "predict_proba"):
        y_score = clf.predict_proba(X_test)[:, 1]
    elif hasattr(clf, "decision_function"):
        y_score = clf.decision_function(X_test)
    else:
        y_score = y_pred

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 instead of warning when a metric's denominator is empty
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_score)

    return accuracy, precision, recall, f1, roc

# Try the helper out: refit the random forest on the unscaled numeric
# columns and display the returned (accuracy, precision, recall, f1, roc) tuple
numeric_train = X_train.select_dtypes(include=["number"])
numeric_test = X_test.select_dtypes(include=["number"])
train_classifier(rf_model, numeric_train, y_train, numeric_test, y_test)

(0.9774351861730503,
 np.float64(0.9713898137871501),
 np.float64(0.9763262270699058),
 np.float64(0.9738517648513322),
 np.float64(0.9772996559586278))

# One running list per metric
accuracy_scores, precision_scores, recall_scores, f1_scores, roc_scores = [], [], [], [], []

# Refit the random forest on the unscaled numeric columns and score it
current_accuracy, current_precision, current_recall, current_f1, current_roc = train_classifier(
    rf_model,
    X_train.select_dtypes(include=["number"]),
    y_train,
    X_test.select_dtypes(include=["number"]),
    y_test,
)

# Print each metric on its own line, followed by a blank separator
for metric_label, metric_value in (
    ('Accuracy: ', current_accuracy),
    ('Precision: ', current_precision),
    ('Recall: ', current_recall),
    ('F1: ', current_f1),
    ('ROC: ', current_roc),
):
    print(metric_label, metric_value)
print('\n')

# Record this run's results in the running lists
for score_history, metric_value in (
    (accuracy_scores, current_accuracy),
    (precision_scores, current_precision),
    (recall_scores, current_recall),
    (f1_scores, current_f1),
    (roc_scores, current_roc),
):
    score_history.append(metric_value)

Accuracy:  0.9774351861730503
Precision:  0.9713898137871501
Recall:  0.9763262270699058
F1:  0.9738517648513322
ROC:  0.9772996559586278

# Collect the latest metric values into a two-column table for display and plotting
performance_df = pd.DataFrame({'Metric': ['Accuracy','Precision','Recall','F1','ROC'], 'Performance': [current_accuracy, current_precision, current_recall, current_f1, current_roc]})

performance_df

# Bar chart of the metrics, one colour per bar.
# seaborn deprecates `palette` without `hue`, so the x variable is also passed
# as `hue`, with the legend suppressed because it would only repeat the x-axis labels.
sns.catplot(x='Metric', y='Performance', hue='Metric', data=performance_df, kind='bar', palette='husl', legend=False, height=5)

# Start the y-axis at 0.5 so differences between the metrics are visible
plt.ylim(0.5, 1.0)
plt.xticks(rotation='vertical')
plt.show()

C:\Users\HP\AppData\Local\Temp\ipykernel_12324\1010139633.py:1: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.catplot(x = 'Metric', y = 'Performance', data = performance_df, kind = 'bar', palette='husl', height = 5)

	FileName	md5Hash	Machine	DebugSize	DebugRVA	MajorImageVersion	MajorOSVersion	ExportRVA	ExportSize	IatVRA	MajorLinkerVersion	MinorLinkerVersion	NumberOfSections	SizeOfStackReserve	DllCharacteristics	ResourceSize	Benign
51895	VirusShare_7e4483c7432a30d7df8b8b09fb692694	7e4483c7432a30d7df8b8b09fb692694	332	28	37248	0	5	0	0	36864	9	0	4	1048576	32768	191772	0
19218	KBDTIFI2 (3).DLL	5c2c14f2cf5ad00a8a7986fddd44c8e9	34404	84	11856	10	10	11760	84	0	14	10	4	262144	16736	1056	1
50115	VirusShare_4090ce6f23e946930cfbabffbb29325e	4090ce6f23e946930cfbabffbb29325e	332	0	0	0	5	0	0	253952	9	0	10	1048576	0	5640	0
10309	Microsoft.TeamFoundation.VersionControl.Common...	04b301e4b15c186c069930c488f92b7a	332	0	0	0	4	0	0	8192	11	0	3	1048576	34112	1464	1
44706	VirusShare_af4c9a5042f42a05f39751ab11ecd6b0	af4c9a5042f42a05f39751ab11ecd6b0	332	56	44194	5	5	0	0	4096	6	20	4	262144	34816	37160	0

	FileName	md5Hash	Machine	DebugSize	DllCharacteristics	Benign
13774	Microsoft.AnalysisServices.Common.Wizard.resou...	8f364da7eae118a585868dd4e90975ed	332	0	34112	1
34382	VirusShare_dc25a615a33101aa2904ec96a64ba722	dc25a615a33101aa2904ec96a64ba722	332	0	0	0
183	git-http-backend.exe	9b393204a12178a5dac65e0f7e63735f	34404	28	320	1
40991	VirusShare_ee2272af249d12a0bb5e5de6cdc971f0	ee2272af249d12a0bb5e5de6cdc971f0	332	0	0	0
9454	Microsoft.CodeAnalysis.VisualBasic.EditorFeatu...	8fe0cc53b9f1c49c2aab19c7d9d20618	332	56	34144	1

Ransomware Detection with AI

Author: David Ezeani

	Metric	Performance
0	Accuracy	0.977435
1	Precision	0.971390
2	Recall	0.976326
3	F1	0.973852
4	ROC	0.977300