-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_preprocessing.py
More file actions
32 lines (24 loc) · 1.05 KB
/
data_preprocessing.py
File metadata and controls
32 lines (24 loc) · 1.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Titanic: Machine Learning from Disaster
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# --- Load ---------------------------------------------------------------
# Read the training split from the working directory and preview its
# first rows so the raw layout is visible in the console output.
df = pd.read_csv("train.csv")
print("Original Dataset Loaded:")
print(df.head())

# --- Step 1: discard the sparsely populated column ----------------------
# 'Cabin' is removed outright rather than imputed (too many missing values).
df = df.drop(columns="Cabin")

# --- Step 2: impute the remaining gaps -----------------------------------
# Age is filled with the column mean, Embarked with its most frequent
# value. Both statistics are computed before any value is filled in.
age_fill = df["Age"].mean()
embarked_fill = df["Embarked"].mode()[0]
df = df.fillna({"Age": age_fill, "Embarked": embarked_fill})

# --- Step 3: remove columns not used for preprocessing or modeling ------
df = df.drop(columns=["PassengerId", "Name", "Ticket"])

# --- Step 4: encode the categorical columns as integers -----------------
# Any value absent from a code table becomes NaN (Series.map semantics).
category_codes = {
    "Sex": {"male": 0, "female": 1},
    "Embarked": {"S": 0, "C": 1, "Q": 2},
}
for column, codes in category_codes.items():
    df[column] = df[column].map(codes)

# --- Step 5: rescale the numeric features -------------------------------
# A MinMaxScaler with default settings is fitted on, and applied to, the
# same four columns; the scaled values overwrite the originals.
numeric_cols = ["Age", "Fare", "SibSp", "Parch"]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values

# --- Step 6: persist the cleaned dataset --------------------------------
# The row index is not written to the output file.
df.to_csv("titanic_cleaned.csv", index=False)
print("Cleaned dataset saved as 'titanic_cleaned.csv'.")