-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel.py
More file actions
76 lines (61 loc) · 2.81 KB
/
model.py
File metadata and controls
76 lines (61 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
import numpy as np
import pathlib as pl
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from joblib import dump
path = pl.Path (f"archive")
df_list = []
for file in path.iterdir():
df_list.append(pd.read_csv(file))
dfraw = pd.concat(df_list, ignore_index=True)
dfraw.drop (dfraw.iloc [:, 0 : 29], axis = 1, inplace = True) # until latitude
dfraw.drop (dfraw.iloc [:, 11 :], axis = 1, inplace = True)
dfraw.drop (dfraw.iloc [:, 2 : 4], axis = 1, inplace = True) # just lat long
dfraw['bathrooms'] = dfraw['bathrooms_text'].str.extract("(\d*\.?\d+)", expand=False)
dfraw['bathrooms'] = np.where(dfraw['bathrooms_text'].str.contains("half", case=False, na=False), 0.5, dfraw['bathrooms'])
dfraw = dfraw.dropna() # drop NaN values
dfraw["number_of_guests"] = dfraw["accommodates"]
df = dfraw.drop(["accommodates", "bathrooms_text"], axis=1)
df ["price"] = df ["price"].str.replace ("$", "", regex = False)
df ["price"] = df ["price"].str.replace (",", "", regex = False)
df ["price"] = df ["price"].astype (np.float32, copy = False)
df ["price"] = df ["price"].astype (np.int32, copy = False)
df ["number_of_guests"] = df ["number_of_guests"].astype (np.int32, copy = False)
df ["bedrooms"] = df ["bedrooms"].astype (np.int32, copy = False)
df ["beds"] = df ["beds"].astype (np.int32, copy = False)
df ["bathrooms"] = df ["bathrooms"].astype (np.float32, copy = False)
df ["latitude"] = df ["latitude"].astype (np.float32, copy = False)
df ["longitude"] = df ["longitude"].astype (np.float32, copy = False)
def outlier (col):
q1 = col.quantile (0.25)
q3 = col.quantile (0.75)
iqr = q3 - q1
lenght = 1.5
return q1 - lenght * iqr, q3 + lenght * iqr
def remove (ds, col):
before = ds.shape [0]
low, up = outlier (ds [col])
ds = ds.loc [(ds [col] >= low) & (ds [col] <= up), :]
return ds, before - ds.shape [0]
df, removed = remove (df, "price")
df ["num_amenities"] = df ["amenities"].str.split (",").apply (len)
df ["num_amenities"] = df ["num_amenities"].astype (np.int32, copy = False)
dfenc = df.drop ("amenities", axis = 1)
def evaluate (ytest, prediction):
rmse = np.sqrt (mean_squared_error (ytest, prediction))
mae = mean_absolute_error (ytest, prediction)
r2 = r2_score (ytest, prediction)
return f"\n-----\nModel: Random Forest \nRMSE: {rmse:.2f}\nMAE: {mae:.2f}\nR2: {r2:.2%}\n-----\n"
y = dfenc ["price"]
x = dfenc.drop ("price", axis = 1)
xtrain, xtest, ytrain, ytest = train_test_split (x, y)
model = RandomForestRegressor()
print("Training model...")
model.fit (xtrain, ytrain)
print("Model training done!")
dump(model, "model.joblib")
if __name__ == "__main__":
prediction = model.predict (xtest)
print (evaluate (ytest, prediction))