-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhelpers.py
More file actions
71 lines (58 loc) · 2.19 KB
/
helpers.py
File metadata and controls
71 lines (58 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding: utf-8 -*-
import csv
import numpy as np
import datetime
import fasttext
def read_test(path):
    """Read the test set, where each line has the form ``<id>,<text>``.

    Args:
        path: path to the test file.

    Returns:
        (ids, texts): two parallel lists — the id prefix of each line (as a
        string) and the remainder of the line (newline preserved).
    """
    ids = []
    texts = []
    with open(path) as fp:
        for line in fp:
            # Split only on the first comma: the text itself may contain commas.
            line_id, text = line.split(',', 1)
            ids.append(line_id)
            texts.append(text)
    return ids, texts
def read_train(path):
    """Return the raw training file at *path* as a list of lines.

    Each element keeps its trailing newline, exactly as ``readlines`` yields it.
    """
    with open(path) as handle:
        return handle.readlines()
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle

    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    # newline='' is required by the csv module: without it the writer emits
    # an extra blank row between records on Windows (\r\r\n line endings).
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id': int(r1), 'Prediction': int(r2)})
def calculate_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The input is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))
def batch_accuracy(preds, labels):
    """Return the fraction of rows of *preds* whose argmax equals *labels*.

    *preds* is a 2-D score array (one row per sample); *labels* is flattened
    before comparison.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    actual = np.ravel(labels)
    # mean of a boolean array == (# correct) / (# samples)
    return np.mean(predicted == actual)
def compute_word_embedding(model, data, dimension, vocabulary):
    """Embed each sentence as the average of its in-vocabulary word vectors.

    Words are obtained by splitting on single spaces; words outside
    *vocabulary* are skipped. A sentence with no in-vocabulary words keeps
    an all-zero row.
    """
    embeddings = np.zeros((len(data), dimension))
    for row, sentence in enumerate(data):
        known = [w for w in sentence.split(' ') if w in vocabulary]
        if not known:
            continue  # leave the zero row for out-of-vocabulary sentences
        acc = np.zeros(dimension)
        for w in known:
            acc = acc + model[w]  # sequential left-to-right accumulation
        embeddings[row] = acc / len(known)
    return embeddings