NLP_GUI/main.py at main · nel417/NLP_GUI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import json
import random
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier

# Path of the review corpus; the module reads it line by line, parsing each
# line as a JSON object.
file_name = "books2.json"

# Shared spaCy pipeline used by the text-analysis helpers in this module.
en_core_web = spacy.load("en_core_web_sm")
# Register the 'spacytextblob' component; get_text_sentiment reads `doc._.blob`.
en_core_web.add_pipe('spacytextblob')


class Sentiment:
    """String labels for the three sentiment classes assigned to reviews."""

    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"


class Review:
    """A single review: its text, its numeric score, and the derived label."""

    def __init__(self, text, score):
        self.text, self.score = text, score
        # The label is computed once, at construction, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score to a label.

        A score of exactly 3 is neutral, a score <= 2 is negative, and every
        other score is positive.
        """
        rating = self.score
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.NEGATIVE if rating <= 2 else Sentiment.POSITIVE


class ReviewContainer:
    """Hold a list of reviews and expose their texts and labels."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every held review, in order."""
        return [x.text for x in self.reviews]

    def get_labels(self):
        """Return the sentiment label of every held review, in order."""
        return [x.sentiment for x in self.reviews]

    def even(self):
        """Rebalance to equal numbers of negative and positive reviews.

        Reviews with any other label (e.g. neutral) are dropped. The kept
        reviews replace ``self.reviews`` and are shuffled in place.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH classes to the smaller size. Trimming only the
        # positives left the set unbalanced whenever negatives outnumbered them.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)


# Load the corpus once at import time. Each non-blank line is parsed as a JSON
# object whose "reviewText" and "overall" fields become a Review.
reviews = []
# Pin UTF-8 so decoding does not depend on the platform's locale default.
with open(file_name, encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline); json.loads rejects them.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review["reviewText"], review["overall"]))


def sentiment_from_scratch(t):
    """Train classifiers on the loaded reviews and classify the text ``t``.

    The module-level ``reviews`` are split into train/test sets, each set is
    balanced with ``ReviewContainer.even()``, and the texts are vectorised
    with TF-IDF. A linear SVM and a decision tree are fitted; the mean
    accuracy of each on the test set is printed.

    Args:
        t: The text to classify.

    Returns:
        A tuple ``(output, return_score)``: ``output`` is the SVM's prediction
        for ``t`` (a one-element array), and ``return_score`` holds the SVM's
        per-label F1 scores on the test set, ordered POSITIVE, NEUTRAL,
        NEGATIVE.
    """
    # Function-scope import: f1_score is not among the module-level imports.
    from sklearn.metrics import f1_score

    train_reviews, test_reviews = train_test_split(reviews, test_size=0.33, random_state=42)
    container_train = ReviewContainer(train_reviews)
    container_test = ReviewContainer(test_reviews)

    container_train.even()
    x_train = container_train.get_text()
    y_train = container_train.get_labels()
    container_test.even()
    x_test = container_test.get_text()
    y_test = container_test.get_labels()

    # Fit the vocabulary on the training texts only, then reuse it for the
    # test texts and for the caller's input.
    vec = TfidfVectorizer()
    train_x_vec = vec.fit_transform(x_train)
    test_x_vec = vec.transform(x_test)

    # The discarded single-row predictions (test_x_vec[0], test_x_vec[8]) were
    # removed: they had no effect on the result and raised IndexError whenever
    # the balanced test set had fewer than nine rows.
    classify = svm.SVC(kernel='linear')
    classify.fit(train_x_vec, y_train)
    classify_dec = DecisionTreeClassifier()
    classify_dec.fit(train_x_vec, y_train)

    new_test = vec.transform([t])
    output = classify.predict(new_test)
    print(classify.score(test_x_vec, y_test))
    print(classify_dec.score(test_x_vec, y_test))
    # NOTE(review): even() keeps only negative and positive reviews, so the
    # NEUTRAL entry looks like it can never be non-zero; it is kept so the
    # returned array keeps its three-element shape.
    return_score = f1_score(y_test,
                            classify.predict(test_x_vec),
                            average=None,
                            labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

    return output, return_score


def get_text_characteristics(sentence):
    """Run the spaCy pipeline on ``sentence`` and describe every token.

    Returns:
        ``{"output": [...]}`` with one dict per token, in document order,
        carrying the token's index, text, tag, part of speech, dependency,
        lemma, shape, and its alpha / stop-word flags.
    """
    def describe(tok):
        # One row per token; the key order matches the original output.
        return {
            "Index": tok.i,
            "Token": tok.text,
            "Tag": tok.tag_,
            "POS": tok.pos_,
            "Dependency": tok.dep_,
            "Lemma": tok.lemma_,
            "Shape": tok.shape_,
            "Alpha": tok.is_alpha,
            "Is Stop Word": tok.is_stop,
        }

    return {"output": [describe(tok) for tok in en_core_web(sentence)]}


def get_entity(sentence):
    """Run the spaCy pipeline on ``sentence`` and list its entities.

    Returns:
        ``{"output": [...]}`` with one dict per item of ``doc.ents``, giving
        its text, start and end character offsets, and label.
    """
    entities = en_core_web(sentence).ents
    rows = [
        {
            "Text": ent.text,
            "Start Char": ent.start_char,
            "End Char": ent.end_char,
            "Label": ent.label_,
        }
        for ent in entities
    ]
    return {"output": rows}


def get_text_sentiment(sentence):
    """Score the sentiment of ``sentence`` using the spacytextblob extension.

    Returns:
        ``{"output": {...}}`` where the inner dict holds four single-element
        lists: "Score" (polarity rounded to two decimals), "Label"
        ("Positive" when the rounded score is > 0, otherwise "Negative"),
        "Positive words" and "Negative Words" (comma-separated, de-duplicated
        words in first-seen order).
    """
    document = en_core_web(sentence)
    blob = document._.blob

    sentiment = round(blob.polarity, 2)
    # NOTE(review): a rounded score of exactly 0 is labelled "Negative";
    # kept as-is because callers may rely on the two-label output.
    sent_label = "Positive" if sentiment > 0 else "Negative"

    positive_words = []
    negative_words = []
    # Each assessment is indexed as x[0] (a sequence whose first item is kept
    # as the word) and x[1] (the value whose sign picks the bucket) —
    # presumably (words, polarity, ...); confirm against TextBlob.
    for x in blob.sentiment_assessments.assessments:
        if x[1] > 0:
            positive_words.append(x[0][0])
        elif x[1] < 0:
            negative_words.append(x[0][0])

    # dict.fromkeys de-duplicates while keeping first-seen order. Joining a
    # set gave an order that could change between interpreter runs.
    output = {"Score": [sentiment], "Label": [sent_label],
              "Positive words": [', '.join(dict.fromkeys(positive_words))],
              "Negative Words": [', '.join(dict.fromkeys(negative_words))]}

    return {"output": output}