-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_handler.py
More file actions
70 lines (55 loc) · 2.37 KB
/
data_handler.py
File metadata and controls
70 lines (55 loc) · 2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""Data handling utilities for CSV processing and batch analysis."""
from typing import Tuple, Optional, Any
import pandas as pd
from nlp_engine import classify_intent, analyze_sentiment
def process_batch(
    data_file: Optional[Any],
    target_column: str,
    threshold: float,
    output_path: str = "processed_comments.csv",
) -> Tuple[pd.DataFrame, str]:
    """Annotate one CSV column with intent and sentiment results.

    Each row's text is passed to ``classify_intent`` and
    ``analyze_sentiment``; their results are stored in six new columns and
    the annotated frame is written back out as CSV.

    Args:
        data_file: Source of the CSV. Either an object whose ``name``
            attribute is handed to ``pd.read_csv``, or anything
            ``pd.read_csv`` accepts directly (e.g. a path string).
            ``None`` means nothing was supplied.
        target_column: Name of the column holding the text to analyse.
        threshold: Forwarded unchanged to ``classify_intent``.
        output_path: Destination of the annotated CSV.

    Returns:
        ``(data, output_path)``: the annotated DataFrame and the path the
        CSV was written to.

    Raises:
        ValueError: If ``data_file`` is ``None``, the CSV is malformed or
            has no data rows, or ``target_column`` is not a column.
    """
    if data_file is None:
        raise ValueError("No CSV file uploaded")

    # Keep honouring objects that expose ``.name``; fall back to the value
    # itself so a bare path no longer fails with AttributeError.
    csv_source = getattr(data_file, "name", data_file)

    try:
        data = pd.read_csv(csv_source)
    except pd.errors.EmptyDataError as e:
        # A zero-byte file raises before ``data.empty`` can be checked;
        # report it with the same message as a header-only file.
        raise ValueError("CSV file is empty") from e
    except pd.errors.ParserError as e:
        raise ValueError(f"Malformed CSV: {e}") from e

    if data.empty:
        raise ValueError("CSV file is empty")

    if target_column not in data.columns:
        available = ', '.join(map(str, data.columns))
        raise ValueError(
            f"Column '{target_column}' not found. Available: {available}"
        )

    # Insertion order here fixes the order of the new columns in the output.
    column_defaults = {
        'Sentiment_Score': 0.0,
        'Predicted_Intent': "",
        'Confidence': 0.0,
        'Positive_Score': 0.0,
        'Neutral_Score': 0.0,
        'Negative_Score': 0.0,
    }
    for column, default in column_defaults.items():
        data[column] = default

    for idx, raw in data[target_column].items():
        comment = str(raw)
        if pd.isna(raw) or not comment.strip():
            data.at[idx, 'Predicted_Intent'] = "INVALID_INPUT"
            continue

        try:
            intent = classify_intent(comment, threshold)
            sentiment = analyze_sentiment(comment)
            results = {
                'Predicted_Intent': intent['top_intent'],
                'Confidence': intent['top_confidence'],
                'Sentiment_Score': sentiment['compound'],
                'Positive_Score': sentiment['positive'],
                'Neutral_Score': sentiment['neutral'],
                'Negative_Score': sentiment['negative'],
            }
        except Exception as e:
            # Best effort per row: one failing comment must not abort the
            # batch, so the failure is recorded in the row instead.
            error_msg = str(e)[:50]
            data.at[idx, 'Predicted_Intent'] = f"ERROR: {error_msg}"
            continue

        # Written only after both analyses succeeded, so an ERROR row never
        # keeps a stale non-zero Confidence from a half-finished update.
        for column, value in results.items():
            data.at[idx, column] = value

    data.to_csv(output_path, index=False)
    return data, output_path
def get_batch_stats(data: pd.DataFrame) -> Tuple[int, int, float]:
    """Summarise a processed batch.

    Args:
        data: Frame with ``Predicted_Intent`` and ``Confidence`` columns.

    Returns:
        ``(total, valid, avg_conf)`` where ``total`` is the row count,
        ``valid`` is the number of rows whose ``Predicted_Intent`` is not
        ``'INVALID_INPUT'``, and ``avg_conf`` is the mean of the
        ``Confidence`` values greater than zero (``0.0`` when there are
        none).
    """
    total = len(data)
    valid = int((data['Predicted_Intent'] != 'INVALID_INPUT').sum())

    confident = data.loc[data['Confidence'] > 0, 'Confidence']
    # The mean of an empty selection is NaN; report 0.0 so callers always
    # receive a usable number.
    avg_conf = 0.0 if confident.empty else float(confident.mean())

    return total, valid, avg_conf