-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdata_validations.py
More file actions
97 lines (95 loc) · 2.64 KB
/
data_validations.py
File metadata and controls
97 lines (95 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pandas as pd
import pandera.pandas as pa
from pandera import Check
# Validation schema for a DataFrame of officer records with complaint totals.
# strict=False: columns not listed here are allowed and left unvalidated.
df1_schema = pa.DataFrameSchema(
    {
        # Snapshot date. The column is validated as a datetime; coerce=True
        # converts incoming values (e.g. "YYYY-MM-DD" strings) before checking.
        "As Of Date": pa.Column(
            pa.DateTime,
            checks=[
                # Use a callable so "today" is read when validate() runs
                # instead of being frozen at import time.
                Check(
                    lambda dates: dates <= pd.Timestamp.today(),
                    error="As Of Date must not be in the future",
                ),
            ],
            nullable=False,
            coerce=True,
        ),
        "Tax ID": pa.Column(str, nullable=True),
        "Shield No": pa.Column(str, nullable=True),
        "Officer First Name": pa.Column(
            str,
            checks=[Check.str_length(0, 100)],
            nullable=True,
        ),
        "Officer Last Name": pa.Column(
            str,
            checks=[Check.str_length(0, 100)],
            nullable=True,
        ),
        "Active Per Last Reported Status": pa.Column(str, nullable=True),
        "Last Reported Active Date": pa.Column(str, nullable=True),
        # The following descriptive fields must be present and non-empty.
        "Officer Race": pa.Column(
            str,
            checks=[Check.str_length(min_value=1)],
            nullable=False,
        ),
        "Officer Gender": pa.Column(
            str,
            checks=[Check.str_length(min_value=1)],
            nullable=False,
        ),
        "Current Rank Abbreviation": pa.Column(str, nullable=True),
        "Current Rank": pa.Column(
            str,
            checks=[Check.str_length(min_value=1)],
            nullable=False,
        ),
        "Current Command": pa.Column(
            str,
            checks=[Check.str_length(min_value=1)],
            nullable=False,
        ),
        "Total Complaints": pa.Column(
            int,
            checks=[
                Check.ge(0),
                Check.le(1000),  # upper bound
            ],
            nullable=False,
        ),
        "Total Substantiated Complaints": pa.Column(
            int,
            checks=[
                Check.ge(0),
                Check.le(1000),
            ],
            nullable=False,
        ),
    },
    checks=[
        # Cross-column rule. A column-level check only sees its own values,
        # so comparing against another column has to be a DataFrame-level
        # check that receives the whole frame.
        Check(
            lambda df: (
                df["Total Substantiated Complaints"] <= df["Total Complaints"]
            ),
            error=(
                "Total Substantiated Complaints must not exceed "
                "Total Complaints"
            ),
        ),
    ],
    strict=False,
)
# Validation schema for a DataFrame of per-precinct crime counts.
# Columns other than the three declared below are tolerated (strict=False).
df2_schema = pa.DataFrameSchema(
    columns={
        # Integer precinct number, bounded to the range 1..200 inclusive.
        "precinct": pa.Column(
            dtype=int,
            checks=[
                Check.greater_than_or_equal_to(1),
                Check.less_than_or_equal_to(200),  # safe upper bound
            ],
            nullable=False,
        ),
        # Non-negative count; coerced to float since to_numeric may
        # produce float values.
        "crime_count": pa.Column(
            dtype=float,
            checks=Check.greater_than_or_equal_to(0),
            nullable=False,
            coerce=True,
        ),
        # Display label of the form "Precinct <digits>".
        "Precinct Name": pa.Column(
            dtype=str,
            checks=Check.str_matches(r"^Precinct \d+$"),
            nullable=False,
        ),
    },
    strict=False,
)