-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathclean_utils.py
More file actions
112 lines (97 loc) · 3.41 KB
/
clean_utils.py
File metadata and controls
112 lines (97 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/python
# coding=utf-8
import re
import sys
import mistune
# Python 2-only workaround: reloading ``sys`` makes ``setdefaultencoding``
# callable again so the process-wide default codec for implicit
# str <-> unicode conversions can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
# Maximum number of characters extract_markdown keeps from its fallback
# excerpt before appending ' ...'.
MAX_STRING_LENGTH = 200
def remove_non_ascii_1(text):
    """Return ``text`` with every non-ASCII character removed.

    A character is kept only when its ordinal is below 128; the surviving
    characters are concatenated in their original order.
    """
    ascii_only = [ch for ch in text if ord(ch) <= 127]
    return ''.join(ascii_only)
def clean(text):
    """Reduce free text to plain ASCII words separated by single spaces.

    Processing order:

    1. ``None`` or ``''`` short-circuits to ``''``.
    2. Non-ASCII characters are dropped via ``remove_non_ascii_1``.
    3. ``<;`` and anything that looks like a tag (``<...>``, on one line)
       is replaced by a space.
    4. The punctuation ``[ ] " ! # { } | = $ ( ) < > . -`` becomes a space,
       ``/`` is deleted outright, and ``*`` / ``:`` become ``' : '``.
    5. Runs of spaces collapse to one space and runs of newlines to one
       newline; the result is stripped.

    The function is best-effort: any exception raised while cleaning is
    printed together with ``'ERROR OF clean'`` and whatever ``text`` holds at
    that point is stripped and returned.
    """
    try:
        if text is None or text == '':
            return ''
        text = remove_non_ascii_1(text)

        # NOTE(review): the reviewed copy started with two substitutions that
        # replaced '<' with '<' and '>' with '>' (no effect as written, so
        # they are omitted). They look like HTML-entity decoding
        # (&lt; / &gt;) whose escaping was lost - confirm against upstream.
        text = re.sub(r'<;', ' ', text)

        # Tag-like spans go first, while '<', '/' and '>' are still present.
        text = re.sub(r'<(.+?)/(.+?)>', ' ', text)
        text = re.sub(r'<(.+?)>', ' ', text)

        # Single-character rules are independent of each other, so one
        # character class replaces the former one-regex-per-character chain.
        text = re.sub(r'[\[\]"!#{}|=$()<>.\-]', ' ', text)
        text = text.replace('/', '')

        # '*' used to become ' : ' and then have its ':' padded again; after
        # the space collapse below that is identical to padding both once.
        text = re.sub(r'[*:]', ' : ', text)

        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n+', '\n', text)
    except Exception as e:
        print(e)
        print('ERROR OF clean')
    return text.strip()
def camel_case_split(text):
    """Split camelCase / PascalCase runs in ``text`` into lower-case words.

    A word boundary is recognised between a lower-case letter and the
    following upper-case letter (``fooBar`` -> ``foo bar``) and before the
    last capital of an upper-case run that is followed by a lower-case
    letter (``XMLParser`` -> ``xml parser``). The pieces are lower-cased and
    joined with single spaces. An empty string yields ``''``.

    Newline characters never appear in the result; they act as separators
    between pieces.
    """
    # re.MULTILINE lets '$' also end a piece right before a newline. Without
    # it the piece preceding every interior newline found no valid end
    # ('.' cannot cross '\n') and was silently dropped from the output.
    boundary = re.compile(
        r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)',
        flags=re.MULTILINE
    )
    return ' '.join(m.group(0).lower() for m in boundary.finditer(text))
def data_prepare_clean(text):
    """Clean ``text`` and expand camelCase tokens into lower-case words.

    The text is passed through ``clean``, split on single spaces, each token
    is run through ``camel_case_split``, and the results are joined back
    together with single spaces.
    """
    tokens = clean(text).split(' ')
    return ' '.join(camel_case_split(token) for token in tokens)
def base64_to_utf8(file):
    """Decode a Base64 file and return its content as UTF-8 encoded bytes.

    Args:
        file: Path of the file holding the Base64 text.

    Returns:
        The decoded payload as a byte string (``str`` on Python 2, ``bytes``
        on Python 3).

    Raises:
        binascii.Error: If the file content is not valid Base64.
        UnicodeDecodeError: If the decoded payload is not valid UTF-8.
    """
    # Function-scope import keeps the module's import block untouched.
    import binascii

    # Base64 is plain ASCII, so reading raw bytes avoids any dependence on
    # the platform's default text encoding.
    with open(file, 'rb') as f:
        payload = f.read()

    # a2b_base64 accepts multi-line input and works on Python 2 and 3 alike,
    # unlike the Python 2-only ``str.decode('base64')`` codec.
    decoded = binascii.a2b_base64(payload)

    # Round-trip through text so that invalid UTF-8 is rejected here instead
    # of being handed back to the caller unchecked.
    return decoded.decode('utf-8').encode('utf-8')
def readme_clean(text):
    """Strip URLs from README text and normalise the remainder.

    URLs (``http://``, ``https://`` or ``www.`` prefixed) are deleted first,
    the rest is passed through ``clean``, any remaining ``<.../...>`` span is
    deleted, and commas are replaced by spaces.

    Args:
        text: README content as a string.

    Returns:
        The cleaned text.
    """
    # Raw strings joined by implicit concatenation: no invalid escape
    # sequences and no backslash-newline hidden inside a literal.
    # NOTE(review): in the third alternative the '.' after the single
    # alphanumeric is unescaped (matches any character). The earlier literal
    # ended its line with a backslash right before that '.', so the escape
    # was consumed as a line continuation. The effective pattern is kept
    # unchanged here - confirm whether a literal '\.' was intended.
    url = (
        r'(https?://(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|https?://(?:www\.|(?!www))[a-zA-Z0-9].[^\s]{2,}'
        r'|www\.[a-zA-Z0-9]\.[^\s]{2,})'
    )
    text = re.sub(url, '', text)
    text = clean(text)

    # Safety net for tag-like spans still present after clean(), e.g. when
    # clean() bailed out early through its error path.
    xml = r'<(.+?)/(.+?)>'
    text = re.sub(xml, '', text)

    return text.replace(',', ' ')
def extract_markdown(text):
    """Get a short description from the content of a README file.

    The text is first normalised with ``readme_clean``. Then, in order:

    1. If more than ten ``#...#`` delimited spans are found, the first one
       is returned.
    2. Otherwise the text is rendered with ``mistune.markdown`` and the
       content of the first ``<p>...</p>`` element is returned.
    3. If there is no paragraph, the rendered text is truncated to
       ``MAX_STRING_LENGTH`` characters (with ``' ...'`` appended when cut)
       and returned after a pass through ``clean``.

    Newlines in the values returned by steps 1 and 2 are replaced by spaces.
    """
    text = readme_clean(text)

    # NOTE(review): readme_clean() runs clean(), which replaces '#' with a
    # space, so this branch looks unreachable; the "> 10" threshold is kept
    # as found - confirm the intended condition.
    headings = re.findall(r'\#+(.+?)\#+', text, flags=re.DOTALL)
    if len(headings) > 10:
        return headings[0].replace('\n', ' ')

    html = mistune.markdown(text)

    # Match the exact <p>...</p> delimiters so that only the paragraph body
    # is captured; a looser '<p(.+?)/p>' also captures the '>' and '<' that
    # surround the body and can start matching at '<pre'.
    paragraphs = re.findall(r'<p>(.+?)</p>', html, flags=re.DOTALL)
    if paragraphs:
        return paragraphs[0].replace('\n', ' ')

    if len(html) > MAX_STRING_LENGTH:
        html = html[:MAX_STRING_LENGTH] + ' ...'
    return clean(html)
if __name__ == "__main__":
    # Ad-hoc check: compare two timestamp strings as plain text and print
    # the boolean result.
    first = '2011-01-26T19:01:12Z'
    second = '2010-12-6T20:01:12Z'
    print(second > first)

    # Other manual checks, kept disabled:
    #print camel_case_split('CamelCaseXYZ CamelCaseXYZ fr ')
    # print remove_non_ascii_1('sdwodesds~~~')
    # print clean('fs\n\n ff ')
    # print base64_to_utf8('/home/yangqiao/1')
    # print extract_markdown(open('text.md', 'r').read())