import re

from spacy.en import English
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words

def _read_dump_to_lookup(data_dump):
    """
    Reads from a list of file names and creates a dictionary in which
    keys are file names and values are the contents of those files.
    File names are unique and are the http addresses of the original source.
    Corrupt or unreadable files are skipped rather than raising.
    :param data_dump: list<str>
    :return: dict<str, str>
    """
    lookup = {}
    for file_name in data_dump:
        try:
            with open(file_name, 'r') as f:
                lookup[file_name] = f.read()
        except (IOError, OSError):
            # brute error handling: skip corrupt data dumps
            pass
    return lookup
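
# Illustrative call (hypothetical paths standing in for the scraped dumps):
#   lookup = _read_dump_to_lookup(['dumps/object_0001.txt',
#                                  'dumps/object_0002.txt'])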

def _ensure_all_fields_present(dicts):
    """
    Reads from a list of dictionaries whose keys are html addresses and
    values are html content, and returns a list of dictionaries restricted
    to the keys shared by all of them. For this data set, fewer than 1% of
    keys are discarded.
    :param dicts: list<dict>
    :return: list<dict>
    """
    common_keys = {key for key in dicts[0]
                   if all(key in d for d in dicts[1:])}

    def filter_dict(old_dict):
        return {key: old_dict[key] for key in common_keys}

    return [filter_dict(dictionary) for dictionary in dicts]
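
# Illustrative call: given per-field lookups keyed by URL (hypothetical
# names), keep only the URLs present in every lookup:
#   descriptions, images, titles = _ensure_all_fields_present(
#       [descriptions, images, titles])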

# module-level NLP objects shared by the helpers below
p_stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+[a-z]')
en_stop = get_stop_words('en')
parser = English()  # spaCy parser used by _spacy_tokenizer_lemmatizer

def _tokenizing_stemming(curatorial_description):
    """
    Tokenizes, lowercases, stop-word-filters and stems a long description.
    input: string (single curatorial description)
    output: string (space-joined stemmed tokens)
    """
    # TOKENIZING
    raw = str(curatorial_description).lower()
    tokens = tokenizer.tokenize(raw)
    # REMOVING STOP WORDS (English stop word list built at module level)
    stopped_tokens = [i for i in tokens if i not in en_stop]
    # STEMMING
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    return ' '.join(stemmed_tokens)
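
# Illustrative call (hypothetical text; exact stems depend on PorterStemmer):
#   _tokenizing_stemming("Bronze coins of the Roman period")
#   -> 'bronz coin roman period'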

# TODO: remove words such as "span", "div", "british museum", and maybe
# also first and last names

def _cleaning_images_finding_http(images_text):
    """
    Pulls out the http address of the image corresponding to a description.
    input: string (long string from a website)
    output: string (http address pointing to an image, or "None")
    """
    http_address = "None"
    if images_text != "None":
        pull_out_http = re.split(" ", images_text)
        matches = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            pull_out_http[3])
        if matches:  # guard against fields with no recoverable URL
            http_address = matches[0]
    return http_address
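
# The hard-coded index [3] assumes the scraped image field looks roughly
# like this hypothetical example, with the URL as the fourth token:
#   _cleaning_images_finding_http('thumb src alt http://media.example.org/img/123.jpg')
#   -> 'http://media.example.org/img/123.jpg'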

def _cleaning_htmltags_whitespace(text):
    """
    Strips html tags, newlines (e.g. '\n'), non-ASCII characters and
    extra spaces from a description.
    input: string (long html)
    output: string (only the description)
    """
    new_text = re.sub(r"</?[a-zA-Z][^>]*>", ' ', text)  # tags, incl. attributes
    no_newlines = re.sub(r'\n', ' ', new_text)
    no_unicode = re.sub(r'[^\x00-\x7F]+', ' ', no_newlines)
    no_extraspaces = re.sub(r"\s{2,}", ' ', no_unicode)
    no_search = re.sub(r"None", '', no_extraspaces)
    return no_search
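
# Illustrative call (hypothetical snippet):
#   _cleaning_htmltags_whitespace('<div class="x">Bronze\ncoin</div>')
#   -> ' Bronze coin '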

def _spacy_tokenizer_lemmatizer(text):
    """
    Takes a unicode string of text and returns a list containing the
    lemmatized tokens (uses the module-level spaCy parser).
    output: list of lemmatized tokens
    """
    parsed_data = parser(text)
    return [token.lemma_ for token in parsed_data]
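
# Approximate example (actual lemmas depend on the installed spaCy model):
#   _spacy_tokenizer_lemmatizer(u'coins were minted') -> [u'coin', u'be', u'mint']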

def _pulling_out_of_dictionary(dictionary):
    """
    Pulls the curatorial descriptions (values) out of a link -> description
    lookup, keeping the (link, description) pairs as an intermediate step.
    :param dictionary: dict<str, str>
    :return: list<str> (descriptions only)
    """
    not_cleaned_description_pairs = []
    for link, curatorial_description in dictionary.iteritems():
        not_cleaned_description_pairs.append((link, curatorial_description))
    return [pair[1] for pair in not_cleaned_description_pairs]
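

if __name__ == '__main__':
    # Minimal end-to-end sketch of the preprocessing pipeline. The paths
    # below are hypothetical stand-ins for the scraped data dumps; nothing
    # here is wired to a real data set.
    dump_files = ['dumps/description_0001.txt', 'dumps/description_0002.txt']
    lookup = _read_dump_to_lookup(dump_files)
    descriptions = _pulling_out_of_dictionary(lookup)
    cleaned = [_cleaning_htmltags_whitespace(d) for d in descriptions]
    stemmed = [_tokenizing_stemming(d) for d in cleaned]
    print stemmed[:5]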