refactoring.
commit 93e239756c (parent 43955a17f2)
@@ -0,0 +1,251 @@
# -*- coding: utf-8 -*-

import time
from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys

csv.field_size_limit(sys.maxsize)


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"

# alternative inputs for testing (only the last uncommented assignment is used):
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"


content_collumn_name = "Description"

metaliste = [
    "TicketNumber",
    "Subject",
    "CreatedDate",
    "categoryName",
    "Impact",
    "Urgency",
    "BenutzerID",
    "VerantwortlicherID",
    "EigentuemerID",
    "Solution"
]

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_corpus"

logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"


# todo: move these settings into a configuration file?
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""


# configure logging
logging.basicConfig(filename=logfile, level=logging.INFO)
# logging.basicConfig(filename=config.get("filepath", "logfile"), level=logging.INFO)
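
# hypothetical config.ini contents for the commented-out config.get() call
# above (section and key names are taken from that call, the value from the
# constants defined earlier; purely illustrative):
#
#   [filepath]
#   logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log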


def printlog(string, level="INFO"):
    """prints and logs"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def printRandomDoc(textacyCorpus):
    import random
    print()

    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    # random.randint includes the last index; int((len - 1) * random.random()) never did
    randIndex = random.randint(0, len(textacyCorpus) - 1)
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))
    print()


def csv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default value

    for i, lst in enumerate(stream):
        if i == 0:
            # look for the desired column
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]


def csv_to_DictStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
    :return: dict-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')

    content_collumn = 0  # default value
    metaindices = []
    metadata_temp = {}
    for i, lst in enumerate(stream):
        if i == 0:
            for j, col in enumerate(lst):  # could surely be done more efficiently, but it only runs once
                for key in metalist:
                    if re.sub('[^a-zA-Z]+', '', key) == re.sub('[^a-zA-Z]+', '', col):
                        metaindices.append(j)
            metadata_temp = dict(
                zip(metalist, metaindices))  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}

        else:
            metadata = metadata_temp.copy()
            for key, value in metadata.items():
                metadata[key] = lst[value]
            yield metadata
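
# e.g. with a header row "TicketNumber;Subject;...", every following row is
# yielded as {"TicketNumber": "INC-4711", "Subject": "...", ...}
# (the ticket number here is made up, purely illustrative)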


def save_corpus(corpus, corpus_path, corpus_name, parser):
    """
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    # todo: save vocab?
    """

    # save parser
    parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)
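
# a possible counterpart for loading the corpus again (sketch only; assumes
# this textacy version exposes readers matching the writers used above,
# i.e. textacy.fileio.read_spacy_docs(vocab, path) and
# textacy.fileio.read_json_lines(path)):
#
# def load_corpus(corpus_path, corpus_name, parser):
#     corpus = textacy.Corpus(parser)
#     contentpath = corpus_path + corpus_name + "_content.bin"
#     metapath = corpus_path + corpus_name + "_meta.json"
#     for spacy_doc, meta in zip(textacy.fileio.read_spacy_docs(parser.vocab, contentpath),
#                                textacy.fileio.read_json_lines(metapath)):
#         corpus.add_doc(textacy.Doc(spacy_doc, lang=parser.lang, metadata=meta))
#     return corpus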


##################################################################################################


def cleanTextstream(textstream):
    """
    :param textstream: string-gen
    :yield: string-gen
    """

    for txt in textstream:
        yield textacy.preprocess.normalize_whitespace(txt)


def cleanDictstream(dictstream):
    """
    :param dictstream: dict-gen
    :yield: dict-gen
    """

    for dic in dictstream:

        result = {}

        for key, value in dic.items():
            result[key] = textacy.preprocess.normalize_whitespace(value)
        yield result


def main():

    printlog("Corporization: {0}".format(datetime.now()))

    path_csv_split = path2de_csv.split("/")
    printlog(path_csv_split[-1])
    path_csv_split = path2en_csv.split("/")
    printlog(path_csv_split[-1])

    start = time.time()

    DE_PARSER = spacy.load("de")
    EN_PARSER = spacy.load("en")

    de_corpus = textacy.Corpus(DE_PARSER)
    en_corpus = textacy.Corpus(EN_PARSER)


    ## add texts to the textacy corpus
    printlog("Add texts to textacy-corpus")

    de_corpus.add_texts(
        cleanTextstream(csv_to_textStream(path2de_csv, content_collumn_name)),
        cleanDictstream(csv_to_DictStream(path2de_csv, metaliste))
    )

    # drop empty docs from the corpus
    de_corpus.remove(lambda doc: len(doc) == 0)

    for i in range(20):
        printRandomDoc(de_corpus)

    # save corpus
    save_corpus(corpus=de_corpus, corpus_path=corpus_path, corpus_name=corpus_name, parser=DE_PARSER)

    # todo: do the same for en_corpus

    end = time.time()
    printlog("Time Elapsed Corporization:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()

@@ -0,0 +1,286 @@
# -*- coding: utf-8 -*-

from datetime import datetime

import time
import logging
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys

csv.field_size_limit(sys.maxsize)
import pickle


# todo: move these settings into a configuration file?
"""
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""


# configure logging
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
logging.basicConfig(filename=logfile, level=logging.INFO)


DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")


def replaceRockDots():
    # replace German umlauts and sharp s ("rock dots") with their ASCII digraphs
    return lambda string: re.sub(r'[ß]', "ss",
                                 (re.sub(r'[ö]', "oe",
                                         (re.sub(r'[ü]', "ue",
                                                 (re.sub(r'[ä]', "ae", string.lower())))))))
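
# returns a callable, so it can be handed straight to map(); e.g.:
#   replaceRockDots()("Straße Köln")  ->  "strasse koeln"
# (the lower() means upper-case umlauts are lowered before substitution)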


def printlog(string, level="INFO"):
    """prints and logs"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(path):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)


def create_lemma_dict(lemmalist):

    lemma_dict = {}

    for line in lemmalist:
        lem_word_pair = line.split()

        lemma = lem_word_pair[0].strip().lower()

        word = lem_word_pair[1].strip().lower()

        lemma_dict[word] = lemma

    return lemma_dict
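
# assumed input format, inferred from the split above: one "lemma word" pair
# per line, e.g. a line "abbekommen abbekomme" would give
# lemma_dict["abbekomme"] = "abbekommen"

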
def build_thesaurus(path2lexicalentries, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    synroot = syntree.getroot()

    thesaurus = []

    for r in synroot:
        for element in r:

            if element.tag == "Synset":
                sysnet = []
                attrib = element.attrib
                id = attrib["id"]

                for ro in lexroot:
                    for elem in ro:
                        if elem.tag == "LexicalEntry":
                            subs_dicts = [subentry.attrib for subentry in elem]
                            # <class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]

                            dic = {k: v for x in subs_dicts for k, v in x.items()}  # merge into one dict
                            if "synset" in dic.keys():
                                if dic["synset"] == id:
                                    string = dic["writtenForm"]

                                    # replaceRockDots
                                    string = re.sub(r'[ß]', "ss", string)
                                    string = re.sub(r'[ö]', "oe", string)
                                    string = re.sub(r'[ü]', "ue", string)
                                    string = re.sub(r'[ä]', "ae", string)

                                    # strip all periods
                                    string = re.sub(r'[.]', "", string)

                                    # strip everything in parentheses
                                    string = re.sub(r"\((.*)\)", " ", string)

                                    # normalize longer runs of whitespace
                                    string = textacy.preprocess.normalize_whitespace(string)

                                    sysnet.append(string.lower().strip())

                # sort the strings by their number of words
                sysnet.sort(key=lambda x: len(x.split()))
                if len(sysnet) != 0:
                    # todo: why are some of them empty?
                    thesaurus.append(sysnet)
    return thesaurus


# todo: restructure the thesaurus as a dictionary
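
# note: the nested scan above re-reads every LexicalEntry once per Synset,
# i.e. O(synsets x entries). A sketch of the obvious speedup, under the same
# XML layout assumed above: index the written forms by synset id once, then
# look them up per synset.
#
# forms_by_synset = {}
# for ro in lexroot:
#     for elem in ro:
#         if elem.tag == "LexicalEntry":
#             dic = {k: v for sub in elem for k, v in sub.attrib.items()}
#             if "synset" in dic and "writtenForm" in dic:
#                 forms_by_synset.setdefault(dic["synset"], []).append(dic["writtenForm"])

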
def create_stopwordlist():

    de_stop_words1 = list(map(replaceRockDots(),
                              list(map(textacy.preprocess.normalize_whitespace,
                                       textacy.fileio.read_file_lines(
                                           "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")))))

    de_stop_words2 = list(map(replaceRockDots(), list(set(nltk_stopwords.words('german')))))

    # the __import__ is just a dynamic "import spacy.de" for the loaded parser's language
    de_stop_words3 = list(map(replaceRockDots(),
                              list(__import__("spacy." + DE_PARSER.lang, globals(), locals(),
                                              ['object']).STOP_WORDS)))

    de_stop_words4 = list(map(replaceRockDots(), list(textacy.fileio.read_file_lines("stopwords-de.txt"))))

    de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))

    return de_stop_words


# todo: en_stop_words = set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS) + list(set(nltk_stopwords.words('english'))))


########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download

def words(text): return re.findall(r'\w+', text.lower())
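
# the rest of Norvig's corrector (from the post linked above) would sit on top
# of the WORDS counter built in main() below; a minimal sketch, with the
# letters extended for German (an assumption, not in the original post):
#
# def P(word, N=None):
#     return WORDS[word] / (N or sum(WORDS.values()))
#
# def known(words_): return set(w for w in words_ if w in WORDS)
#
# def edits1(word):
#     letters = 'abcdefghijklmnopqrstuvwxyzäöüß'
#     splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
#     deletes = [L + R[1:] for L, R in splits if R]
#     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
#     replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
#     inserts = [L + c + R for L, R in splits for c in letters]
#     return set(deletes + transposes + replaces + inserts)
#
# def correction(word):
#     return max(known([word]) or known(edits1(word)) or [word], key=P)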

##################################################################################################

# goal: dictionaries for the thesaurus, the correct-word list and the lemmas as loadable .json files
# plus a clean stop-word list and a noun list


# THESAURUS
#lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"


# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'


path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl"
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list.pkl"
path2thesauruslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list.pkl"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl"


def main():
    start = time.time()
    printlog("Init: {0}".format(datetime.now()))


    printlog("create and save lemma_dict")
    LEMMAS = list(
        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))

    lemma_dict = create_lemma_dict(LEMMAS)
    save_obj(lemma_dict, path2lemmadict)


    printlog("Build and save word list for spellchecking")
    WORDS = Counter(words(open(path2words).read()))
    save_obj(WORDS, path2wordlist)


    printlog("Build and save thesaurus")
    THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries, path2synsets=synsets)
    print(THESAURUS[0:10])

    save_obj(THESAURUS, path2thesauruslist)


    printlog("Build and save stop-word list")
    de_stop_words = create_stopwordlist()
    save_obj(de_stop_words, path2stopwordlist)


    printlog("Build and save noun list")
    NOUNS = list(textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(
        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
    NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
    save_obj(NOUNS, path2NOUNSlist)


    printlog("Build and save first-names list")
    VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))

    save_obj(VORNAMEN, path2firstnameslist)


    end = time.time()
    printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))


if __name__ == "__main__":
    main()

@@ -12,6 +12,8 @@ path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-E
path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"


# idea: store a raw corpus (only whitespace removed) -> preprocessed corpus -> work with that


path_csv_split = path2de_csv.split("/")
print(path_csv_split[len(path_csv_split) - 1])
@@ -124,7 +126,15 @@ specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORE
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)


import pickle

def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)

def printlog(string, level="INFO"):
    """prints and logs"""
    print(string)
38 testra.py
@@ -52,8 +52,44 @@ corpus.add_texts(

print(corpus)
"""
import pickle

def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)


lemmalist = list(map(textacy.preprocess.normalize_whitespace,
                     list(textacy.fileio.read_file_lines(
                         "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))))

lemma_dict = {}

for line in lemmalist:
    lem_word_pair = line.split()

    lemma = lem_word_pair[0].strip().lower()

    word = lem_word_pair[1].strip().lower()

    lemma_dict[word] = lemma


print(lemma_dict["abbekomme"])

save_obj(lemma_dict, "test_dictionaries")

loaded = load_obj("test_dictionaries")

print(loaded["abbekomme"])

"""
from postal.parser import parse_address


@@ -63,7 +99,7 @@ print(parse_address(address))

address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund "
print(parse_address(address))

"""