aufgeräumt (tidied up)
This commit is contained in:
parent 4fe12679fb
commit 56c8bce2d7
config.ini | 70

@@ -1,21 +1,67 @@
[filepath]

thesauruspath = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/openthesaurus.csv

path2xml = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/ticketSamples.xml

path2csv = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv

small = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv

[thesaurus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_dict.pkl

logfile = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log

[spellchecking]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_dict.pkl

lemmas = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmatization-de.txt

[lemmatization]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict.pkl

[nouns]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list.pkl

[firstnames]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list.pkl

[de_stopwords]
input1 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt
input2 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt
input3 = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt
pickle_file = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list.pkl

[logging]
level = INFO
filename = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log

[de_corpus]
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv
#input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv

path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = de_raw_ticket
pre = de_pre_ticket

[en_corpus]
input = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv

path = /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/
raw = en_raw_ticket
pre = en_pre_ticket

[tickets]
content_collumn_name = Description
metaliste = TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerID,VerantwortlicherID,EigentuemerID,Solution

language = de

[preprocessing]
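Note: the scripts in this commit read config.ini through configparser. A minimal sketch of that access pattern (illustrative, not part of the commit; splitting metaliste on commas is an assumption about how the comma-separated value is consumed):

import configparser as ConfigParser

config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)

# every value comes back as a plain string
content_collumn_name = config.get("tickets", "content_collumn_name")   # "Description"

# metaliste is stored as one comma-separated string; turning it back into
# a list is an assumption about the consuming code
metaliste = config.get("tickets", "metaliste").split(",")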
corporization.py | 197

@@ -1,6 +1,35 @@
# -*- coding: utf-8 -*-

from datetime import datetime

import time
import logging
from stop_words import get_stop_words

#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *


import time


from datetime import datetime

@@ -17,87 +46,15 @@ import sys
csv.field_size_limit(sys.maxsize)


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"

path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"

content_collumn_name = "Description"

metaliste = [
    "TicketNumber",
    "Subject",
    "CreatedDate",
    "categoryName",
    "Impact",
    "Urgency",
    "BenutzerID",
    "VerantwortlicherID",
    "EigentuemerID",
    "Solution"
]

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"

# todo configuration file
"""
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""

# config logging
logging.basicConfig(filename=logfile, level=logging.INFO)
# logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)


def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def printRandomDoc(textacyCorpus):
    import random
    print()

    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))

    print()


def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):

@@ -146,75 +103,93 @@ def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
            yield metadata


def save_corpus(corpus, corpus_path, corpus_name, parser):
    """
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    #todo save vocab?
    """

    # save parser
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)


##################################################################################################

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"

"""
content_collumn_name = "Description"
metaliste = [
    "TicketNumber",
    "Subject",
    "CreatedDate",
    "categoryName",
    "Impact",
    "Urgency",
    "BenutzerID",
    "VerantwortlicherID",
    "EigentuemerID",
    "Solution"
]
"""

content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = config.get("tickets","metaliste")

path2de_csv = config.get("de_corpus","input")
corpus_de_path = config.get("de_corpus", "path")
raw_de_name = config.get("de_corpus", "raw")

path2en_csv = config.get("en_corpus","input")
corpus_en_path = config.get("en_corpus", "path")
raw_en_name = config.get("en_corpus", "raw")


def main():

    start = time.time()
    printlog("Corporization: {0}".format(datetime.now()))

    #print paths
    path_csv_split = path2de_csv.split("/")
    printlog(path_csv_split[len(path_csv_split) - 1])
    path_csv_split = path2en_csv.split("/")
    printlog(path_csv_split[len(path_csv_split) - 1])

    start = time.time()

    DE_PARSER = spacy.load("de")
    EN_PARSER = spacy.load("en")

    de_corpus = textacy.Corpus(DE_PARSER)
    en_corpus = textacy.Corpus(EN_PARSER)
    raw_de_corpus = textacy.Corpus(DE_PARSER)
    raw_en_corpus = textacy.Corpus(EN_PARSER)

    ## add files to textacy-corpus,
    printlog("Add texts to textacy-corpus")
    ## add files to textacy-corpi,
    printlog("Add texts to textacy-corpi")

    de_corpus.add_texts(
    raw_de_corpus.add_texts(
        ticketcsv_to_textStream(path2de_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2de_csv, metaliste)
    )

    # leere docs aus corpus kicken
    de_corpus.remove(lambda doc: len(doc) == 0)
    raw_en_corpus.add_texts(
        ticketcsv_to_textStream(path2en_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2en_csv, metaliste)
    )

    for i in range(20):
        printRandomDoc(de_corpus)
    # leere docs aus corpi kicken
    raw_de_corpus.remove(lambda doc: len(doc) == 0)
    raw_en_corpus.remove(lambda doc: len(doc) == 0)

    #save corpus
    #for i in range(20):
    #    printRandomDoc(raw_de_corpus)
    #    printRandomDoc(raw_en_corpus)

    save_corpus(corpus=de_corpus,corpus_path=corpus_path,corpus_name=corpus_name,parser=DE_PARSER)

    #todo das selbe mit en_corpus
    #save corpi
    save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)
    save_corpus(corpus=raw_en_corpus, corpus_path=corpus_en_path, corpus_name=raw_en_name)
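Read without the removed lines, the new corporization flow is: stream each ticket CSV into a raw textacy corpus, drop empty documents, then save. A condensed sketch of that sequence (illustrative; it assumes the functions and config values defined above are in scope):

import spacy
import textacy

DE_PARSER = spacy.load("de")
raw_de_corpus = textacy.Corpus(DE_PARSER)

# texts and metadata are streamed from the same CSV; column selection is config-driven
raw_de_corpus.add_texts(
    ticketcsv_to_textStream(path2de_csv, content_collumn_name),
    ticket_csv_to_DictStream(path2de_csv, metaliste)
)

# kick empty docs out of the corpus before saving
raw_de_corpus.remove(lambda doc: len(doc) == 0)

save_corpus(corpus=raw_de_corpus, corpus_path=corpus_de_path, corpus_name=raw_de_name)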
Binary file not shown.

@@ -1,9 +0,0 @@
{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"subject"}
{"categoryName":"zhb","Subject":"schulungstest","Solution":""}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"frau hinrichs überdenkt die situation und macht dann neue anträge . dieses ticket wird geschlossen"}
{"categoryName":"neuanschluss","Subject":"telephone contract","Solution":"faxnummer 3166 wurde unter die telefonnummer 7179 im elektronischen telefonbuch eingetragen"}
{"categoryName":"lan","Subject":"defekte netzwerkdose frage zu vpn","Solution":"hallo herr rauner , die netzwerkdose weist z. z. keine verbindungsprobleme auf . falls doch welche bestehen , melden sie sich bitte bei uns . mit freunldichen grüßen aicha oikrim"}
{"categoryName":"betrieb","Subject":"sso login via browser mit zertifikat","Solution":"der login via zertifikat am sso - dienst mittels firefox und unicard sollte funktionieren . eventuell wurden durch ein browserupdate die einstellungen gelöscht . bitte prüfen sie ob die ca - zertifikate installiert sind : https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" https://pki.pca.dfn.de/tu-dortmund-chipcard-ca/cgi-bin/pub/pki?cmd=getstaticpage;name=index;id=2&ra_id=0 \" und ob das kryptographie modul im firefox hinterlegt ist : https://service.tu-dortmund.de/group/intra/authentifizierung"}
{"categoryName":"elektronisches telefonbuch","Subject":"telephone contract","Solution":"erledigt"}
{"categoryName":"verwaltung","Subject":"laptop macht komische geräusche","Solution":"herr alexev swetlomier ( hiwi ) küümert sich bereits um das laptop und frau herbst weiß auch bescheid die zur zeit im urlaub ist"}
File diff suppressed because one or more lines are too long
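The deleted file above is such a metadata side-car: one JSON object per document, as written by save_corpus via textacy.fileio.write_json_lines. A tiny round-trip sketch (illustrative; demo_meta.json is a made-up filename and the sample dicts are taken from the lines above):

import textacy

metadatas = [
    {"categoryName": "zhb", "Subject": "schulungstest", "Solution": ""},
    {"categoryName": "neuanschluss", "Subject": "telephone contract", "Solution": "subject"},
]

# one JSON object per line, as in the *_meta.json files of this repo
textacy.fileio.write_json_lines(iter(metadatas), "demo_meta.json")

for meta in textacy.fileio.read_json_lines("demo_meta.json"):
    print(meta["categoryName"], "->", meta["Subject"])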
init.py | 351

@@ -4,6 +4,9 @@ from datetime import datetime

import time
import logging
from stop_words import get_stop_words

#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv

@@ -15,58 +18,35 @@ from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *


# todo configuration file ?
"""
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/init.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_init.log &"


# config logging
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
logging.basicConfig(filename=logfile, level=logging.INFO)


def create_lemma_dict(path2lemmalist):
    """
    Creates a dict out of a file a la:

DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")

    l1 w1
    l1 w2
    l2 w1
    l2 w2

    Result will be used as lemma_dict["word"] --> lemma


def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss",
                                 (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))

def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path ):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)

def create_lemma_dict(lemmalist):
    :param path2lemmalist: str
    :return: dictionary
    """
    lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(
        textacy.fileio.read_file_lines(path2lemmalist))))

    lemma_dict = {}
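Per the docstring, the lemma file carries one "lemma wordform" pair per line and the dictionary is queried as lemma_dict["word"] --> lemma. A standalone sketch of that mapping (illustrative; build_lemma_dict is a hypothetical helper and the column order follows the docstring):

def build_lemma_dict(lines):
    # each line: "<lemma> <wordform>"; lookup direction is wordform -> lemma
    lemma_dict = {}
    for line in lines:
        parts = line.strip().split()
        if len(parts) == 2:
            lemma, word = parts
            lemma_dict[word] = lemma
    return lemma_dict

print(build_lemma_dict(["gehen ging", "gehen gegangen"]))
# {'ging': 'gehen', 'gegangen': 'gehen'}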
@@ -81,69 +61,22 @@ def create_lemma_dict(lemmalist):

    return lemma_dict

"""
def build_thesaurus(path2lexicalentries, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))


def build_thesaurus_dict(path2wordnet,returnall=False):
    """
    Creates a dict out of the deWordNet
    https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml

    Result will be used as lemma_dict["word"] --> lemma

    :param path2lexicalentries: str
    :param returnall: bool if True, also return , word2synsets, synset2Words
    :return: dictionaries: thesaurus
    """
    lextree = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    synroot = syntree.getroot()

    thesaurus = []

    for r in synroot:
        for element in r:

            if element.tag == "Synset":
                sysnet = []
                attrib = element.attrib
                id = attrib["id"]

                for ro in lexroot:
                    for elem in ro:
                        if elem.tag == "LexicalEntry":
                            subs_dicts = [subentry.attrib for subentry in elem]
                            # <class 'list'>: [{'partOfSpeech': 'n', 'writtenForm': 'Kernspaltung'}, {'synset': 'de-1-n', 'id': 'w1_1-n'}]

                            dic = {k: v for x in subs_dicts for k, v in x.items()} # to one dict
                            if "synset" in dic.keys():
                                if dic["synset"] == id:
                                    string = (dic["writtenForm"])

                                    # replaceRockDots
                                    string = re.sub(r'[ß]', "ss", string)
                                    string = re.sub(r'[ö]', "oe", string)
                                    string = re.sub(r'[ü]', "ue", string)
                                    string = re.sub(r'[ä]', "ae", string)

                                    # alle punkte raus
                                    string = re.sub(r'[.]', "", string)

                                    # alles in klammern raus
                                    string = re.sub(r"\((.*)\)", " ", string)

                                    # längeres leerzeichen normalisieren
                                    string = textacy.preprocess.normalize_whitespace(string)

                                    sysnet.append(string.lower().strip())

                # nach anzhal der wörter in den strings sortieren
                sysnet.sort(key=lambda x: len(x.split()))
                if len(sysnet) != 0:
                    # todo warum sind manche leer?
                    thesaurus.append(sysnet)
    return thesaurus

#todo thesaurus in dictionary
"""

def build_thesaurus(path2lexicalentries):#, path2synsets):
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))

    lexroot = lextree.getroot()
    #synroot = syntree.getroot()

    word2synsets = {}
    template = {"w1": ["s1", "s2"]}

@@ -167,6 +100,9 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
                if 'writtenForm' in lex_dict.keys():
                    string = (lex_dict["writtenForm"])

                    if string == "Kennwort":
                        pass

                    # replaceRockDots
                    string = re.sub(r'[ß]', "ss", string)
                    string = re.sub(r'[ö]', "oe", string)

@@ -186,15 +122,17 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):

        word2synsets[string] = synlist

    synset2Words = {}
    template = {"s1": ["w1","w2"]}

    for word,synset in word2synsets.items():
        for syn in synset:
            if syn not in synset2Words.keys():
                synset2Words[syn] = [word]
            else:
                synset2Words[syn].append(word)
        if word != '':
            for syn in synset:
                if syn not in synset2Words.keys():
                    synset2Words[syn] = [word]
                else:
                    synset2Words[syn].append(word)

    # nach anzhal der wörter in den strings sortieren
    for synset in word2synsets.values():

@@ -203,91 +141,135 @@ def build_thesaurus(path2lexicalentries):#, path2synsets):
    thesaurus = {}
    thesaurus_template = {"w1" : "mainsyn"}

    for word,synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym
            thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym #todo nach (hauptform) suchen?
        except:
            pass
    return thesaurus

    if returnall:
        return thesaurus, word2synsets, synset2Words
    else:
        return thesaurus
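The returned thesaurus maps each written form to the first word of its first synset, treated as the main synonym, so normalizing a token is a plain dictionary lookup. A usage sketch (illustrative; the example word is invented):

thesaurus = build_thesaurus_dict(path2wordnet)

word = "notebook"
normalized = thesaurus.get(word, word)   # fall back to the word itself if it is unknown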
def create_stopword_lists(*paths):
    """
    for r in synroot:
        for element in r:
    creates a list of stoppwords from:
        spacy
        nltk
        stop_words

            if element.tag == "Synset":
                synset = []
                attrib = element.attrib
                id = attrib["id"]
    :param paths: list of additional filepaths where each file looks like
        w1
        w2
        w3
    filenames must be a la de_stopwords_1.txt, en_stopwords_2.txt

                if id not in synset2Words.keys():
                    synset2Words[id] = "WORD"
    :return: lists: de_stopwords, en_stopwords
    """

    ## GERMAN

    # from packages
    de_stop_words1 = list(get_stop_words("de"))

    de_stop_words2 = list(nltk_stopwords.words('german'))

    de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)

    #from files
    de_filepaths = []
    for path in paths:
        if os.path.basename(path).split("_")[0] == 'de' and os.path.basename(path).split("_")[
            1] == 'stopwords':
            de_filepaths.append(path)


def create_stopwordlist():
    de_stop_words4 = list_from_files(*de_filepaths)

    de_stop_words1 = list(map(replaceRockDots(),
                              list(
                                  map(textacy.preprocess.normalize_whitespace,
                                      textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt")
                                      )
                              )
                          )
                      )

    de_stop_words2 = list(map(replaceRockDots(),list(set(nltk_stopwords.words('german')))))

    de_stop_words3 = list(map(replaceRockDots(),list(__import__("spacy." + DE_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)))

    de_stop_words4 = list(map(replaceRockDots(),list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords-de.txt"))))

    de_stop_words = list(set(de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4))

    return de_stop_words

    #todo en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(nltk_stopwords.words('english'))))
    #combine everything
    de_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
                                                             de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))

    ## ENGLISH

    # from packages
    en_stop_words1 = list(get_stop_words("en"))

    en_stop_words2 = list(nltk_stopwords.words('english'))

    en_stop_words3 = list(__import__("spacy.en", globals(), locals(), ['object']).STOP_WORDS)

    # from files
    en_filepaths = [path for path in paths if
                    os.path.basename(path).split("_")[0] == 'en' and os.path.basename(path).split("_")[
                        1] == 'stopwords']

    en_stop_words4 = list_from_files(*en_filepaths)


########################## Spellchecking ##########################################
# http://norvig.com/spell-correct.html
# http://wortschatz.uni-leipzig.de/en/download
    # combine everything
    en_stop_words = list(set(map(replaceRockDots(), list(map(textacy.preprocess.normalize_whitespace,
                                                             en_stop_words1 + en_stop_words2 + en_stop_words3 + en_stop_words4)))))

    return de_stop_words, en_stop_words


def build_words_for_spellchecking(path2words):
    """
    create word-Counter for spellchecking

    http://norvig.com/spell-correct.html
    http://wortschatz.uni-leipzig.de/en/download

    http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_news_2015_1M.tar.gz
    :return: Counter
    """
    def words(text): return re.findall(r'\w+', text.lower())

    return Counter(words(open(path2words).read()))

def words(text): return re.findall(r'\w+', text.lower())
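build_words_for_spellchecking follows Norvig's spell-corrector setup: tokenize the Leipzig news dump into lowercase words and count them, so candidate corrections can later be ranked by corpus frequency. A short usage sketch (illustrative; the probability helper P is taken from Norvig's article, not from this commit):

WORDS = build_words_for_spellchecking(path2words_file)

def P(word, N=sum(WORDS.values())):
    # relative frequency of a word form in the background corpus
    return WORDS[word] / N

print(WORDS.most_common(3))   # most frequent German word forms
print(P("und"))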
##################################################################################################

# ziel: dictionaries für thesaurus, correctwordliste und lemmas als ladbare dateien
# außerdem saubere stoppwortliste und nomenliste


# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
#synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"

path2wordnet = config.get("thesaurus","input")
path2thesaurus_dict = config.get("thesaurus","pickle_file")


# SPELLCHECKING
path2words = '/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt'
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"
path2words_file = config.get("spellchecking","input")
path2wordlist = config.get("spellchecking","pickle_file")


# LEMMA
path2lemma_file = config.get("lemmatization","input")
path2lemmadict = config.get("lemmatization","pickle_file")

# NOMEN
nouns1 = config.get("nouns","input1")
nouns2 = config.get("nouns","input2")
path2nouns_list = config.get("nouns","pickle_file")

path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"


# VORNAMEN
firstnames_txt = config.get("firstnames","input")
path2firstnameslist = config.get("firstnames","pickle_file")

# STOPWORDS
stop1 = config.get("de_stopwords","input1")
stop2 = config.get("de_stopwords","input2")
stop3 = config.get("de_stopwords","input3")
path2stopwordlist = config.get("de_stopwords","pickle_file")


@@ -297,71 +279,42 @@ def main():

    printlog("create and save lemma_dict")
    LEMMAS = list(
        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))

    lemma_dict = create_lemma_dict(LEMMAS)
    lemma_dict = create_lemma_dict(path2lemma_file)
    save_obj(lemma_dict, path2lemmadict)


    printlog("Build and save Wordlist for Spellchecking")
    WORDS = Counter(words(open(path2words).read()))
    save_obj(WORDS, path2wordlist)

    words = build_words_for_spellchecking(path2words_file)
    save_obj(words, path2wordlist)


    printlog("Build and save Thesaurus")
    THESAURUS = build_thesaurus(path2lexicalentries=lexicalentries)

    save_obj(THESAURUS, path2thesaurusdict)

    thesaurus = build_thesaurus_dict(path2wordnet)
    save_obj(thesaurus, path2thesaurus_dict)


    printlog("Build and save stoppwortliste")
    de_stop_words = create_stopwordlist()
    de_stop_words = create_stopword_lists(stop1, stop2, stop3)
    save_obj(de_stop_words, path2stopwordlist)


    printlog("Build and save nomenliste")
    NOUNS = list(textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(
        textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
    NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))
    save_obj(NOUNS, path2NOUNSlist)


    printlog("Build and save fistnameslist")
    VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
        "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))

    save_obj(VORNAMEN, path2firstnameslist)

    nouns = list_from_files(nouns1,nouns2)
    save_obj(nouns, path2nouns_list)

    printlog("Build and save firstnameslist")
    vornamen = list_from_files(firstnames_txt)
    save_obj(vornamen, path2firstnameslist)


    end = time.time()
    printlog("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
    printlog("Time Elapsed Initialization:{0} min".format((end - start) / 60))
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-

import init
import corporization
import preprocessing
from miscellaneous import *

# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"

init.main()
printlog("")

corporization.main()
printlog("")

preprocessing.main()
printlog("")
@@ -0,0 +1,281 @@
# -*- coding: utf-8 -*-
import random

import time

from pathlib import Path

from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys

from datetime import datetime

import time
start = time.time()

import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)

import time

import enchant

start = time.time()

import logging

import csv
import functools
import os.path
import re
import subprocess
import time
import xml.etree.ElementTree as ET
import sys
import spacy
import textacy
from scipy import *
from textacy import Vectorizer
import warnings
import configparser as ConfigParser
import sys
import hunspell
from postal.parser import parse_address

from datetime import datetime

import time
import logging
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle


# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)


# config logging
filename = config.get("logging","filename")
level = config.get("logging","level")
if level == "INFO":
    level = logging.INFO
elif level == "DEBUG":
    level = logging.DEBUG
elif level == "WARNING":
    level = logging.WARNING
logging.basicConfig(filename=filename, level=level)


def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)


def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))

    return functools.reduce(compose2, functions, lambda x: x)
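compose chains single-argument functions with functools.reduce, applying them right to left; preprocessing.py uses it to build cleaning pipelines such as cleanStream = compose(...). A minimal usage sketch (illustrative):

strip_ws = lambda s: s.strip()
lower = lambda s: s.lower()
no_dots = lambda s: s.replace(".", "")

clean = compose(no_dots, lower, strip_ws)   # rightmost function runs first
print(clean("  Beispiel.Text  "))           # -> "beispieltext"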
def get_calling_function():
    """finds the calling function in many decent cases.
    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)  # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],  # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")


def save_obj(obj, path):
    with open(path , 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss",
                                 (re.sub(r'[ö]', "oe",
                                         (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))

def list_from_files(*paths):
    """
    create string-list from file like
    n1
    n2
    n3

    :param paths: list(str) or str if single path
    :return: list(str)
    """

    listlist = []
    for path in paths:
        listlist.append(list(textacy.fileio.read_file_lines(path)))

    #liste von listen zu einer liste
    liste = [item for sublist in listlist for item in sublist]

    return list(map(textacy.preprocess.normalize_whitespace, liste))
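replaceRockDots returns a normalizer that lowercases a string and transliterates ä/ö/ü/ß, and list_from_files flattens one or more line-oriented files into a single whitespace-normalized list. A small usage sketch (illustrative; the stopword filenames mirror the config keys and may not exist locally):

normalize = replaceRockDots()
print(normalize("Größe"))   # -> "groesse"

# merge several word lists into one flat list
# de_stop = list_from_files("de_stopwords_1.txt", "de_stopwords_2.txt")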
def printRandomDoc(textacyCorpus):
    """
    printlogss random doc out of a textacy-Corpus
    :param textacyCorpus:
    """
    print()
    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))

    print()


def save_corpus(corpus, corpus_path, corpus_name):
    """
    saves a textacy-corpus including spacy-parser
    :param corpus: textacy-Corpus
    :param corpus_path: str
    :param corpus_name: str (should content the language like "_de_")
    """

    """
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        parser.vocab.strings.dump(file)

    #todo save vocab?
    """

    # save parser
    parser = corpus.spacy_lang
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)


def load_corpus(corpus_path, corpus_name, lang="de"):
    """
    Load textacy-Corpus including spacy-parser out from file
    :param corpus_path: str
    :param corpus_name: str (should content the language like "_de_")
    :param lang: str language code)
    :return: texracy.Corpus, spacy.language
    """

    #ckeck for language
    if "_de_" in corpus_name:
        lang="de"
    elif "_en_" in corpus_name:
        lang ="en"

    # load parser
    parser = spacy.load(lang)

    stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)

    vocabpath = Path(corpus_path + str(lang) + '_parser'+'/vocab/lexemes.bin')
    parser.vocab.load_lexemes(vocabpath)

    #load corpus
    corpus = textacy.Corpus(parser)

    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus, corpus.spacy_lang
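save_corpus and load_corpus form the new round-trip: parser data, serialized spaCy docs and JSON-lines metadata live as sibling files under corpus_path, keyed by corpus_name. A usage sketch (illustrative; the corpi path and the de_raw_ticket name come from config.ini, and lang falls back to "de" unless the corpus name contains "_de_" or "_en_"):

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"

# persist a corpus built elsewhere (e.g. in corporization.main())
save_corpus(corpus=raw_de_corpus, corpus_path=corpus_path, corpus_name="de_raw_ticket")

# later: restore the corpus together with its spaCy pipeline
corpus, parser = load_corpus(corpus_path=corpus_path, corpus_name="de_raw_ticket")
printRandomDoc(corpus)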
@@ -420,10 +420,10 @@ custom_words = ["grüßen", "fragen"]

####################'####################'####################'####################'####################'##############

## files to textacy-corpus
## files to textacy-corpi
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy-corpus...")
print("add texts to textacy-corpi...")
textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))


@@ -182,8 +182,8 @@ cleanStream = compose(
    cleanEnt
)
"""
# content: xml -> stringCleaning -> pipe -> docCleaning -> corpus
# metadata:xml -> -> stringCleaning -> corpus
# content: xml -> stringCleaning -> pipe -> docCleaning -> corpi
# metadata:xml -> -> stringCleaning -> corpi

corpus = textacy.Corpus(PARSER)
preprocessing.py | 755

@@ -2,27 +2,53 @@

from datetime import datetime
print(datetime.now())
from datetime import datetime

import time
import logging
from stop_words import get_stop_words

#import words as words
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
import csv
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)
import pickle
import configparser as ConfigParser
from miscellaneous import *


path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_med.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_small.csv"
#path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_mini.csv"
path2de_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/de_tickets.csv"

path2en_csv = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/en_tickets.csv"


#idee roh-corpus (nur whitespace weg) speichern -> pregeprocesster corpus -> damit arbeiten

path_csv_split = path2de_csv.split("/")
print(path_csv_split[len(path_csv_split) - 1])
path_csv_split = path2en_csv.split("/")
print(path_csv_split[len(path_csv_split) - 1])

import time
start = time.time()


from datetime import datetime
import logging
from nltk.corpus import stopwords
import csv
import functools
import re
import xml.etree.ElementTree as ET
import spacy
import textacy
from scipy import *
import sys
csv.field_size_limit(sys.maxsize)

import time

import logging
from nltk.corpus import stopwords

@@ -40,231 +66,29 @@ csv.field_size_limit(sys.maxsize)

import pickle

def save_obj(obj, path):
    with open(path + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path ):
    with open(path + '.pkl', 'rb') as f:
        return pickle.load(f)


def load_corpus(corpus_path, corpus_name, lang="de"):

    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    #load parser
    parserpath = corpus_path + str(lang) + '_parser'
    parser = spacy.load(parserpath)

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus

corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "de_raw_ticketCorpus"

print(load_corpus(corpus_path,corpus_name))


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"


# todo configuration file ?
"""
# load config
config_ini = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/config.ini"

config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
"""


# config logging
logfile = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModelTickets.log"
logging.basicConfig(filename=logfile, level=logging.INFO)
# logging.basicConfig(filename=config.get("filepath","logfile"), level=logging.INFO)


# THESAURUS
path2thesaurusdict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/thesaurus_list"
THESAURUS = load_obj(path2thesaurusdict)


# SPELLCHECKING
path2wordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/words_list"

path2lemmadict = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemma_dict"
path2stopwordlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/stopwords_list"
path2NOUNSlist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nouns_list"
path2firstnameslist = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames_list"


# SPELLCHECKING

parser.save_to_directory(corpus_path + str(parser.lang) + '_parser')

DE_PARSER = spacy.load("de")
EN_PARSER = spacy.load("en")


de_stop_words = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
    "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stop_words.txt"))) + list(set(stopwords.words('german')))

en_stop_words= set(list(__import__("spacy." + EN_PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)+ list(set(stopwords.words('english'))))


LEMMAS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lemmas.txt"))

VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, textacy.fileio.read_file_lines(
    "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/firstnames.txt")))

NOUNS = list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen2.txt")) + list(textacy.fileio.read_file_lines("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/nomen.txt"))
NOUNS = list(map(textacy.preprocess.normalize_whitespace, NOUNS))


"""
print(de_stop_words[10:30])
print(LEMMAS[10:30])
print(VORNAMEN[10:30])
print(NOUNS[10:30])
"""


mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)


def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)

printlog("Load functions")


def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))

    return functools.reduce(compose2, functions, lambda x: x)


def get_calling_function():
    """finds the calling function in many decent cases.
    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)  # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],  # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")


def printRandomDoc(textacyCorpus):
    import random
    print()

    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}\n".format(randIndex, textacyCorpus[randIndex].text,
                                                               textacyCorpus[randIndex].metadata))

    print()


def csv_to_contentStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # standardvalue

    for i, lst in enumerate(stream):
        if i == 0:
            # look for desired column
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]


def csv_to_metaStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
    :return: dict-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')

    content_collumn = 0  # standardvalue
    metaindices = []
    metadata_temp = {}
    for i, lst in enumerate(stream):
        if i == 0:
            for j, col in enumerate(lst):  # geht bestimmt effizienter... egal, weil passiert nur einmal
                for key in metalist:
                    if key == col:
                        metaindices.append(j)
            metadata_temp = dict(
                zip(metalist, metaindices))  # zB {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}

        else:
            metadata = metadata_temp.copy()
            for key, value in metadata.items():
                metadata[key] = lst[value]
            yield metadata
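csv_to_contentStream and csv_to_metaStream walk the same semicolon-separated ticket CSV: the first yields the content column, the second yields one metadata dict per row. Fed together into Corpus.add_texts they rebuild a corpus, mirroring corporization.py. A short sketch (illustrative; the column and meta field names follow the [tickets] section of config.ini):

import spacy
import textacy

PARSER = spacy.load("de")
corpus = textacy.Corpus(PARSER)

corpus.add_texts(
    csv_to_contentStream(path2de_csv, "Description"),
    csv_to_metaStream(path2de_csv, ["TicketNumber", "Subject", "categoryName", "Solution"])
)

printRandomDoc(corpus)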
REGEX_SPECIALCHAR = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'


THESAURUS = {}
WORDS = {}
LEMMAS = {}
NOUNS = []
VORNAMEN= []
de_stop_words=[]

############# filter tokens

@@ -303,14 +127,12 @@ def remove_words_containing_Numbers():
    return lambda tok: not bool(re.search('\d', tok.lower_))


"""
def remove_words_containing_topLVL():
    return lambda tok: not bool(re.search(regex_topLvl, tok.lower_))
return lambda tok: not bool(re.se |