topicModelingTickets/testra.py

565 lines
17 KiB
Python
Raw Normal View History

# -*- coding: utf-8 -*-
import re
2017-09-25 13:12:23 +02:00
import time
import json
2017-09-20 15:22:13 +02:00
import spacy
import textacy
from functools import reduce
2017-09-25 13:12:23 +02:00
start = time.time()
2017-10-02 14:31:33 +02:00
import enchant
2017-09-25 13:12:23 +02:00
from datetime import datetime
import xml.etree.ElementTree as ET
print(datetime.now())
2017-10-10 14:42:09 +02:00
"""
PARSER=spacy.load("de")
2017-10-16 14:01:38 +02:00
corpi = textacy.Corpus(PARSER)
testcontetn = [
"fdsfdsfsd",
"juzdtjlkö",
"gfadojplk"
]
testmetda = [
{"categoryName":"zhb","Solution":"","Subject":"schulungstest"},
{"categoryName":"neuanschluss","Solution":"subject","Subject":"telephone contract"},
{"categoryName":"zhb","Solution":"","Subject":"setuji"}
]
def makecontent(testcontetn):
for content in testcontetn:
yield content
def makemeta( testmetda):
for metdata in testmetda:
yield metdata
2017-10-16 14:01:38 +02:00
corpi.add_texts(
makecontent(testcontetn),
makemeta(testmetda)
)
2017-10-16 14:01:38 +02:00
print(corpi)
2017-10-10 14:42:09 +02:00
"""
2017-10-11 17:16:04 +02:00
import pickle
2017-10-10 14:42:09 +02:00
2017-10-11 17:16:04 +02:00
def save_obj(obj, path):
    """Serialize *obj* to ``<path>.pkl`` with the highest pickle protocol."""
    target = path + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)
2017-10-10 14:42:09 +02:00
2017-10-11 17:16:04 +02:00
def load_obj(path ):
    """Deserialize and return the object stored at ``<path>.pkl``."""
    with open(path + '.pkl', 'rb') as handle:
        return pickle.load(handle)
2017-10-16 14:01:38 +02:00
def load_corpus(corpus_path, corpus_name, lang="de"):
    """Rebuild a textacy corpus saved as <name>_content.bin + <name>_meta.json.

    The spaCy parser's string store and lexemes are restored from
    ``<corpus_path><lang>_parser/vocab/`` first, so the binary docs can be
    decoded against the same vocabulary they were written with.
    """
    from pathlib import Path

    # Recreate the parser and restore its serialized vocabulary.
    parser = spacy.load(lang)
    stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
    with open(stringstorepath) as strings_file:
        parser.vocab.strings.load(strings_file)
    lexemes_path = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
    parser.vocab.load_lexemes(lexemes_path)

    corpus = textacy.Corpus(parser)

    # Stream serialized docs and their metadata side by side and re-add
    # each pair to the fresh corpus.
    metapath = corpus_path + corpus_name + "_meta.json"
    contentpath = corpus_path + corpus_name + "_content.bin"
    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus
import os

# Absolute paths to stopword list files; `d` is the third German file and
# `c` is the English one (the letter order is odd — TODO confirm intentional).
a = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_1.txt"
b = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_2.txt"
d = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/de_stopwords_3.txt"
c = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/en_stopwords_1.txt"
liste = [a,b,c,d]
# Keep only files whose basename matches "de_stopwords_*" (drops the English file).
de_filepaths = [path for path in liste if os.path.basename(path).split("_")[0]=='de' and os.path.basename(path).split("_")[1]=='stopwords']
from nltk.corpus import stopwords as nltk_stopwords
from stop_words import get_stop_words
import spacy
from miscellaneous import *
# from packages: collect German stopwords from three libraries.
de_stop_words1 = list(get_stop_words("de"))
de_stop_words2 = list(nltk_stopwords.words('german'))
# spacy.de is imported dynamically just to reach its STOP_WORDS set.
de_stop_words3 = list(__import__("spacy.de", globals(), locals(), ['object']).STOP_WORDS)
# from files
de_stop_words_list = [list(textacy.fileio.read_file_lines(path)) for path in de_filepaths]
# Flatten the per-file lists into one list of words.
de_stop_words4 = [item for sublist in de_stop_words_list for item in sublist]
#print(de_stop_words4)
# Merge all sources, normalize whitespace, apply replaceRockDots (from the
# `miscellaneous` star import — presumably maps ä/ö/ü/ß to ae/oe/ue/ss; verify
# there), and deduplicate via set().
de_stop_words = list(set(map(replaceRockDots(),list(map(textacy.preprocess.normalize_whitespace, de_stop_words1 + de_stop_words2 + de_stop_words3 + de_stop_words4)))))
print(len(de_stop_words))
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/testra.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_testra.log &"
# THESAURUS
# Input XML for build_thesaurus(); the first `lexicalentries` assignment
# (the *_small.xml test file) is immediately overwritten by the full file.
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
def build_thesaurus(path2lexicalentries):  #, path2synsets):
    """Build a word -> main-synonym mapping from a WordNet-LMF style XML file.

    Each LexicalEntry contributes one normalized written form plus the synset
    ids its sub-elements carry; a word's "main synonym" is the first word
    collected for the word's first synset id.

    :param path2lexicalentries: path to the lexical-entries XML file
    :return: dict mapping normalized word -> its main synonym
    """
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    #syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
    lexroot = lextree.getroot()
    #synroot = syntree.getroot()

    # word -> list of synset ids, e.g. {"w1": ["s1", "s2"]}
    word2synsets = {}
    for ro in lexroot:
        for elem in ro:
            if elem.tag != "LexicalEntry":
                continue
            lex_dictlist = [subentry.attrib for subentry in elem]
            synlist = []
            string = "WORD"  # placeholder in case no writtenForm is present
            for lex_dict in lex_dictlist:
                if "synset" in lex_dict:
                    synlist.append(lex_dict["synset"])
                if 'writtenForm' in lex_dict:
                    string = (lex_dict["writtenForm"])

                    # replace umlauts/ß with ASCII digraphs ("rock dots")
                    string = re.sub(r'[ß]', "ss", string)
                    string = re.sub(r'[ö]', "oe", string)
                    string = re.sub(r'[ü]', "ue", string)
                    string = re.sub(r'[ä]', "ae", string)

                    # drop all dots
                    string = re.sub(r'[.]', "", string)

                    # remove parenthesized additions
                    string = re.sub(r"\((.*)\)", " ", string)

                    # normalize runs of whitespace
                    string = textacy.preprocess.normalize_whitespace(string)

                    string = string.lower().strip()
            word2synsets[string] = synlist

    # synset id -> words that belong to it, e.g. {"s1": ["w1", "w2"]}
    synset2Words = {}
    for word, synset in word2synsets.items():
        for syn in synset:
            synset2Words.setdefault(syn, []).append(word)

    # sort each word's synset-id list by token count
    # NOTE(review): synset ids are single tokens, so this sort is a no-op;
    # the comment suggests the intent was to sort synset2Words values by
    # word count instead — confirm before changing behavior.
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))

    # word -> main synonym; assumption: the first word collected for the
    # first synset is the main synonym.
    thesaurus = {}
    for word, synset in word2synsets.items():
        try:
            thesaurus[word] = synset2Words[synset[0]][0]
        except (IndexError, KeyError):
            # entry has no synsets, or its synset never collected a word
            pass
    return thesaurus
2017-10-11 17:16:04 +02:00
"""
for r in synroot:
for element in r:
2017-10-11 17:16:04 +02:00
if element.tag == "Synset":
synset = []
attrib = element.attrib
id = attrib["id"]
2017-10-11 17:16:04 +02:00
if id not in synset2Words.keys():
synset2Words[id] = "WORD"
"""
2017-10-11 17:16:04 +02:00
"""
2017-10-10 14:42:09 +02:00
from postal.parser import parse_address
address = "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480"
print(parse_address(address))
address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund "
print(parse_address(address))
2017-10-11 17:16:04 +02:00
"""
2017-10-10 14:42:09 +02:00
"""
2017-10-16 14:01:38 +02:00
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "testcorpus"
2017-10-10 14:42:09 +02:00
2017-10-16 14:01:38 +02:00
#corpi.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpi = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
import pathlib
strings_path = pathlib.Path(corpus_path + 'strings.json')
path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')
PARSER.vocab.dump(path_lexemes_bin_)
nlp.vocab.load_lexemes(path_lexemes_bin_)
2017-10-10 14:42:09 +02:00
def save_corpus(corpus_path,corpus_name):
# save stringstore
stringstore_path = corpus_path + corpus_name + '_strings.json'
with open(stringstore_path, "w") as file:
PARSER.vocab.strings.dump(file)
#save content
contentpath = corpus_path + corpus_name+ "_content.bin"
2017-10-16 14:01:38 +02:00
textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpi),contentpath)
#save meta
metapath = corpus_path + corpus_name +"_meta.json"
2017-10-16 14:01:38 +02:00
textacy.fileio.write_json_lines((doc.metadata for doc in corpi), metapath)
def load_corpus(corpus_path,corpus_name):
# load new lang
nlp = spacy.load("de")
#load stringstore
stringstore_path = corpus_path + corpus_name + '_strings.json'
with open(stringstore_path,"r") as file:
nlp.vocab.strings.load(file)
2017-10-16 14:01:38 +02:00
# define corpi
corpi = textacy.Corpus(nlp)
# load meta
metapath = corpus_path + corpus_name +"_meta.json"
metadata_stream = textacy.fileio.read_json_lines(metapath)
#load content
contentpath = corpus_path + corpus_name+ "_content.bin"
2017-10-16 14:01:38 +02:00
spacy_docs = textacy.fileio.read_spacy_docs(corpi.spacy_vocab, contentpath)
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
2017-10-16 14:01:38 +02:00
corpi.add_doc(
textacy.Doc(spacy_doc, lang=corpi.spacy_lang, metadata=metadata))
2017-10-16 14:01:38 +02:00
return corpi
save_corpus(corpus_path,corpus_name)
print(load_corpus(corpus_path,corpus_name))
2017-10-10 14:42:09 +02:00
"""
2017-09-25 13:12:23 +02:00
"""
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
#return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))
def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
if not isinstance(word, str):
return str(word)
word = word.lower()
# durch den thesaurrus iterieren
for syn_block in thesaurus: # syn_block ist eine liste mit Synonymen
for syn in syn_block:
syn = syn.lower()
if re.match(r'\A[\w-]+\Z', syn): # falls syn einzelwort ist
if word == syn:
return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
else: # falls es ein satz ist
if word in syn:
return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
return str(word) # zur Not, das ursrpüngliche Wort zurückgeben
def getHauptform(syn_block, word, default_return_first_Syn=False):
for syn in syn_block:
syn = syn.lower()
if "hauptform" in syn and len(syn.split(" ")) <= 2:
# nicht ausgeben, falls es in Klammern steht#todo gibts macnmal?? klammern aus
for w in syn.split(" "):
if not re.match(r'\([^)]+\)', w):
return w
if default_return_first_Syn:
# falls keine hauptform enthalten ist, das erste Synonym zurückgeben, was kein satz ist und nicht in klammern steht
for w in syn_block:
if not re.match(r'\([^)]+\)', w):
return w
return word # zur Not, das ursrpüngliche Wort zurückgeben
"""
2017-09-25 13:12:23 +02:00
"""
path2xml="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for r in root:
for element in r:
if element.tag == "Synset":
attrib = element.attrib
for i,subentry in enumerate(element):
if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
string = (subentry.attrib["writtenForm"])
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# seperate_words_on_regex:
string = " ".join(re.compile(regex_specialChars).split(string))
string_list=string.split()
if len(string_list) == 1:
nomen.append(string.lower().strip())
"""
"""
2017-10-02 14:31:33 +02:00
import re
from collections import Counter
2017-09-25 13:12:23 +02:00
2017-10-02 14:31:33 +02:00
def words(text): return re.findall(r'\w+', text.lower())
2017-09-25 13:12:23 +02:00
2017-10-02 14:31:33 +02:00
WORDS = Counter(words(open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt').read()))
2017-09-25 13:12:23 +02:00
2017-10-02 14:31:33 +02:00
def P(word, N=sum(WORDS.values())):
"Probability of `word`."
return WORDS[word] / N
2017-09-25 13:12:23 +02:00
2017-10-02 14:31:33 +02:00
def correction(word):
"Most probable spelling correction for word."
return max(candidates(word), key=P)
2017-09-25 13:12:23 +02:00
2017-10-02 14:31:33 +02:00
def candidates(word):
"Generate possible spelling corrections for word."
return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
2017-09-25 13:12:23 +02:00
2017-10-02 14:31:33 +02:00
def known(words):
"The subset of `words` that appear in the dictionary of WORDS."
return set(w for w in words if w in WORDS)
2017-09-25 13:12:23 +02:00
2017-10-02 14:31:33 +02:00
def edits1(word):
"All edits that are one edit away from `word`."
letters = 'abcdefghijklmnopqrstuvwxyz'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)
2017-09-25 13:12:23 +02:00
2017-10-02 14:31:33 +02:00
def edits2(word):
"All edits that are two edits away from `word`."
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
2017-09-25 13:12:23 +02:00
"""
2017-09-25 13:12:23 +02:00
"""
### extract from derewo
2017-09-25 13:12:23 +02:00
#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
2017-09-25 13:12:23 +02:00
raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
2017-09-25 13:12:23 +02:00
for line in raw:
line_list=line.split()
if line_list[2] == "NN":
string = line_list[1].lower()
2017-09-25 13:12:23 +02:00
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
2017-09-25 13:12:23 +02:00
nomen.append(string.lower().strip())
textacy.fileio.write_file_lines(nomen,"nomen2.txt")
2017-09-25 13:12:23 +02:00
"""
"""
2017-09-21 12:05:32 +02:00
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
content_collumn_name = "Description"
content_collumn = 9 # standardvalue
de_tickets=[]
en_tickets=[]
misc_tickets=[]
error_count = 0
for i, lst in enumerate(stream):
if i == 0:
de_tickets.append(lst)
en_tickets.append(lst)
misc_tickets.append(lst)
else:
try:
content_collumn_ = lst[content_collumn]
if detect(content_collumn_) == "de":
de_tickets.append(lst)
elif detect(content_collumn_) == "en":
en_tickets.append(lst)
else:
misc_tickets.append(lst)
except:
misc_tickets.append(lst)
error_count += 1
print(error_count)
textacy.fileio.write_csv(de_tickets,"M42-Export/de_tickets.csv", delimiter=";")
textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")
textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")
2017-09-25 13:12:23 +02:00
"""
"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
def stringcleaning(stringstream, funclist):
for string in stringstream:
for f in funclist:
string = f(string)
yield string
def seperate_words_on_regex(regex=regex_specialChars):
return lambda string: " ".join(re.compile(regex).split(string))
words = [
"uniaccount",
"nr54065467",
"nr54065467",
"455a33c5,"
"tvt?=",
"tanja.saborowski@tu-dortmund.de",
"-",
"m-sw1-vl4053.itmc.tu-dortmund.de",
"------problem--------"
]
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
print(s.strip())
#print(stringcleaning(w,string_comp))
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
#result = specialFinder.sub(" ", w)
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
2017-09-20 15:22:13 +02:00
"""
"""
2017-09-21 12:05:32 +02:00
def replaceRockDots():
return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
2017-09-20 15:22:13 +02:00
2017-09-21 12:05:32 +02:00
de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))
2017-09-20 15:22:13 +02:00
#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
#print(blob.entities)
de_stop_words = list(map(replaceRockDots(),de_stop_words))
2017-09-21 12:05:32 +02:00
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))
2017-09-20 15:22:13 +02:00
de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
2017-09-21 12:05:32 +02:00
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))
2017-09-20 15:22:13 +02:00
2017-09-21 12:05:32 +02:00
#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
2017-09-20 15:22:13 +02:00
2017-09-21 12:05:32 +02:00
"""
2017-09-20 15:22:13 +02:00
# Report total wall-clock runtime of the whole script (paired with the
# `start = time.time()` taken at import time near the top of the file).
end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))
2017-10-02 14:31:33 +02:00