602 lines
22 KiB
Python
602 lines
22 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
import re
|
|
import time
|
|
import json
|
|
|
|
#import spacy
|
|
#import textacy
|
|
from functools import reduce
|
|
|
|
import textacy
|
|
|
|
start = time.time()
|
|
|
|
import enchant
|
|
|
|
from datetime import datetime
|
|
import os
|
|
import xml.etree.ElementTree as ET
|
|
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
|
|
from miscellaneous import *
|
|
|
|
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"
|
|
|
|
# Load the spaCy model registered under the name "de".
# NOTE(review): `import spacy` is commented out further up, so `spacy` has to be
# provided by `from miscellaneous import *` - confirm, otherwise this raises NameError.
parser = spacy.load("de")
|
|
|
|
|
|
|
|
"""
|
|
# load config
|
|
config_ini = FILEPATH + "config.ini"
|
|
|
|
config = ConfigParser.ConfigParser()
|
|
with open(config_ini) as f:
|
|
config.read_file(f)
|
|
|
|
|
|
PARSER=spacy.load("de")
|
|
|
|
|
|
corpi = textacy.Corpus(PARSER)
|
|
|
|
testcontetn = [
|
|
"fdsfdsfsd",
|
|
"juzdtjlkö",
|
|
"gfadojplk"
|
|
]
|
|
|
|
testmetda = [
|
|
{"categoryName":"zhb","Solution":"","Subject":"schulungstest"},
|
|
{"categoryName":"neuanschluss","Solution":"subject","Subject":"telephone contract"},
|
|
{"categoryName":"zhb","Solution":"","Subject":"setuji"}
|
|
]
|
|
|
|
|
|
def makecontent(testcontetn):
    # Lazily hand out the given content items one by one, in their original order.
    yield from testcontetn
|
|
|
|
|
|
def makemeta(testmetda):
    # Lazily hand out the given metadata records one by one, in their original order.
    yield from testmetda
|
|
|
|
|
|
def corpus2Text(corpus):
    # Stream the `.text` attribute of every document in `corpus`, in iteration order.
    yield from (document.text for document in corpus)
|
|
|
|
corpi.add_texts(
|
|
makecontent(testcontetn),
|
|
makemeta(testmetda)
|
|
)
|
|
corpus_de_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/test/"
|
|
rawCorpus_name = "de_test_ticket"
|
|
print(corpi)
|
|
|
|
#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name)
|
|
|
|
#textacy.fileio.write_file_lines(corpus2Text(corpi), filepath=corpus_de_path+"plain.txt")
|
|
|
|
|
|
dict = {"unicard redaktionsteam": 189, "kms": 131, "itmc_st\u00f6rungen": 17, "benutzerverwaltung_probleme": 168, "mailverteiler exchange": 130, "beamer": 70, "cws_confluence": 190, "benutzerverwaltung": 26, "sos": 166, "virtuelle server": 116, "sap": 7, "wlan": 21, "lsf": 6, "gastaufenthalt": 8, "umzug": 5, "firewall betreuung": 129, "ausleihe": 39, "fiona": 10, "kursplanung": 195, "schulungsraum verwaltung": 200, "plagiatserkennung": 32, "designentwicklung": 100, "ub basis it": 184, "tsm": 51, "backup tsm": 110, "raumkalender": 174, "veeam": 149, "linux bs": 42, "hochleistungsrechnen": 90, "e learning": 37, "h\u00f6rsaal\u00fcbertragung": 52, "sophos": 88, "service portal redaktion": 182, "verkauf": 93, "fk 16": 30, "campus app": 54, "dns": 71, "kurse": 196, "itmc schulungsr\u00e4ume": 96, "leitung": 91, "telefon": 14, "housing": 135, "softwarelizenzen": 35, "hcm stammdaten": 68, "semesterticket": 197, "exchange nutzung": 33, "mediendienste": 167, "sam spider": 172, "pvp": 27, "webserver": 29, "werkvertr\u00e4ge": 158, "ibz raumbuchung": 177, "webmailer": 126, "unicard sperrung": 64, "cd dvd produktion": 114, "lizenzserver": 92, "pr\u00fcfungsmanagement": 38, "blogs wikis foren": 87, "unicard ausgabe": 161, "pools": 157, "desktop & basisdienste": 144, "antrag auf rechnungserstellung": 193, "mailalias": 121, "evaexam": 133, "neuanschluss": 0, "mobilfunkvertr\u00e4ge": 69, "ftp server": 191, "haustechnik": 77, "raumbuchungssysteme": 186, "confluence": 181, "uniaccount zugangsdaten": 47, "itmc medienr\u00e4ume ef50": 171, "dokoll support": 128, "elektronisches telefonbuch": 3, "softwareverteilung": 153, "overhead projektor": 104, "sicherheit": 145, "itmc_als": 48, "itmc pools": 160, "zhb": 60, "serversupport": 101, "veranstaltungen": 61, "fk12 webauftritt": 138, "hardware": 142, "unicard produktion": 156, "telefonkonferenzen": 170, "dhcp": 188, "zertifikate server dfn": 139, "lan": 1, "datanet": 49, "neuausstattung": 173, "moodle": 16, "abmeldung": 13, "uni mail": 
15, "medienr\u00e4ume ef50": 117, "verschiedene aufgaben": 40, "zentrale webserver": 75, "vorlesungsaufzeichnung": 152, "grafik": 132, "campus management": 72, "hacker angriff": 46, "pos": 23, "zugangsdaten": 41, "serviceportal": 63, "ews": 24, "voicemail box": 150, "service desk itmc": 74, "test": 180, "beschaffung": 57, "bestellung": 185, "vpn": 55, "app feedback": 66, "allgemein": 134, "rundmail": 105, "telefonabrechnung": 199, "limesurvey": 31, "unicard": 28, "eldorado": 140, "uniaccount": 12, "plotter": 125, "mdm mobile device management": 120, "namens\u00e4nderung": 43, "sd": 84, "basis applikationen": 103, "\u00e4nderung": 194, "fileserver einrichtung": 187, "fk14_test": 154, "werkst\u00e4tte": 179, "itmc_aufgaben": 45, "formulare antr\u00e4ge": 81, "facility": 192, "web": 169, "asknet": 136, "server storage": 113, "mail groupware": 20, "rektorat -b\u00fcro": 178, "office": 50, "werkstoffe lehrstuhl bauwesen": 59, "telefonzentrale": 115, "verwaltung": 4, "netze": 22, "beantragung": 82, "d.3 dms": 148, "redmine projektverwaltung": 141, "wsus": 106, "lido": 118, "rechnerr\u00e4ume": 143, "matrix42_hilfe": 18, "boss service desk": 44, "konteneinsicht": 62, "spam phishing": 53, "forensic": 164, "fk 12": 11, "benutzungsverwaltung": 198, "redmine": 79, "basis app": 85, "viren": 95, "fk12 migration": 155, "raumbuchung": 109, "virtuelle desktops citrix": 176, "outlook_einrichtung": 123, "kundenserver": 137, "nrw ticket": 80, "weiterentwicklung": 127, "siport zugangskontrolle": 98, "e mail dienste": 99, "vorlagenerstellung": 36, "video": 19, "studierendensekretariat": 111, "it sicherheit sic": 86, "boss": 25, "technik": 58, "dokoll pvp": 112, "betrieb": 2, "v2 campus app feedback": 151, "mailverteiler": 108, "videoschnitt": 119, "fk raumplanung 09": 9, "sap urlaub": 73, "keine r\u00fcckantwort": 124, "prozess- und projektmanagement": 67, "dienstreise": 34, "webgestaltung": 78, "schulung": 175, "software": 89, "medientechnik": 76, "servicedesk": 107, "service portal": 
94, "software entwicklung": 165, "uniflow": 159, "ub_st\u00f6rungen": 162, "fk15": 183, "uhren": 83, "entwicklung": 163, "videokonferenzen": 97, "itmc webauftritt": 102, "joomla itmc website": 147, "changes": 122, "visitenkartenproduktion": 65, "lizenzmanagement": 146, "tonerb\u00f6rse": 201, "arbeitsplatzsupport": 56}
|
|
|
|
list = [(key,value) for key,value in dict.items()]
|
|
|
|
list.sort(key=lambda tup : tup[1])
|
|
"""
|
|
"""
|
|
from spacy.tokens.doc import Doc as SpacyDoc
|
|
|
|
filepath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/de_clean_ticket_content.bin"
|
|
|
|
# load parser
|
|
parser = spacy.load("de")
|
|
|
|
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
|
|
|
|
stringstorepath = corpus_path + 'de_parser/vocab/strings.json'
|
|
with open(stringstorepath) as file:
|
|
parser.vocab.strings.load(file)
|
|
|
|
vocabpath = Path(corpus_path + 'de_parser/vocab/lexemes.bin')
|
|
parser.vocab.load_lexemes(vocabpath)
|
|
|
|
spacy_vocab = parser.vocab
|
|
|
|
def readCorpus(filepath):
    # Stream the text of every serialized doc found in the binary file at `filepath`.
    # Each byte string is turned back into a SpacyDoc using the module-level `spacy_vocab`.
    with open_sesame(filepath, mode='rb') as f:
        for bytes_string in SpacyDoc.read_bytes(f):
            restored_doc = SpacyDoc(spacy_vocab).from_bytes(bytes_string)
            yield restored_doc.text
|
|
|
|
|
|
textacy.fileio.write_file_lines(readCorpus(filepath),"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/result.txt")
|
|
"""
|
|
|
|
|
|
|
|
# load raw corpus and create new one
|
|
#raw_corpus, parser = load_corpusV2(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)
|
|
|
|
#printRandomDoc(raw_corpus)
|
|
|
|
|
|
"""
|
|
spacy_doc = PARSER("test")
|
|
save_obj(spacy_doc, "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
|
|
|
|
spacy_doc2 = load_obj("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
|
|
|
|
print("Doc: {0}".format(spacy_doc2))
|
|
|
|
|
|
|
|
jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/"
|
|
|
|
LLDA_filepath = "{0}labeldict.txt".format(jgibbsLLDA_root)
|
|
laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_hilfe': 18, 'sap': 7, 'pos': 23, 'verwaltung': 4, 'lan': 1}
|
|
with open(LLDA_filepath, 'w') as file:
|
|
file.write(json.dumps(laveldict))
|
|
"""
|
|
"""
|
|
def load_corpus(corpus_path, corpus_name, lang="de"):
    # Rebuild a textacy corpus from files below `corpus_path`:
    #   <lang>_parser/vocab/strings.json and <lang>_parser/vocab/lexemes.bin for the parser,
    #   <corpus_name>_content.bin and <corpus_name>_meta.json for the documents.
    # Paths are built by plain string concatenation, so `corpus_path` must already end
    # with a path separator.
    from pathlib import Path

    parser_dir = corpus_path + str(lang) + '_parser'

    # parser with the stored string store and lexemes loaded into its vocabulary
    parser = spacy.load(lang)
    with open(parser_dir + '/vocab/strings.json') as file:
        parser.vocab.strings.load(file)
    parser.vocab.load_lexemes(Path(parser_dir + '/vocab/lexemes.bin'))

    corpus = textacy.Corpus(parser)

    metadata_stream = textacy.fileio.read_json_lines(corpus_path + corpus_name + "_meta.json")
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, corpus_path + corpus_name + "_content.bin")

    # documents and metadata records are paired by position
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        document = textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata)
        corpus.add_doc(document)
    return corpus
|
|
"""
|
|
|
|
|
|
"""
|
|
# THESAURUS
|
|
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
|
|
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
|
|
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
|
|
|
|
def _normalize_written_form(string):
    # Normalise one writtenForm: lower-case it, spell out umlauts and sharp s, drop dots
    # and parenthesised parts, and collapse whitespace.
    string = string.lower()  # lower-case first so that capital umlauts are spelled out too
    for umlaut, replacement in (("ß", "ss"), ("ö", "oe"), ("ü", "ue"), ("ä", "ae")):
        string = string.replace(umlaut, replacement)

    # remove all dots
    string = string.replace(".", "")

    # remove every parenthesised group on its own; a greedy match would also swallow
    # the text between two groups
    string = re.sub(r"\([^)]*\)", " ", string)

    # normalise longer runs of whitespace
    string = textacy.preprocess.normalize_whitespace(string)
    return string.strip()


def build_thesaurus(path2lexicalentries):#, path2synsets):
    # Build a {word: main synonym} dict from a lexical-entries XML file.
    #
    # path2lexicalentries: file name or file object that is handed to ET.parse().
    # Returns a dict that maps every normalised written form belonging to at least one
    # synset to the first word of its first synset; within a synset, words made of fewer
    # tokens come first.
    #
    # Layout as read by the loops below: root -> container -> "LexicalEntry" elements,
    # whose children carry a "writtenForm" and/or a "synset" attribute.
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    lexroot = lextree.getroot()

    # word -> ids of the synsets it belongs to, e.g. {"w1": ["s1", "s2"]}
    word2synsets = {}
    for container in lexroot:
        for elem in container:
            if elem.tag != "LexicalEntry":
                continue

            synlist = []
            string = "WORD"  # placeholder key for entries without a writtenForm
            for lex_dict in (subentry.attrib for subentry in elem):
                if "synset" in lex_dict:
                    synlist.append(lex_dict["synset"])
                if "writtenForm" in lex_dict:
                    string = _normalize_written_form(lex_dict["writtenForm"])

            word2synsets[string] = synlist

    # synset id -> words of that synset, e.g. {"s1": ["w1", "w2"]}
    synset2Words = {}
    for word, synsets in word2synsets.items():
        for syn in synsets:
            synset2Words.setdefault(syn, []).append(word)

    # Sort the words of every synset by their number of tokens, so that a single word is
    # preferred over a phrase as main synonym. (Sorting the synset-id lists, as before,
    # had no effect because ids contain no whitespace.)
    for synonyms in synset2Words.values():
        synonyms.sort(key=lambda x: len(x.split()))

    # word -> main synonym, e.g. {"w1": "mainsyn"}
    thesaurus = {}
    for word, synsets in word2synsets.items():
        if synsets:  # words without any synset get no entry
            # assumption: the first synonym is the main synonym
            thesaurus[word] = synset2Words[synsets[0]][0]
    return thesaurus

    # NOTE(review): leftover loop that followed the return statement. It could never run:
    # `synroot` is not assigned anywhere because the synset file is not parsed.
    # for r in synroot:
    #     for element in r:
    #         if element.tag == "Synset":
    #             synset = []
    #             attrib = element.attrib
    #             id = attrib["id"]
    #             if id not in synset2Words.keys():
    #                 synset2Words[id] = "WORD"
|
|
|
|
"""
|
|
|
|
"""
|
|
from postal.parser import parse_address
|
|
|
|
|
|
address = "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480"
|
|
print(parse_address(address))
|
|
|
|
|
|
address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund "
|
|
print(parse_address(address))
|
|
"""
|
|
|
|
"""
|
|
|
|
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
|
|
corpus_name = "testcorpus"
|
|
|
|
|
|
#corpi.save(corpus_path, name=corpus_name, compression=corpus_compression)
|
|
#corpi = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
|
|
|
|
|
|
|
|
import pathlib
|
|
|
|
strings_path = pathlib.Path(corpus_path + 'strings.json')
|
|
path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')
|
|
|
|
PARSER.vocab.dump(path_lexemes_bin_)
|
|
nlp.vocab.load_lexemes(path_lexemes_bin_)
|
|
|
|
|
|
def save_corpus(corpus_path,corpus_name):
    # Write the module-level corpus `corpi` to three files that all share the prefix
    # <corpus_path><corpus_name>: _strings.json, _content.bin and _meta.json.
    target_prefix = corpus_path + corpus_name

    # string store of the module-level PARSER vocabulary
    with open(target_prefix + '_strings.json', "w") as file:
        PARSER.vocab.strings.dump(file)

    # content: the spaCy docs of all documents
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpi), target_prefix + "_content.bin")

    # metadata of all documents
    textacy.fileio.write_json_lines((doc.metadata for doc in corpi), target_prefix + "_meta.json")
|
|
|
|
|
|
|
|
def load_corpus(corpus_path,corpus_name):
    # Rebuild a textacy corpus from the three files that share the prefix
    # <corpus_path><corpus_name>: _strings.json, _meta.json and _content.bin.
    source_prefix = corpus_path + corpus_name

    # fresh "de" pipeline with the stored string store loaded into its vocabulary
    nlp = spacy.load("de")
    with open(source_prefix + '_strings.json',"r") as file:
        nlp.vocab.strings.load(file)

    corpi = textacy.Corpus(nlp)

    metadata_stream = textacy.fileio.read_json_lines(source_prefix + "_meta.json")
    spacy_docs = textacy.fileio.read_spacy_docs(corpi.spacy_vocab, source_prefix + "_content.bin")

    # documents and metadata records are paired by position
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        document = textacy.Doc(spacy_doc, lang=corpi.spacy_lang, metadata=metadata)
        corpi.add_doc(document)

    return corpi
|
|
|
|
|
|
save_corpus(corpus_path,corpus_name)
|
|
|
|
print(load_corpus(corpus_path,corpus_name))
|
|
|
|
"""
|
|
|
|
"""
|
|
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
    # Build a callable that replaces every token of a doc by the result of
    # getFirstSynonym() (looked up in THESAURUS) and parses the space-joined result.
    def normalize(doc):
        replaced = (
            getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn)
            for tok in doc
        )
        return parser(" ".join(replaced))

    return normalize
|
|
|
|
def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
    # Look `word` up in `thesaurus` and return the main form of the first synonym block
    # that contains it.
    #
    # word: the term to normalise; a non-string value is returned as str(word) unchanged.
    # thesaurus: iterable of synonym blocks, each an iterable of synonym strings.
    # default_return_first_Syn: passed through to getHauptform().
    # Returns a str: the result of getHauptform() for the first matching block, or the
    # lower-cased word when no block matches.
    if not isinstance(word, str):
        return str(word)

    word = word.lower()
    if not word:
        # an empty string is contained in every phrase and must never select a block
        return word

    # Inside a phrase the word has to appear as a whole term, not as a fragment of a
    # longer word (plain substring matching made "in" match "hinein gehen").
    as_whole_term = re.compile(r'(?<![\w-])' + re.escape(word) + r'(?![\w-])')

    # iterate over the thesaurus
    for syn_block in thesaurus:  # syn_block is a list of synonyms
        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # syn is a single word
                matched = word == syn
            else:  # syn is a phrase
                matched = as_whole_term.search(syn) is not None
            if matched:
                return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
    return str(word)  # fall back to the original word
|
|
|
|
def getHauptform(syn_block, word, default_return_first_Syn=False):
    # Pick the main form for `word` out of the synonym block `syn_block`.
    #
    # A synonym counts as marked when its lower-cased text contains "hauptform" and has at
    # most two space-separated parts; its first part that does not start with a
    # parenthesised group is returned (lower-cased).
    # Without such a hit and with default_return_first_Syn set, the first synonym of the
    # block that does not start with a parenthesised group is returned as it is.
    # Otherwise `word` is returned unchanged.
    in_parens = re.compile(r'\([^)]+\)')

    for entry in syn_block:
        lowered = entry.lower()
        if "hauptform" not in lowered:
            continue
        parts = lowered.split(" ")
        if len(parts) > 2:
            continue
        for part in parts:
            if in_parens.match(part) is None:
                return part

    if default_return_first_Syn:
        for entry in syn_block:
            if in_parens.match(entry) is None:
                return entry

    return word  # fall back to the original word
|
|
"""
|
|
|
|
"""
|
|
path2xml="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"
|
|
|
|
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
|
|
root = tree.getroot()
|
|
|
|
for r in root:
|
|
for element in r:
|
|
|
|
if element.tag == "Synset":
|
|
attrib = element.attrib
|
|
for i,subentry in enumerate(element):
|
|
if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
|
|
string = (subentry.attrib["writtenForm"])
|
|
# replaceRockDots
|
|
string = re.sub(r'[ß]', "ss", string)
|
|
string = re.sub(r'[ö]', "oe", string)
|
|
string = re.sub(r'[ü]', "ue", string)
|
|
string = re.sub(r'[ä]', "ae", string)
|
|
|
|
# seperate_words_on_regex:
|
|
string = " ".join(re.compile(regex_specialChars).split(string))
|
|
string_list=string.split()
|
|
if len(string_list) == 1:
|
|
nomen.append(string.lower().strip())
|
|
"""
|
|
|
|
"""
|
|
import re
|
|
from collections import Counter
|
|
|
|
def words(text):
    "Lower-case `text` and return all of its runs of word characters, in order."
    lowered = text.lower()
    return re.compile(r'\w+').findall(lowered)
|
|
|
|
# Word frequencies of the reference text file. The file is read inside a `with` block so
# that the handle is closed again instead of being left to the garbage collector.
with open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))
|
|
|
|
def P(word, N=sum(WORDS.values())):
    "Count of `word` in WORDS divided by N (by default the sum of all counts)."
    occurrences = WORDS[word]
    return occurrences / N
|
|
|
|
def correction(word):
    "Return the candidate for `word` that P() rates highest."
    options = candidates(word)
    return max(options, key=lambda option: P(option))
|
|
|
|
def candidates(word):
    "Known candidates for `word`: the word itself, else one edit away, else two, else `[word]`."
    exact = known([word])
    if exact:
        return exact
    one_edit = known(edits1(word))
    if one_edit:
        return one_edit
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits
    return [word]
|
|
|
|
def known(words):
    "The members of `words` that are keys of WORDS, as a set."
    return {candidate for candidate in words if candidate in WORDS}
|
|
|
|
def edits1(word, letters='abcdefghijklmnopqrstuvwxyzäöüß'):
    "All edits that are one edit away from `word`."
    # `letters` is the alphabet used for replacements and insertions. The default includes
    # the German umlauts and sharp s: words() keeps them (its pattern matches any word
    # character), so with a-z only no edit could ever produce a word such as "für".
    # Pass a different string to work with another alphabet.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)
|
|
|
|
def edits2(word):
    "All edits that are two edits away from `word`."
    first_round = edits1(word)
    return (second for first in first_round for second in edits1(first))
|
|
|
|
"""
|
|
|
|
"""
|
|
### extract from derewo
|
|
|
|
#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
|
|
|
|
|
|
raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
|
|
|
|
for line in raw:
|
|
line_list=line.split()
|
|
if line_list[2] == "NN":
|
|
string = line_list[1].lower()
|
|
|
|
# replaceRockDots
|
|
string = re.sub(r'[ß]', "ss", string)
|
|
string = re.sub(r'[ö]', "oe", string)
|
|
string = re.sub(r'[ü]', "ue", string)
|
|
string = re.sub(r'[ä]', "ae", string)
|
|
|
|
|
|
nomen.append(string.lower().strip())
|
|
|
|
|
|
textacy.fileio.write_file_lines(nomen,"nomen2.txt")
|
|
"""
|
|
|
|
"""
|
|
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
|
|
content_collumn_name = "Description"
|
|
content_collumn = 9 # standardvalue
|
|
|
|
de_tickets=[]
|
|
en_tickets=[]
|
|
misc_tickets=[]
|
|
|
|
error_count = 0
|
|
for i, lst in enumerate(stream):
|
|
if i == 0:
|
|
de_tickets.append(lst)
|
|
en_tickets.append(lst)
|
|
misc_tickets.append(lst)
|
|
else:
|
|
try:
|
|
content_collumn_ = lst[content_collumn]
|
|
if detect(content_collumn_) == "de":
|
|
de_tickets.append(lst)
|
|
elif detect(content_collumn_) == "en":
|
|
en_tickets.append(lst)
|
|
else:
|
|
misc_tickets.append(lst)
|
|
|
|
except:
|
|
misc_tickets.append(lst)
|
|
error_count += 1
|
|
|
|
print(error_count)
|
|
|
|
textacy.fileio.write_csv(de_tickets,"M42-Export/de_tickets.csv", delimiter=";")
|
|
textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")
|
|
textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")
|
|
|
|
|
|
"""
|
|
|
|
"""
|
|
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
|
|
|
|
|
|
def stringcleaning(stringstream, funclist):
    # Lazily yield every string of `stringstream` after passing it through all functions
    # of `funclist`, applied from left to right.
    for raw in stringstream:
        yield reduce(lambda current, func: func(current), funclist, raw)
|
|
|
|
|
|
def seperate_words_on_regex(regex=regex_specialChars):
    # Build a function that cuts a string at every match of `regex` and joins the
    # resulting pieces with single spaces.
    def separate(string):
        pieces = re.split(regex, string)
        return " ".join(pieces)

    return separate
|
|
|
|
|
|
# Sample tokens that are run through the cleaning functions below.
# "455a33c5," and "tvt?=" are two separate items; the missing comma between the two
# literals used to merge them into the single string "455a33c5,tvt?=".
words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5,",
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------",
]
|
|
|
|
|
|
|
|
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
|
|
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
|
|
|
|
for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
|
|
print(s.strip())
|
|
|
|
#print(stringcleaning(w,string_comp))
|
|
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
|
|
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
|
|
#result = specialFinder.sub(" ", w)
|
|
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
|
|
|
|
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
|
|
"""
|
|
|
|
"""
|
|
def replaceRockDots():
    # Build a function that lower-cases a string and spells out the letters
    # ä, ö, ü and ß as ae, oe, ue and ss.
    spelled_out = str.maketrans({"ä": "ae", "ü": "ue", "ö": "oe", "ß": "ss"})
    return lambda string: string.lower().translate(spelled_out)
|
|
|
|
|
|
|
|
de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))
|
|
|
|
|
|
#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
|
|
|
|
#print(blob.entities)
|
|
|
|
de_stop_words = list(map(replaceRockDots(),de_stop_words))
|
|
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
|
|
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))
|
|
|
|
de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
|
|
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
|
|
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))
|
|
|
|
|
|
|
|
|
|
#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
|
|
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
|
|
textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
|
|
|
|
"""
|
|
|
|
# Report the wall-clock time that has passed since `start` was taken near the top of the file.
end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))
|
|
|
|
|
|
|