# Listing header: 611 lines, 24 KiB, Python
# -*- coding: utf-8 -*-
|
|
|
|
import re
|
|
import time
|
|
import json
|
|
|
|
#import spacy
|
|
#import textacy
|
|
from functools import reduce
|
|
|
|
import textacy
|
|
|
|
start = time.time()
|
|
|
|
import enchant
|
|
|
|
from datetime import datetime
|
|
import os
|
|
import xml.etree.ElementTree as ET
|
|
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
|
|
from miscellaneous import *
|
|
|
|
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"
|
|
|
|
|
|
# Load the project configuration file that sits next to this script.
config_ini = FILEPATH + "config.ini"

# NOTE(review): `ConfigParser` and `spacy` are not imported in this file;
# presumably they arrive through `from miscellaneous import *` -- confirm.
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)

# spaCy pipeline loaded under the name "de".
PARSER = spacy.load("de")

# Empty textacy corpus bound to that pipeline.
corpi = textacy.Corpus(PARSER)
|
|
|
|
# Three throwaway document bodies used as corpus input.
testcontetn = [
    "fdsfdsfsd",
    "juzdtjlkö",
    "gfadojplk",
]

# Three metadata records, one dict per document body above.
testmetda = [
    {"categoryName": "zhb", "Solution": "", "Subject": "schulungstest"},
    {"categoryName": "neuanschluss", "Solution": "subject", "Subject": "telephone contract"},
    {"categoryName": "zhb", "Solution": "", "Subject": "setuji"},
]
|
|
|
|
|
|
def makecontent(testcontetn):
    """Lazily yield every entry of *testcontetn* in order.

    Args:
        testcontetn: Any iterable of document contents.

    Yields:
        Each entry, unchanged.
    """
    yield from testcontetn
|
|
|
|
|
|
def makemeta(testmetda):
    """Lazily yield every entry of *testmetda* in order.

    Args:
        testmetda: Any iterable of metadata records.

    Yields:
        Each record, unchanged (the same object, not a copy).
    """
    yield from testmetda
|
|
|
|
|
|
def corpus2Text(corpus):
    """Yield the ``text`` attribute of every document in *corpus*.

    Args:
        corpus: Any iterable whose items expose a ``text`` attribute.

    Yields:
        ``doc.text`` for each item, in iteration order.
    """
    for doc in corpus:
        yield doc.text
|
|
|
|
# Feed the test documents and their metadata into the corpus as lazy streams.
corpi.add_texts(
    makecontent(testcontetn),
    makemeta(testmetda),
)

# NOTE(review): save_corpus is not defined in this file's live code;
# presumably it comes from `miscellaneous` -- confirm.
save_corpus(
    corpi,
    corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/test",
    corpus_name="test",
)
|
|
|
|
# Whitespace-separated word list. The literal used to be broken across two
# physical lines; the break is kept as an explicit newline character, which
# split() treats like any other whitespace.
bla = (
    "uni mail account adresse woche falsch laufen schicken gerne januar betreff herr nachricht gruesse dezernat liebe datum freitag anfrage dienstag unicard karte abholen defekt bibliothek abholung dezember beantragung status gerne portal email nummer service id vorname prozess dez schauen eg rechner mitarbeiterin benutzerkonto oktober wissenschaftliche projekt fr download hilfskraft verantwortliche link dringend antrag schnelle arbeitsplatz november admin rahmen stand geschickt server outlook ordner bild konto postfach campus hi ueberpruefung sued beste daten freuen semester login benutzer gerne erstellen stelle frage system boss moeglichkeit student schoen spam alias geld vertrag juni ansprechpartner telefon raum einrichtung gebaeude telefonbuch abteilung element eintrag nutzer raum pc gerne lehrstuhl voraus fakultaet verfuegung herzliche drucker erreichen tlaptop kabel problem klaerung url adapter feedback koeln grundsaetzlich kaufmann problem fehler verbindung anhang meldung client netz netzwerk wenden funktionieren liebe mitarbeiter unterstuetzung aktuell herr benoetigt raumplanung gb weber vorab ueckmeldung software lizenz programm kurze urlaub gerne installation dankbar informieren team service problem loesung bestellung verlaengern verteiler alte aendern februar oeffnen update pdf browser notwendig fenster schulung beginn wege nord tkurs frage studierende personen teilnehmer standort gerne herunterladen voraus zusenden ews veranstaltung datei iso text umstellung absender message date html arbeit kaiser erfolgreich thema ablauf art at einfuehrung umfrage cloud zugang zugreifen montag probleme kollegin profil server handy web file ticket drucker einrichten senden nr mittwoch card mitteilen nrw kontakt mail fax universitaet it institut hardware hinweis fakultaet not strasse loeschen liste funktion auftrag zeitraum verwaltung angebot vorgehen entfernen moeglichkeit gefunden benutzername informatik gruppe eingabe nachname chemie dame b. \n"
    "angepasst name schoene abt post zukommen verlaengerung sommersemester fehlen namensaenderung auskunft tu dr prof pruefung herr namen fakultaet bereich lehrstuhl installieren buero ok anschluss maerz theologie notebook herr berechtigung master vorbeikommen passwort anmelden account hilfe helfen uniaccount anmeldung kennwort problem boss zugriff referat screenshot support laufwerk bildschirm super tastatur button auswaehlen"
)

# Turn the string into a list of words.
bla = bla.split()

# Total word count versus number of distinct words.
print(len(bla))
print(len(set(bla)))
print()
|
|
|
|
x = {'a': 1, 'b': 2}
y = {'b': 10, 'c': 11}

# dict.update() merges y into x in place and returns None, so z is None
# and the merged mapping lives in x.
z = x.update(y)

print(x)
|
|
|
|
"""
|
|
#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name)
|
|
|
|
#textacy.fileio.write_file_lines(corpus2Text(corpi), filepath=corpus_de_path+"plain.txt")
|
|
|
|
|
|
dict = {"unicard redaktionsteam": 189, "kms": 131, "itmc_st\u00f6rungen": 17, "benutzerverwaltung_probleme": 168, "mailverteiler exchange": 130, "beamer": 70, "cws_confluence": 190, "benutzerverwaltung": 26, "sos": 166, "virtuelle server": 116, "sap": 7, "wlan": 21, "lsf": 6, "gastaufenthalt": 8, "umzug": 5, "firewall betreuung": 129, "ausleihe": 39, "fiona": 10, "kursplanung": 195, "schulungsraum verwaltung": 200, "plagiatserkennung": 32, "designentwicklung": 100, "ub basis it": 184, "tsm": 51, "backup tsm": 110, "raumkalender": 174, "veeam": 149, "linux bs": 42, "hochleistungsrechnen": 90, "e learning": 37, "h\u00f6rsaal\u00fcbertragung": 52, "sophos": 88, "service portal redaktion": 182, "verkauf": 93, "fk 16": 30, "campus app": 54, "dns": 71, "kurse": 196, "itmc schulungsr\u00e4ume": 96, "leitung": 91, "telefon": 14, "housing": 135, "softwarelizenzen": 35, "hcm stammdaten": 68, "semesterticket": 197, "exchange nutzung": 33, "mediendienste": 167, "sam spider": 172, "pvp": 27, "webserver": 29, "werkvertr\u00e4ge": 158, "ibz raumbuchung": 177, "webmailer": 126, "unicard sperrung": 64, "cd dvd produktion": 114, "lizenzserver": 92, "pr\u00fcfungsmanagement": 38, "blogs wikis foren": 87, "unicard ausgabe": 161, "pools": 157, "desktop & basisdienste": 144, "antrag auf rechnungserstellung": 193, "mailalias": 121, "evaexam": 133, "neuanschluss": 0, "mobilfunkvertr\u00e4ge": 69, "ftp server": 191, "haustechnik": 77, "raumbuchungssysteme": 186, "confluence": 181, "uniaccount zugangsdaten": 47, "itmc medienr\u00e4ume ef50": 171, "dokoll support": 128, "elektronisches telefonbuch": 3, "softwareverteilung": 153, "overhead projektor": 104, "sicherheit": 145, "itmc_als": 48, "itmc pools": 160, "zhb": 60, "serversupport": 101, "veranstaltungen": 61, "fk12 webauftritt": 138, "hardware": 142, "unicard produktion": 156, "telefonkonferenzen": 170, "dhcp": 188, "zertifikate server dfn": 139, "lan": 1, "datanet": 49, "neuausstattung": 173, "moodle": 16, "abmeldung": 13, "uni mail": 
15, "medienr\u00e4ume ef50": 117, "verschiedene aufgaben": 40, "zentrale webserver": 75, "vorlesungsaufzeichnung": 152, "grafik": 132, "campus management": 72, "hacker angriff": 46, "pos": 23, "zugangsdaten": 41, "serviceportal": 63, "ews": 24, "voicemail box": 150, "service desk itmc": 74, "test": 180, "beschaffung": 57, "bestellung": 185, "vpn": 55, "app feedback": 66, "allgemein": 134, "rundmail": 105, "telefonabrechnung": 199, "limesurvey": 31, "unicard": 28, "eldorado": 140, "uniaccount": 12, "plotter": 125, "mdm mobile device management": 120, "namens\u00e4nderung": 43, "sd": 84, "basis applikationen": 103, "\u00e4nderung": 194, "fileserver einrichtung": 187, "fk14_test": 154, "werkst\u00e4tte": 179, "itmc_aufgaben": 45, "formulare antr\u00e4ge": 81, "facility": 192, "web": 169, "asknet": 136, "server storage": 113, "mail groupware": 20, "rektorat -b\u00fcro": 178, "office": 50, "werkstoffe lehrstuhl bauwesen": 59, "telefonzentrale": 115, "verwaltung": 4, "netze": 22, "beantragung": 82, "d.3 dms": 148, "redmine projektverwaltung": 141, "wsus": 106, "lido": 118, "rechnerr\u00e4ume": 143, "matrix42_hilfe": 18, "boss service desk": 44, "konteneinsicht": 62, "spam phishing": 53, "forensic": 164, "fk 12": 11, "benutzungsverwaltung": 198, "redmine": 79, "basis app": 85, "viren": 95, "fk12 migration": 155, "raumbuchung": 109, "virtuelle desktops citrix": 176, "outlook_einrichtung": 123, "kundenserver": 137, "nrw ticket": 80, "weiterentwicklung": 127, "siport zugangskontrolle": 98, "e mail dienste": 99, "vorlagenerstellung": 36, "video": 19, "studierendensekretariat": 111, "it sicherheit sic": 86, "boss": 25, "technik": 58, "dokoll pvp": 112, "betrieb": 2, "v2 campus app feedback": 151, "mailverteiler": 108, "videoschnitt": 119, "fk raumplanung 09": 9, "sap urlaub": 73, "keine r\u00fcckantwort": 124, "prozess- und projektmanagement": 67, "dienstreise": 34, "webgestaltung": 78, "schulung": 175, "software": 89, "medientechnik": 76, "servicedesk": 107, "service portal": 
94, "software entwicklung": 165, "uniflow": 159, "ub_st\u00f6rungen": 162, "fk15": 183, "uhren": 83, "entwicklung": 163, "videokonferenzen": 97, "itmc webauftritt": 102, "joomla itmc website": 147, "changes": 122, "visitenkartenproduktion": 65, "lizenzmanagement": 146, "tonerb\u00f6rse": 201, "arbeitsplatzsupport": 56}
|
|
|
|
list = [(key,value) for key,value in dict.items()]
|
|
|
|
list.sort(key=lambda tup : tup[1])
|
|
"""
|
|
"""
|
|
from spacy.tokens.doc import Doc as SpacyDoc
|
|
|
|
filepath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/de_clean_ticket_content.bin"
|
|
|
|
# load parser
|
|
parser = spacy.load("de")
|
|
|
|
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
|
|
|
|
stringstorepath = corpus_path + 'de_parser/vocab/strings.json'
|
|
with open(stringstorepath) as file:
|
|
parser.vocab.strings.load(file)
|
|
|
|
vocabpath = Path(corpus_path + 'de_parser/vocab/lexemes.bin')
|
|
parser.vocab.load_lexemes(vocabpath)
|
|
|
|
spacy_vocab = parser.vocab
|
|
|
|
def readCorpus(filepath):
    '''Yield the plain text of every serialized spaCy doc in *filepath*.

    Relies on names that are not defined inside this function: open_sesame,
    SpacyDoc and spacy_vocab (SpacyDoc and spacy_vocab are set up by the
    statements preceding this definition in the snippet).

    Args:
        filepath: Path of the binary file to read.

    Yields:
        The text attribute of each doc rebuilt from its byte string.
    '''
    with open_sesame(filepath, mode='rb') as f:
        for bytes_string in SpacyDoc.read_bytes(f):
            yield SpacyDoc(spacy_vocab).from_bytes(bytes_string).text
|
|
|
|
|
|
textacy.fileio.write_file_lines(readCorpus(filepath),"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/result.txt")
|
|
"""
|
|
|
|
|
|
|
|
# load raw corpus and create new one
|
|
#raw_corpus, parser = load_corpusV2(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)
|
|
|
|
#printRandomDoc(raw_corpus)
|
|
|
|
|
|
"""
|
|
spacy_doc = PARSER("test")
|
|
save_obj(spacy_doc, "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
|
|
|
|
spacy_doc2 = load_obj("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
|
|
|
|
print("Doc: {0}".format(spacy_doc2))
|
|
|
|
|
|
|
|
jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/"
|
|
|
|
LLDA_filepath = "{0}labeldict.txt".format(jgibbsLLDA_root)
|
|
laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_hilfe': 18, 'sap': 7, 'pos': 23, 'verwaltung': 4, 'lan': 1}
|
|
with open(LLDA_filepath, 'w') as file:
|
|
file.write(json.dumps(laveldict))
|
|
"""
|
|
"""
|
|
def load_corpus(corpus_path, corpus_name, lang="de"):
    '''Rebuild a textacy corpus from files stored under *corpus_path*.

    Reads <corpus_path><lang>_parser/vocab/strings.json and lexemes.bin into
    a freshly loaded spaCy pipeline, then pairs the docs from
    <corpus_path><corpus_name>_content.bin with the records from
    <corpus_path><corpus_name>_meta.json.

    Args:
        corpus_path: Directory prefix; joined by plain string concatenation,
            so it must already end with a path separator.
        corpus_name: Base name of the content and metadata files.
        lang: Name handed to spacy.load and used in the parser directory name.

    Returns:
        The populated textacy.Corpus.
    '''
    from pathlib import Path

    # load parser
    parser = spacy.load(lang)

    stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)

    vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
    parser.vocab.load_lexemes(vocabpath)

    corpus = textacy.Corpus(parser)

    contentpath = corpus_path + corpus_name + "_content.bin"
    metapath = corpus_path + corpus_name + "_meta.json"

    metadata_stream = textacy.fileio.read_json_lines(metapath)
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)

    # zip() stops at the shorter stream, so unmatched docs or records are dropped.
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus
|
|
"""
|
|
|
|
|
|
"""
|
|
# THESAURUS
|
|
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
|
|
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
|
|
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
|
|
|
|
def build_thesaurus(path2lexicalentries):
    '''Build a mapping from each word to its main synonym.

    Parses the XML file at *path2lexicalentries*, collects for every
    LexicalEntry element its written form and the synset ids found on its
    child elements, and maps each normalized written form to the first word
    that was registered for the entry's first synset id.

    Normalization of the written form: sharp s and umlauts are
    transliterated, dots are removed, anything in parentheses is replaced by
    a space, whitespace is normalized, and the result is lower-cased.

    Args:
        path2lexicalentries: Path of the lexical-entries XML file.

    Returns:
        dict mapping normalized word -> main synonym. Words without any
        synset id are left out.
    '''
    lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
    lexroot = lextree.getroot()

    # normalized written form -> list of synset ids
    word2synsets = {}

    for ro in lexroot:
        for elem in ro:
            if elem.tag != "LexicalEntry":
                continue

            synlist = []
            string = "WORD"  # placeholder when no child carries a writtenForm

            for subentry in elem:
                lex_dict = subentry.attrib
                if "synset" in lex_dict:
                    synlist.append(lex_dict["synset"])
                if 'writtenForm' in lex_dict:
                    string = lex_dict["writtenForm"]

            # transliterate sharp s and umlauts
            string = re.sub(r'[ß]', "ss", string)
            string = re.sub(r'[ö]', "oe", string)
            string = re.sub(r'[ü]', "ue", string)
            string = re.sub(r'[ä]', "ae", string)

            # remove all dots
            string = re.sub(r'[.]', "", string)

            # drop everything in parentheses (greedy: first "(" to last ")")
            string = re.sub(r"\((.*)\)", " ", string)

            # collapse longer runs of whitespace
            string = textacy.preprocess.normalize_whitespace(string)

            string = string.lower().strip()

            word2synsets[string] = synlist

    # synset id -> words that reference it, in insertion order
    synset2Words = {}
    for word, synset in word2synsets.items():
        for syn in synset:
            synset2Words.setdefault(syn, []).append(word)

    # Sort by number of whitespace-separated tokens.
    # NOTE(review): this orders the synset-id lists, not the word lists --
    # confirm that this is the intended target.
    for synset in word2synsets.values():
        synset.sort(key=lambda x: len(x.split()))

    thesaurus = {}
    for word, synset in word2synsets.items():
        # Assumption: the first synonym is the main synonym.
        # Words without any synset id have no main synonym and are skipped
        # (previously hidden behind a bare except).
        if synset:
            thesaurus[word] = synset2Words[synset[0]][0]
    return thesaurus
|
|
|
|
|
|
for r in synroot:
|
|
for element in r:
|
|
|
|
if element.tag == "Synset":
|
|
synset = []
|
|
attrib = element.attrib
|
|
id = attrib["id"]
|
|
|
|
if id not in synset2Words.keys():
|
|
synset2Words[id] = "WORD"
|
|
|
|
"""
|
|
|
|
"""
|
|
from postal.parser import parse_address
|
|
|
|
|
|
address = "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480"
|
|
print(parse_address(address))
|
|
|
|
|
|
address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund "
|
|
print(parse_address(address))
|
|
"""
|
|
|
|
"""
|
|
|
|
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
|
|
corpus_name = "testcorpus"
|
|
|
|
|
|
#corpi.save(corpus_path, name=corpus_name, compression=corpus_compression)
|
|
#corpi = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
|
|
|
|
|
|
|
|
import pathlib
|
|
|
|
strings_path = pathlib.Path(corpus_path + 'strings.json')
|
|
path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')
|
|
|
|
PARSER.vocab.dump(path_lexemes_bin_)
|
|
nlp.vocab.load_lexemes(path_lexemes_bin_)
|
|
|
|
|
|
def save_corpus(corpus_path, corpus_name):
    '''Write the module-level corpus `corpi` to three files.

    Files are <corpus_path><corpus_name>_strings.json (string store of the
    module-level PARSER), ..._content.bin (spaCy docs) and ..._meta.json
    (metadata, one JSON record per doc).

    Args:
        corpus_path: Directory prefix; joined by plain string concatenation,
            so it must already end with a path separator.
        corpus_name: Base name for the three files.
    '''
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        PARSER.vocab.strings.dump(file)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpi), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpi), metapath)
|
|
|
|
|
|
|
|
def load_corpus(corpus_path, corpus_name):
    '''Rebuild a textacy corpus from the files written by save_corpus.

    Args:
        corpus_path: Directory prefix; joined by plain string concatenation,
            so it must already end with a path separator.
        corpus_name: Base name of the _strings.json, _meta.json and
            _content.bin files.

    Returns:
        A new textacy.Corpus holding one doc per (content, metadata) pair.
    '''
    # load a fresh pipeline
    nlp = spacy.load("de")

    # load stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "r") as file:
        nlp.vocab.strings.load(file)

    # define corpus
    corpi = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name + "_meta.json"
    metadata_stream = textacy.fileio.read_json_lines(metapath)

    # load content
    contentpath = corpus_path + corpus_name + "_content.bin"
    spacy_docs = textacy.fileio.read_spacy_docs(corpi.spacy_vocab, contentpath)

    # zip() stops at the shorter stream, so unmatched docs or records are dropped.
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpi.add_doc(
            textacy.Doc(spacy_doc, lang=corpi.spacy_lang, metadata=metadata))

    return corpi
|
|
|
|
|
|
save_corpus(corpus_path,corpus_name)
|
|
|
|
print(load_corpus(corpus_path,corpus_name))
|
|
|
|
"""
|
|
|
|
"""
|
|
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
    '''Return a callable that replaces every token of a doc by its synonym.

    The returned callable lower-cases each token, looks it up with
    getFirstSynonym in the module-level THESAURUS, joins the results with
    single spaces and runs *parser* on the joined string.

    Args:
        default_return_first_Syn: Passed through to getFirstSynonym.
        parser: Callable applied to the joined string; defaults to the
            module-level PARSER (bound when this function is defined).

    Returns:
        A one-argument callable taking an iterable of tokens that expose
        a lower_ attribute.
    '''
    def normalize(doc):
        replaced = (
            getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn)
            for tok in doc
        )
        return parser(" ".join(replaced))

    return normalize
|
|
|
|
def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
    '''Look *word* up in *thesaurus* and return the main form of its block.

    Args:
        word: The word to look up. Non-string input is returned as str(word)
            without any lookup.
        thesaurus: Iterable of synonym blocks, each an iterable of strings.
        default_return_first_Syn: Passed through to getHauptform.

    Returns:
        str: the result of getHauptform for the first block containing the
        word, or the lower-cased word itself when no block matches.
    '''
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

    # Compiled once instead of on every synonym.
    single_word = re.compile(r'\A[\w-]+\Z')

    # iterate over the thesaurus
    for syn_block in thesaurus:  # syn_block is a list of synonyms
        for syn in syn_block:
            syn = syn.lower()
            if single_word.match(syn):
                # single-word synonym: require an exact match
                found = word == syn
            else:
                # multi-word synonym: a substring match is enough
                found = word in syn
            if found:
                return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))

    return str(word)  # fall back to the original word
|
|
|
|
def getHauptform(syn_block, word, default_return_first_Syn=False):
    '''Pick the main form ("Hauptform") out of a block of synonyms.

    A synonym qualifies when its lower-cased text contains "hauptform" and
    has at most two space-separated parts; the first part that does not
    start with a parenthesized group is returned (lower-cased).

    Args:
        syn_block: Iterable of synonym strings.
        word: Fallback value.
        default_return_first_Syn: When true and no main form was found,
            return the first entry of *syn_block* that does not start with a
            parenthesized group (original casing, may be multi-word).

    Returns:
        The main form, the first usable synonym, or *word*.
    '''
    # Compiled once; match() anchors at the start of the string only.
    in_parens = re.compile(r'\([^)]+\)')

    for syn in syn_block:
        syn = syn.lower()

        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # skip the part that is written in parentheses
            for w in syn.split(" "):
                if not in_parens.match(w):
                    return w

    if default_return_first_Syn:
        # no main form present: return the first synonym not in parentheses
        for w in syn_block:
            if not in_parens.match(w):
                return w

    return word  # fall back to the original word
|
|
"""
|
|
|
|
"""
|
|
path2xml="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"
|
|
|
|
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
|
|
root = tree.getroot()
|
|
|
|
for r in root:
|
|
for element in r:
|
|
|
|
if element.tag == "Synset":
|
|
attrib = element.attrib
|
|
for i,subentry in enumerate(element):
|
|
if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
|
|
string = (subentry.attrib["writtenForm"])
|
|
# replaceRockDots
|
|
string = re.sub(r'[ß]', "ss", string)
|
|
string = re.sub(r'[ö]', "oe", string)
|
|
string = re.sub(r'[ü]', "ue", string)
|
|
string = re.sub(r'[ä]', "ae", string)
|
|
|
|
# seperate_words_on_regex:
|
|
string = " ".join(re.compile(regex_specialChars).split(string))
|
|
string_list=string.split()
|
|
if len(string_list) == 1:
|
|
nomen.append(string.lower().strip())
|
|
"""
|
|
|
|
"""
|
|
import re
|
|
from collections import Counter
|
|
|
|
def words(text):
    '''Return all runs of word characters in *text*, lower-cased.'''
    lowered = text.lower()
    return re.findall(r'\w+', lowered)
|
|
|
|
# Word frequencies of the reference corpus. The file is opened in a with
# block so the handle is closed once the text has been read.
with open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))
|
|
|
|
def P(word, N=sum(WORDS.values())):
    '''Probability of `word`: its count in WORDS divided by N.

    N defaults to the total of all counts in WORDS, computed once when this
    function is defined.
    '''
    return WORDS[word] / N
|
|
|
|
def correction(word):
    '''Most probable spelling correction for `word` (candidate with highest P).'''
    return max(candidates(word), key=P)
|
|
|
|
def candidates(word):
    '''Generate possible spelling corrections for `word`.

    Returns the first non-empty collection out of: the word itself if known,
    known words one edit away, known words two edits away, or [word].
    '''
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
|
|
|
|
def known(words):
    '''The subset of `words` that appear in the dictionary of WORDS.'''
    return {w for w in words if w in WORDS}
|
|
|
|
def edits1(word, letters='abcdefghijklmnopqrstuvwxyz'):
    '''All strings that are one edit away from `word`.

    An edit is a single deletion, a transposition of two adjacent
    characters, a replacement of one character, or an insertion of one
    character.

    Args:
        word: The string to edit.
        letters: Characters used for replacements and insertions. Defaults
            to a-z; pass a larger alphabet (for example one with umlauts)
            to cover more spellings.

    Returns:
        set of edited strings (may contain `word` itself via replacement).
    '''
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)
|
|
|
|
def edits2(word):
    '''All edits that are two edits away from `word` (lazy generator, may repeat values).'''
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
|
|
|
|
"""
|
|
|
|
"""
|
|
### extract from derewo
|
|
|
|
#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
|
|
|
|
|
|
raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
|
|
|
|
for line in raw:
|
|
line_list=line.split()
|
|
if line_list[2] == "NN":
|
|
string = line_list[1].lower()
|
|
|
|
# replaceRockDots
|
|
string = re.sub(r'[ß]', "ss", string)
|
|
string = re.sub(r'[ö]', "oe", string)
|
|
string = re.sub(r'[ü]', "ue", string)
|
|
string = re.sub(r'[ä]', "ae", string)
|
|
|
|
|
|
nomen.append(string.lower().strip())
|
|
|
|
|
|
textacy.fileio.write_file_lines(nomen,"nomen2.txt")
|
|
"""
|
|
|
|
"""
|
|
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
|
|
content_collumn_name = "Description"
|
|
content_collumn = 9 # standardvalue
|
|
|
|
de_tickets=[]
|
|
en_tickets=[]
|
|
misc_tickets=[]
|
|
|
|
error_count = 0
|
|
for i, lst in enumerate(stream):
|
|
if i == 0:
|
|
de_tickets.append(lst)
|
|
en_tickets.append(lst)
|
|
misc_tickets.append(lst)
|
|
else:
|
|
try:
|
|
content_collumn_ = lst[content_collumn]
|
|
if detect(content_collumn_) == "de":
|
|
de_tickets.append(lst)
|
|
elif detect(content_collumn_) == "en":
|
|
en_tickets.append(lst)
|
|
else:
|
|
misc_tickets.append(lst)
|
|
|
|
except:
|
|
misc_tickets.append(lst)
|
|
error_count += 1
|
|
|
|
print(error_count)
|
|
|
|
textacy.fileio.write_csv(de_tickets,"M42-Export/de_tickets.csv", delimiter=";")
|
|
textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")
|
|
textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")
|
|
|
|
|
|
"""
|
|
|
|
"""
|
|
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
|
|
|
|
|
|
def stringcleaning(stringstream, funclist):
    '''Apply every function of *funclist*, in order, to each string.

    Args:
        stringstream: Iterable of strings.
        funclist: Iterable of one-argument callables; each receives the
            result of the previous one. It is iterated once per string, so
            it must be re-iterable (a list, not a generator).

    Yields:
        The fully processed string for every input string.
    '''
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string
|
|
|
|
|
|
def seperate_words_on_regex(regex=regex_specialChars):
    '''Return a callable that splits a string on *regex* and rejoins with spaces.

    Args:
        regex: Pattern to split on; defaults to regex_specialChars.

    Returns:
        A one-argument callable mapping a string to the space-joined pieces.
    '''
    # Compile once per factory call instead of on every processed string.
    pattern = re.compile(regex)
    return lambda string: " ".join(pattern.split(string))
|
|
|
|
|
|
# Sample tokens for trying out the special-character splitting.
words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    # A comma was missing after this entry, which silently concatenated it
    # with the next literal into one token.
    "455a33c5,",
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------",
]
|
|
|
|
|
|
|
|
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
|
|
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
|
|
|
|
for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
|
|
print(s.strip())
|
|
|
|
#print(stringcleaning(w,string_comp))
|
|
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
|
|
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
|
|
#result = specialFinder.sub(" ", w)
|
|
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
|
|
|
|
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
|
|
"""
|
|
|
|
"""
|
|
def replaceRockDots():
    '''Return a callable that lower-cases a string and transliterates umlauts.

    The returned callable maps ä to ae, ö to oe, ü to ue and ß to ss after
    lower-casing its input.
    '''
    # One translation table, applied in a single pass per string.
    table = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})
    return lambda string: string.lower().translate(table)
|
|
|
|
|
|
|
|
de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))
|
|
|
|
|
|
#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
|
|
|
|
#print(blob.entities)
|
|
|
|
de_stop_words = list(map(replaceRockDots(),de_stop_words))
|
|
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
|
|
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))
|
|
|
|
de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
|
|
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
|
|
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))
|
|
|
|
|
|
|
|
|
|
#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
|
|
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
|
|
textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
|
|
|
|
"""
|
|
|
|
# Report the wall-clock time since `start` was taken at the top of the script.
end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))
|
|
|
|
|
|
|