# -*- coding: utf-8 -*-
import re
import time
import json
import spacy
import textacy

start = time.time()

import enchant
from datetime import datetime
import xml.etree.ElementTree as ET

print(datetime.now())

# --- disabled experiment: build a small textacy corpus from dummy content/metadata ---
"""
PARSER = spacy.load("de")
corpus = textacy.Corpus(PARSER)

testcontent = [
    "fdsfdsfsd",
    "juzdtjlkö",
    "gfadojplk"
]

testmeta = [
    {"categoryName": "zhb", "Solution": "", "Subject": "schulungstest"},
    {"categoryName": "neuanschluss", "Solution": "subject", "Subject": "telephone contract"},
    {"categoryName": "zhb", "Solution": "", "Subject": "setuji"}
]


def makecontent(testcontent):
    for content in testcontent:
        yield content


def makemeta(testmeta):
    for metadata in testmeta:
        yield metadata


corpus.add_texts(
    makecontent(testcontent),
    makemeta(testmeta)
)

print(corpus)
"""

# --- active code: parse two postal addresses with libpostal ---
from postal.parser import parse_address

address = "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480"
print(parse_address(address))

address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund "
print(parse_address(address))
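
# A minimal sketch (not part of the original script): parse_address returns a list of
# (value, label) tuples; grouping them by label into a dict makes the output easier to
# inspect than the raw tuple list. `address_to_dict` is a hypothetical helper name.
def address_to_dict(raw_address):
    parsed = {}
    for value, label in parse_address(raw_address):
        # several tokens can share the same label (e.g. "house"), so collect them in lists
        parsed.setdefault(label, []).append(value)
    return parsed


print(address_to_dict(address))  # expect keys such as "city" and "postcode" for the address above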
# --- disabled experiment: save/load a textacy corpus (stringstore, spaCy docs, metadata) ---
"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpus/"
corpus_name = "testcorpus"

#corpus.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpus = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)

import pathlib

strings_path = pathlib.Path(corpus_path + 'strings.json')
path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')

PARSER.vocab.dump(path_lexemes_bin_)
nlp.vocab.load_lexemes(path_lexemes_bin_)


def save_corpus(corpus_path, corpus_name):
    # save stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "w") as file:
        PARSER.vocab.strings.dump(file)

    # save content
    contentpath = corpus_path + corpus_name + "_content.bin"
    textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpus), contentpath)

    # save meta
    metapath = corpus_path + corpus_name + "_meta.json"
    textacy.fileio.write_json_lines((doc.metadata for doc in corpus), metapath)


def load_corpus(corpus_path, corpus_name):
    # load new lang
    nlp = spacy.load("de")

    # load stringstore
    stringstore_path = corpus_path + corpus_name + '_strings.json'
    with open(stringstore_path, "r") as file:
        nlp.vocab.strings.load(file)

    # define corpus
    corpus = textacy.Corpus(nlp)

    # load meta
    metapath = corpus_path + corpus_name + "_meta.json"
    metadata_stream = textacy.fileio.read_json_lines(metapath)

    # load content
    contentpath = corpus_path + corpus_name + "_content.bin"
    spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)

    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus


save_corpus(corpus_path, corpus_name)
print(load_corpus(corpus_path, corpus_name))
"""

# --- disabled experiment: replace tokens with the main form (Hauptform) of their thesaurus synonym group ---
"""
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
    #return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
    return lambda doc: parser(" ".join(
        [getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))


def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
    if not isinstance(word, str):
        return str(word)

    word = word.lower()

    # iterate over the thesaurus
    for syn_block in thesaurus:  # syn_block is a list of synonyms
        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if the synonym is a single word
                if word == syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
            else:  # if it is a phrase
                if word in syn:
                    return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
    return str(word)  # as a last resort, return the original word


def getHauptform(syn_block, word, default_return_first_Syn=False):
    for syn in syn_block:
        syn = syn.lower()
        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return it if it is in parentheses  # todo: does that ever happen? strip the parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no Hauptform is contained, return the first synonym that is not a phrase and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w
    return word  # as a last resort, return the original word
"""

# --- disabled experiment: collect single-word nouns from the deWordNet.xml dump ---
"""
path2xml = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"

tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()

for r in root:
    for element in r:
        if element.tag == "Synset":
            attrib = element.attrib
            for i, subentry in enumerate(element):
                if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
                    string = (subentry.attrib["writtenForm"])

                    # replaceRockDots
                    string = re.sub(r'[ß]', "ss", string)
                    string = re.sub(r'[ö]', "oe", string)
                    string = re.sub(r'[ü]', "ue", string)
                    string = re.sub(r'[ä]', "ae", string)

                    # separate_words_on_regex:
                    string = " ".join(re.compile(regex_specialChars).split(string))

                    string_list = string.split()
                    if len(string_list) == 1:
                        nomen.append(string.lower().strip())
"""

# --- disabled experiment: frequency-based spelling corrector (after Peter Norvig), trained on a German news corpus ---
"""
import re
from collections import Counter


def words(text):
    return re.findall(r'\w+', text.lower())


WORDS = Counter(words(open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt').read()))


def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N


def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)


def candidates(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])


def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)


def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
"""
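
# A minimal sketch (not part of the original script) that puts the otherwise unused
# `enchant` import to work: a dictionary-based spell check as an alternative to the
# frequency-based corrector in the disabled block above. It assumes a German
# dictionary tagged "de_DE" is installed; otherwise the block is skipped.
if enchant.dict_exists("de_DE"):
    de_dict = enchant.Dict("de_DE")
    for candidate in ["Telefon", "Telefom"]:
        if not de_dict.check(candidate):
            # suggest() returns a ranked list of dictionary words close to the input
            print(candidate, "->", de_dict.suggest(candidate))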
# --- disabled experiment: extract nouns from the DeReWo/DeReKo frequency list ---
"""
### extract from derewo
# http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html

raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")

for line in raw:
    line_list = line.split()
    if line_list[2] == "NN":
        string = line_list[1].lower()

        # replaceRockDots
        string = re.sub(r'[ß]', "ss", string)
        string = re.sub(r'[ö]', "oe", string)
        string = re.sub(r'[ü]', "ue", string)
        string = re.sub(r'[ä]', "ae", string)

        nomen.append(string.lower().strip())

textacy.fileio.write_file_lines(nomen, "nomen2.txt")
"""

# --- disabled experiment: split the M42 ticket export by detected language (de/en/misc) ---
"""
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")

content_column_name = "Description"
content_column = 9  # default value

de_tickets = []
en_tickets = []
misc_tickets = []

error_count = 0
for i, lst in enumerate(stream):
    if i == 0:
        # copy the header row into every output file
        de_tickets.append(lst)
        en_tickets.append(lst)
        misc_tickets.append(lst)
    else:
        try:
            content_column_ = lst[content_column]
            # note: detect() is not defined in this file (presumably langdetect's detect)
            if detect(content_column_) == "de":
                de_tickets.append(lst)
            elif detect(content_column_) == "en":
                en_tickets.append(lst)
            else:
                misc_tickets.append(lst)
        except:
            misc_tickets.append(lst)
            error_count += 1

print(error_count)

textacy.fileio.write_csv(de_tickets, "M42-Export/de_tickets.csv", delimiter=";")
textacy.fileio.write_csv(en_tickets, "M42-Export/en_tickets.csv", delimiter=";")
textacy.fileio.write_csv(misc_tickets, "M42-Export/misc_tickets.csv", delimiter=";")
"""

# --- disabled experiment: split test strings on special characters ---
"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|?]'


def stringcleaning(stringstream, funclist):
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string


def separate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))


words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5",
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------"
]

topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning((w for w in words), [separate_words_on_regex()]):
    print(s.strip())

#print(stringcleaning(w, string_comp))
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', w)))
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', w)))
#result = specialFinder.sub(" ", w)
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', " ", w))
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""

# --- disabled experiment: normalize the German stopword list (umlauts, whitespace) and write it out ---
"""
def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))


de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))

#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
#print(blob.entities)

de_stop_words = list(map(replaceRockDots(), de_stop_words))
#LEMMAS = list(map(replaceRockDots(), LEMMAS))
#VORNAMEN = list(map(replaceRockDots(), VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace, de_stop_words))
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace, LEMMAS))
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, VORNAMEN))

#textacy.fileio.write_file_lines(LEMMAS, "lemmas.txt")
#textacy.fileio.write_file_lines(VORNAMEN, "firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words, "german_stopwords.txt")
"""
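
# A minimal standalone sketch (not part of the original script) of the umlaut/eszett
# normalization ("replaceRockDots") that the disabled blocks above repeat inline several
# times. It only needs `re`, so it runs without enabling any of the disabled code;
# the name `replace_umlauts` is a hypothetical helper, not from the original.
def replace_umlauts(text):
    # map German umlauts and ß to their ASCII transliterations, lowercasing first
    replacements = [(r'[ß]', 'ss'), (r'[ö]', 'oe'), (r'[ü]', 'ue'), (r'[ä]', 'ae')]
    text = text.lower()
    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text)
    return text


print(replace_umlauts("Förder- und Lagerwesen Fakultät"))  # -> "foerder- und lagerwesen fakultaet"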
end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))