# -*- coding: utf-8 -*-
"""Build a list of German nouns from the OdeNet lexicon (deWordNet.xml).

The script parses the XML file at ``path2xml``, collects the ``writtenForm``
of every ``Lemma`` element whose ``partOfSpeech`` is ``"n"``, normalises it
(lowercase, umlauts/sharp s transliterated, special characters treated as
word separators) and keeps only the single-word results.  The resulting list
is written line by line to ``nomen.txt``.

The raw string blocks further down are disabled one-off experiments that are
kept for reference only; they are never executed.
"""
import re
import time
import xml.etree.ElementTree as ET
from datetime import datetime

import textacy

start = time.time()

print(datetime.now())

path2xml = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"

tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()

regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|?]'

# Compiled once instead of once per lemma inside the extraction loop.
_SPECIAL_CHARS = re.compile(regex_specialChars)

# Single-pass transliteration table for German umlauts and sharp s.
_UMLAUT_TABLE = str.maketrans({"ß": "ss", "ö": "oe", "ü": "ue", "ä": "ae"})


def _normalize_lemma(written_form):
    """Return the normalised noun, or None if it is not a single word.

    The text is lowercased *before* transliteration so that capitalised
    umlauts ("Ärger") are replaced as well ("aerger"); the table only
    contains lowercase characters.  Every special character is then treated
    as a word separator, and lemmas that end up with anything other than
    exactly one token are rejected.
    """
    text = written_form.lower().translate(_UMLAUT_TABLE)
    tokens = _SPECIAL_CHARS.sub(" ", text).split()
    if len(tokens) != 1:
        return None
    return tokens[0]


nomen = []

### extract from deWordNet.xml
# https://github.com/hdaSprachtechnologie/odenet

for r in root:
    for element in r:
        if element.tag != "LexicalEntry":
            continue
        for subentry in element:
            # .get() so that a Lemma without a partOfSpeech attribute is
            # skipped instead of raising KeyError.
            if subentry.tag == "Lemma" and subentry.get("partOfSpeech") == "n":
                noun = _normalize_lemma(subentry.attrib["writtenForm"])
                if noun is not None:
                    nomen.append(noun)

textacy.fileio.write_file_lines(nomen,"nomen.txt")


# ---------------------------------------------------------------------------
# Disabled experiments (never executed).  They are stored as raw strings so
# the regex backslashes inside them do not trigger invalid-escape warnings.
# ---------------------------------------------------------------------------

# Disabled: collect entries tagged "NN" from the DeReKo frequency list and
# write them to nomen2.txt.
r"""
### extract from derewo
#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html

raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")

for line in raw:
    line_list=line.split()
    if line_list[2] == "NN":
        string = line_list[1].lower()

        # replaceRockDots
        string = re.sub(r'[ß]', "ss", string)
        string = re.sub(r'[ö]', "oe", string)
        string = re.sub(r'[ü]', "ue", string)
        string = re.sub(r'[ä]', "ae", string)

        nomen.append(string.lower().strip())

textacy.fileio.write_file_lines(nomen,"nomen2.txt")
"""

# Disabled: split the ticket CSV export into German / English / other files
# by the detected language of one column.  NOTE(review): `detect` is not
# imported anywhere in this file - presumably it came from a language
# detection package; confirm before re-enabling.
r"""
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
content_collumn_name = "Description"
content_collumn = 9  # standardvalue

de_tickets=[]
en_tickets=[]
misc_tickets=[]

error_count = 0
for i, lst in enumerate(stream):
    if i == 0:
        de_tickets.append(lst)
        en_tickets.append(lst)
        misc_tickets.append(lst)
    else:
        try:
            content_collumn_ = lst[content_collumn]
            if detect(content_collumn_) == "de":
                de_tickets.append(lst)
            elif detect(content_collumn_) == "en":
                en_tickets.append(lst)
            else:
                misc_tickets.append(lst)

        except:
            misc_tickets.append(lst)
            error_count += 1

print(error_count)

textacy.fileio.write_csv(de_tickets,"M42-Export/de_tickets.csv", delimiter=";")
textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")
textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")
"""

# Disabled: experiments with splitting sample strings on special characters.
r"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|?]'

def stringcleaning(stringstream, funclist):
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string

def seperate_words_on_regex(regex=regex_specialChars):
    return lambda string: " ".join(re.compile(regex).split(string))

words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5,"
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------"
]

topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
    print(s.strip())

    #print(stringcleaning(w,string_comp))
    #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
    #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
    #result = specialFinder.sub(" ", w)
    #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
    #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""

# Disabled: normalise the German stop word list and write it to
# german_stopwords.txt.
r"""
def replaceRockDots():
    return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))

de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))

#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
#print(blob.entities)

de_stop_words = list(map(replaceRockDots(),de_stop_words))
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))

#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
"""

end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))