# -*- coding: utf-8 -*-
"""Scratch script: parse a German test string and build lemma/word id tables.

Two things happen at import time:

1. ``teststring.txt`` is read, parsed with textblob-de's ``PatternParser``
   (lemmata enabled) and the parse result is printed.
2. ``lemmatization-de.txt`` is read line by line to fill two dictionaries:
   ``lemma2id`` (lemma -> id) and ``word2id`` (word form -> id of its lemma).
"""
import corenlp as corenlp
import os
import re
import textacy
import nltk
from textblob_de import TextBlobDE
from textblob_de import PatternParser

# Lemmatization list; the code below reads column 1 as the lemma and
# column 2 as the word form.
# Source: http://www.lexiconista.com/datasets/lemmatization/
filepath = "lemmatization-de.txt"

blob = TextBlobDE(
    str(textacy.fileio.read_file("teststring.txt")),
    parser=PatternParser(pprint=True, lemmata=True),
)

print(blob.parse())

# First column  -> {lemma: id}
# Second column -> {word: id}, where the id is the one assigned to its lemma.
lemma2id = {}
word2id = {}

for line_no, line in enumerate(textacy.fileio.read_file_lines(filepath=filepath)):
    columns = line.split()
    if len(columns) < 2:
        # Blank or malformed line: nothing to map (previously an IndexError).
        continue

    lemma = columns[0].lower()
    word = columns[1].lower()

    # A lemma keeps the index of the first line it appears on as its id.
    lemma_id = lemma2id.setdefault(lemma, line_no)

    # Map the word form to its lemma's id. The previous code looked up
    # ``lemma2id[word]``, which raised KeyError for any word that was not
    # itself already registered as a lemma.
    word2id[word] = lemma_id


# ---------------------------------------------------------------------------
# Disabled experiment: splitting tokens on special characters.
# Kept for reference only; none of this is executed.
# NOTE(review): in ``words`` the literals "455a33c5," and "tvt?=" are adjacent
# without a separating comma, so Python would concatenate them into a single
# list element - confirm whether that was intended before re-enabling.
# ---------------------------------------------------------------------------
#
# regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|?]'
#
#
# def stringcleaning(stringstream, funclist):
#     for string in stringstream:
#         for f in funclist:
#             string = f(string)
#         yield string
#
#
# def seperate_words_on_regex(regex=regex_specialChars):
#     return lambda string: " ".join(re.compile(regex).split(string))
#
#
# words = [
#     "uniaccount",
#     "nr54065467",
#     "nr54065467",
#     "455a33c5," "tvt?=",
#     "tanja.saborowski@tu-dortmund.de",
#     "-",
#     "m-sw1-vl4053.itmc.tu-dortmund.de",
#     "------problem--------"
# ]
#
# topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
# specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
#
# for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
#     print(s.strip())
#
#     #print(stringcleaning(w,string_comp))
#     #print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
#     #print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
#     #result = specialFinder.sub(" ", w)
#     #print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
#     #print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))