last commit 11.9.17
This commit is contained in:
parent
67e6f8845c
commit
0319e415a3
19
config.ini
19
config.ini
|
@ -7,14 +7,27 @@ language = de
|
||||||
|
|
||||||
[preprocessing]
|
[preprocessing]
|
||||||
|
|
||||||
ents = WORK_OF_ART,ORG,PRODUCT,LOC
|
ents2keep = WORK_OF_ART,ORG,PRODUCT,LOC
|
||||||
|
|
||||||
custom_words = grüßen,fragen
|
custom_words = grüßen,fragen
|
||||||
|
|
||||||
#lemmatize = True
|
#lemmatize = True
|
||||||
|
|
||||||
default_return_first_Syn = False
|
|
||||||
|
[topic modeling]
|
||||||
|
|
||||||
|
ngrams = (1,2)
|
||||||
|
|
||||||
|
min_df = 0
|
||||||
|
max_df = 1.0
|
||||||
|
no_below = 20
|
||||||
|
no_above = 0.5
|
||||||
|
|
||||||
|
topicModel = lda
|
||||||
|
|
||||||
|
top_topic_words = 5
|
||||||
|
|
||||||
|
top_document_labels_per_topic = 2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ import re
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
import sys
|
||||||
import spacy
|
import spacy
|
||||||
import textacy
|
import textacy
|
||||||
from scipy import *
|
from scipy import *
|
||||||
|
@ -113,9 +113,12 @@ def keepOnlyENT(ent_list,parser=PARSER):
|
||||||
def removeAllENT(ent_list, parser=PARSER):
|
def removeAllENT(ent_list, parser=PARSER):
|
||||||
return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in ent_list]))
|
return lambda doc: parser(" ".join([tok.text for tok in doc if tok.ent_type_ not in ent_list]))
|
||||||
|
|
||||||
|
def keepUniqueTokens(parser=PARSER):
    """Build a transformer that drops repeated tokens from a doc.

    The returned callable takes a parsed doc (an iterable of tokens exposing
    ``.text``), keeps the first occurrence of every distinct token text, joins
    the survivors with single spaces and re-parses that string with ``parser``.

    Token texts are kept in first-occurrence order. Collecting them in a plain
    ``set`` would make the order of the joined string depend on string hash
    randomization, so the re-parsed doc could differ from run to run.

    :param parser: callable that turns a string into a parsed doc.
    :return: callable ``doc -> parser(<unique token texts joined by " ">)``.
    """
    def _unique(doc):
        seen = set()
        ordered = []
        for tok in doc:
            text = tok.text
            if text not in seen:
                seen.add(text)
                ordered.append(text)
        return parser(" ".join(ordered))

    return _unique
|
||||||
|
|
||||||
|
def lemmatize(parser=PARSER):
    """Build a transformer that replaces every token by its lemma.

    The returned callable reads ``lemma_`` from each token of the given doc,
    joins the lemmas with single spaces and re-parses the resulting string
    with ``parser``.

    :param parser: callable that turns a string into a parsed doc.
    :return: callable ``doc -> parser(<lemmas joined by " ">)``.
    """
    def _lemmatized(doc):
        lemmas = [tok.lemma_ for tok in doc]
        return parser(" ".join(lemmas))

    return _lemmatized
|
||||||
|
|
||||||
def doc2Set(doc):
    """Return the string representation of the set of token texts in ``doc``."""
    unique_texts = {tok.text for tok in doc}
    return str(unique_texts)
|
|
||||||
doc2String = lambda doc : doc.text
|
doc2String = lambda doc : doc.text
|
||||||
|
|
||||||
|
|
||||||
|
@ -215,21 +218,24 @@ def generate_labled_lines(textacyCorpus):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
ents = config.get("preprocessing","ents").split(",")
|
ents = config.get("preprocessing","ents2keep").split(",")
|
||||||
|
|
||||||
clean_in_content=compose(
|
|
||||||
|
clean_in_content=compose( # note: the bottom-most function is executed first
|
||||||
|
|
||||||
doc2String,
|
doc2String,
|
||||||
#normalizeSynonyms(default_return_first_Syn=config.get("preprocessing","default_return_first_Syn")),
|
keepUniqueTokens(),
|
||||||
|
#normalizeSynonyms(default_return_first_Syn=False),
|
||||||
|
lemmatize(),
|
||||||
replaceEmails(),
|
replaceEmails(),
|
||||||
replaceURLs(),
|
replaceURLs(),
|
||||||
replaceTwitterMentions(),
|
replaceTwitterMentions(),
|
||||||
removeWords(stop_words),
|
|
||||||
#removeAllPOS(["SPACE","PUNCT"]),
|
|
||||||
#removeAllENT(ents),
|
#removeAllENT(ents),
|
||||||
keepOnlyPOS(['NOUN'])
|
keepOnlyPOS(['NOUN'])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
clean_in_meta = {
|
clean_in_meta = {
|
||||||
"Loesung":removeAllPOS(["SPACE"]),
|
"Loesung":removeAllPOS(["SPACE"]),
|
||||||
"Zusammenfassung":removeAllPOS(["SPACE","PUNCT"])
|
"Zusammenfassung":removeAllPOS(["SPACE","PUNCT"])
|
||||||
|
@ -238,9 +244,6 @@ clean_in_meta = {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## add files to textacy-corpus,
|
## add files to textacy-corpus,
|
||||||
print("add texts to textacy-corpus...")
|
print("add texts to textacy-corpus...")
|
||||||
corpus.add_texts(
|
corpus.add_texts(
|
||||||
|
@ -250,9 +253,18 @@ corpus.add_texts(
|
||||||
|
|
||||||
printRandomDoc(corpus)
|
printRandomDoc(corpus)
|
||||||
|
|
||||||
|
# idea: build 3 different corpora
|
||||||
|
|
||||||
|
|
||||||
####################'####################' variables -- TODO: move everything into the config
|
|
||||||
|
####################'####################'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
####################'####################' TODO: move everything into the config
|
||||||
|
|
||||||
ngrams = (1,2)
|
ngrams = (1,2)
|
||||||
|
|
||||||
|
@ -268,7 +280,7 @@ weighting = ('tf' if topicModel == 'lda' else 'tfidf')
|
||||||
top_topic_words = 5
|
top_topic_words = 5
|
||||||
top_document_labels_per_topic = 2
|
top_document_labels_per_topic = 2
|
||||||
|
|
||||||
n_topics = 4
|
n_topics = len(set(corpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue