Edward Lance Lorilla |
【ANDROID STUDIO】Working With Interfaces Dagger Posted: 01 Jul 2021 08:17 AM PDT
package com.example.dagger |
# 【PYTHON】Word Cloud and remove punctuation, numbers, stopwords
#
# Notebook-style script reconstructed from a garbled single-line dump.
# Cleans text, plots unigram/bigram frequency distributions and a word
# cloud, then demonstrates reading .doc/.docx files with textract and
# python-docx.
import os
import string
from glob import glob

import docx
import matplotlib.pyplot as plt
import nltk
import textract
from bs4 import UnicodeDammit  # fix: UnicodeDammit was used below without any import
from nltk.corpus import stopwords
from wordcloud import WordCloud

# ---------------------------------------------------------------------------
# Clean a pre-loaded `text` string: strip punctuation and digits in one
# C-level pass with str.translate, then drop stopwords and short words.
# NOTE(review): `text` is assumed to have been loaded in an earlier cell
# (e.g. via textract) — confirm against the preceding notebook cells.
translator = str.maketrans('', '', string.punctuation + string.digits)
text = text.translate(translator)
text[:100]  # notebook display expression; a no-op when run as a script

nltk.download('stopwords')
en_stopwords = stopwords.words('english')
print(en_stopwords[:10])
en_stopwords = set(en_stopwords)  # set: O(1) membership tests in the filter below

words = text.lower().split()
words = [w for w in words if w not in en_stopwords and len(w) > 3]

# fix: dropped the redundant list() wrapper around the comprehension
bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]

ug_fdist = nltk.FreqDist(words)
bg_fdist = nltk.FreqDist(bigrams)
ug_fdist.most_common(20)  # notebook display expressions
bg_fdist.most_common(20)

ug_fdist.plot(20)
bg_fdist.plot(20)

plt.figure(figsize=(5.5, 5.5))
wordcloud = WordCloud(collocations=False, height=300, width=300,
                      scale=3).generate(' '.join(words))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('B17030_06_02.png', dpi=300)
plt.show()

# ---------------------------------------------------------------------------
# A function which takes a path to a folder and generates unigram and
# bigram frequency plots.
en_stopwords = set(nltk.corpus.stopwords.words('english'))


def create_fdist_visualizations(path):
    """
    Takes a path to a folder with .docx files, reads and cleans text,
    then plots unigram and bigram frequency distributions.

    :param path: folder containing .docx files
    """
    word_docs = glob(os.path.join(path, '*.docx'))
    text = ' '.join(textract.process(w).decode('utf-8') for w in word_docs)
    # remove punctuation, numbers, stopwords
    translator = str.maketrans('', '', string.punctuation + string.digits)
    text = text.translate(translator)
    words = text.lower().split()
    words = [w for w in words if w not in en_stopwords and len(w) > 3]
    unigram_fd = nltk.FreqDist(words)
    # fix: original had a doubled assignment (`bigrams = bigrams = ...`)
    # plus a redundant list() wrapper around the comprehension
    bigram_fd = nltk.FreqDist(' '.join(bg) for bg in nltk.bigrams(words))
    unigram_fd.plot(20)
    bigram_fd.plot(20)


create_fdist_visualizations(r'data/gfsr_docs/docx/')

# ---------------------------------------------------------------------------
# .doc files: textract can also parse .doc (the older Word format), but
# requires the external 'antiword' tool — see the textract installation
# docs for Windows/Mac/Ubuntu instructions.
doc_text = textract.process(r'data/gfsr_docs/gfsa03-04rpt.doc')
# NOTE(review): the raw file looks to be windows-1252 (see the
# UnicodeDammit check below), but textract appears to emit utf-8 bytes —
# confirm before changing this decode.
doc_text = doc_text.decode('utf-8')
doc_text[:100]

# Check the encoding — in this case it looks to be windows-1252.
with open(r'data/gfsr_docs/gfsa03-04rpt.doc', 'rb') as f:
    blob = f.read()
suggestion = UnicodeDammit(blob)
print(suggestion.original_encoding)

# ---------------------------------------------------------------------------
# python-docx works as well, though there is an extra step to get the text.
# fix: `word_files` was never defined anywhere (NameError) — build the
# .docx list explicitly; presumably the same folder as above (TODO confirm).
word_files = glob(os.path.join(r'data/gfsr_docs/docx/', '*.docx'))
doc = docx.Document(word_files[0])
text = ' '.join(p.text for p in doc.paragraphs)
words = text.split()
words[:10]
You are subscribed to email updates from Edward Lance Lorilla. To stop receiving these emails, you may unsubscribe now. | Email delivery powered by Google |
Google, 1600 Amphitheatre Parkway, Mountain View, CA 94043, United States |
No comments:
Post a Comment