Edward Lance Lorilla |
【ANDROID STUDIO】Working With Interfaces Dagger Posted: 01 Jul 2021 08:17 AM PDT
package com.example.dagger |
# 【PYTHON】Word Cloud and remove punctuation, numbers, stopwords
#
# Notebook-style script reconstructed from a garbled single-line dump.
# Cleans text, plots unigram/bigram frequency distributions and a word
# cloud, then demonstrates reading .doc/.docx files with textract and
# python-docx.
import os
import string
from glob import glob

import docx
import matplotlib.pyplot as plt
import nltk
import textract
from bs4 import UnicodeDammit  # fix: UnicodeDammit was used below without any import
from nltk.corpus import stopwords
from wordcloud import WordCloud

# ---------------------------------------------------------------------------
# Clean a pre-loaded `text` string: strip punctuation and digits in one
# C-level pass with str.translate, then drop stopwords and short words.
# NOTE(review): `text` is assumed to have been loaded in an earlier cell
# (e.g. via textract) — confirm against the preceding notebook cells.
translator = str.maketrans('', '', string.punctuation + string.digits)
text = text.translate(translator)
text[:100]  # notebook display expression; a no-op when run as a script

nltk.download('stopwords')
en_stopwords = stopwords.words('english')
print(en_stopwords[:10])
en_stopwords = set(en_stopwords)  # set: O(1) membership tests in the filter below

words = text.lower().split()
words = [w for w in words if w not in en_stopwords and len(w) > 3]

# fix: dropped the redundant list() wrapper around the comprehension
bigrams = [' '.join(bg) for bg in nltk.bigrams(words)]

ug_fdist = nltk.FreqDist(words)
bg_fdist = nltk.FreqDist(bigrams)
ug_fdist.most_common(20)  # notebook display expressions
bg_fdist.most_common(20)

ug_fdist.plot(20)
bg_fdist.plot(20)

plt.figure(figsize=(5.5, 5.5))
wordcloud = WordCloud(collocations=False, height=300, width=300,
                      scale=3).generate(' '.join(words))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('B17030_06_02.png', dpi=300)
plt.show()

# ---------------------------------------------------------------------------
# A function which takes a path to a folder and generates unigram and
# bigram frequency plots.
en_stopwords = set(nltk.corpus.stopwords.words('english'))


def create_fdist_visualizations(path):
    """
    Takes a path to a folder with .docx files, reads and cleans text,
    then plots unigram and bigram frequency distributions.

    :param path: folder containing .docx files
    """
    word_docs = glob(os.path.join(path, '*.docx'))
    text = ' '.join(textract.process(w).decode('utf-8') for w in word_docs)
    # remove punctuation, numbers, stopwords
    translator = str.maketrans('', '', string.punctuation + string.digits)
    text = text.translate(translator)
    words = text.lower().split()
    words = [w for w in words if w not in en_stopwords and len(w) > 3]
    unigram_fd = nltk.FreqDist(words)
    # fix: original had a doubled assignment (`bigrams = bigrams = ...`)
    # plus a redundant list() wrapper around the comprehension
    bigram_fd = nltk.FreqDist(' '.join(bg) for bg in nltk.bigrams(words))
    unigram_fd.plot(20)
    bigram_fd.plot(20)


create_fdist_visualizations(r'data/gfsr_docs/docx/')

# ---------------------------------------------------------------------------
# .doc files: textract can also parse .doc (the older Word format), but
# requires the external 'antiword' tool — see the textract installation
# docs for Windows/Mac/Ubuntu instructions.
doc_text = textract.process(r'data/gfsr_docs/gfsa03-04rpt.doc')
# NOTE(review): the raw file looks to be windows-1252 (see the
# UnicodeDammit check below), but textract appears to emit utf-8 bytes —
# confirm before changing this decode.
doc_text = doc_text.decode('utf-8')
doc_text[:100]

# Check the encoding — in this case it looks to be windows-1252.
with open(r'data/gfsr_docs/gfsa03-04rpt.doc', 'rb') as f:
    blob = f.read()
suggestion = UnicodeDammit(blob)
print(suggestion.original_encoding)

# ---------------------------------------------------------------------------
# python-docx works as well, though there is an extra step to get the text.
# fix: `word_files` was never defined anywhere (NameError) — build the
# .docx list explicitly; presumably the same folder as above (TODO confirm).
word_files = glob(os.path.join(r'data/gfsr_docs/docx/', '*.docx'))
doc = docx.Document(word_files[0])
text = ' '.join(p.text for p in doc.paragraphs)
words = text.split()
words[:10]
You are subscribed to email updates from Edward Lance Lorilla. To stop receiving these emails, you may unsubscribe now. | Email delivery powered by Google |
Google, 1600 Amphitheatre Parkway, Mountain View, CA 94043, United States |
No comments:
Post a Comment