Markov models

Markov models for text analysis

Issues

Cleaning strings

text cleaning example

import string
## string.punctuation is a constant string of punctuation characters;
## clean_string (below) uses it as its default set of characters to delete
print(string.punctuation)
## !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
def clean_string(s, delete_chars=string.punctuation):
    """Return a copy of ``s`` with every item of ``delete_chars`` removed.

    ``delete_chars`` is iterated item by item and each item is stripped
    from the string in turn; it defaults to ``string.punctuation``.
    """
    cleaned = s
    for unwanted in delete_chars:
        ## replacing with the empty string deletes every occurrence
        cleaned = cleaned.replace(unwanted, "")
    return cleaned
## with the default delete_chars, all punctuation is stripped while the
## letters keep their original case and order
x = "ab,Cde!?Q@#$I"
print(clean_string(x))
## abCdeQI

Markov text model algorithm

  1. Open and read the text file.
  2. Clean the file.
  3. Create the text dictionary, with each word as a key and, as its value, a list of the words that come next after it in the text.
  4. Randomly select a starting word from the text and then create a “sentence” of a specified length using randomly selected words from the dictionary.

markov_create function (outline)

def markov_create(file_name, sentence_length = 20):
    """Build a Markov text model from a file and return a random "sentence".

    Parameters:
        file_name: path of the text file to read.
        sentence_length: passed through to markov_sentence; per the
            outline, the number of words in the generated sentence.

    Returns:
        The generated words joined into one string, separated by spaces.

    Relies on clean_string, markov_dict and markov_sentence being defined
    elsewhere.
    """
    ## open the file and store its contents in a string; the with-block
    ## guarantees the file is closed again, even if reading fails
    with open(file_name, 'r') as text_file:
        text = text_file.read()
    ## clean the text and then split it into words
    clean_text = clean_string(text)
    word_list = clean_text.split()
    ## create the markov dictionary
    text_dict = markov_dict(word_list)
    ## Produce a sentence (a list of strings) of length
    ## sentence_length using the dictionary
    sentence = markov_sentence(text_dict, sentence_length)
    ## return the sentence as a single string using
    ## the .join() method.
    return " ".join(sentence)

the rest of it

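To complete this exercise, we need to produce the following functions: `markov_dict(word_list)`, which creates the text dictionary from the list of words, and `markov_sentence(text_dict, sentence_length)`, which produces the sentence as a list of words.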

the random module

random examples

import random
random.seed(101)    ## any integer you want
random.randrange(2, 102, 2) # random even integers (start 2, stop 102 excluded, step 2)
## 76
random.choice([1, 2, 3, 4, 5]) # random choice from list
## 2
## multiple choices (Python >=3.6): the count must be passed by keyword,
## because the second positional argument of random.choices is `weights`
## random.choices([1, 2, 3, 4, 5], k=9)
random.sample([1, 2, 3, 4, 5], 3) # rand. sample of 3 items
## [5, 3, 2]
random.random() # uniform random float between 0 and 1
## 0.048520987208713895
random.uniform(3, 7) # uniform random between 3 and 7
## 5.014081424907534

why random-number seeds?

## seeding makes the "random" draws reproducible: re-seeding with the same
## value restarts the sequence, so both loops below print the same numbers
random.seed(101)
for i in range(3):
    print(random.randrange(10))
## 9
## 3
## 8
random.seed(101)  ## same seed again -> same three draws
for i in range(3):
    print(random.randrange(10))
## 9
## 3
## 8

numpy Installation

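numpy is the fundamental package for scientific computing with Python. It contains, among other things: a powerful N-dimensional array object; sophisticated (broadcasting) functions; tools for integrating C/C++ and Fortran code; and useful linear algebra, Fourier transform, and random number capabilities.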

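numpy should already be installed with Anaconda or on syzygy. If not, you can install it yourself (for example with `pip install numpy` or `conda install numpy`). Good documentation can be found on the official NumPy site (https://numpy.org/doc/).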

arrays

array examples

import numpy as np  ## use "as np" so we can abbreviate
x = [1, 2, 3]  ## a plain Python list (not used again in this snippet)
## np.array builds an array from a list; dtype=float stores the values
## as floats, which is why they print with a trailing "."
a = np.array([1, 4, 5, 8], dtype=float)
print(a)
## [1. 4. 5. 8.]
print(type(a))
## <class 'numpy.ndarray'>
print(a.shape)  ## a tuple with one entry per dimension
## (4,)

shape

a1 = np.array([1,2])  ## no dtype given: it is inferred from the elements
print(a1.dtype)
## int64
## NOTE: the default integer dtype may differ by platform -- confirm locally
print(a1.shape)
## (2,)
print(len(a1))  ## matches the single entry of the shape here
## 2
a2 = np.array([1,2],float)  ## the second positional argument is the dtype
print(a2.dtype)
## float64

more array examples

x = [1, 'a', 3]  ## a list mixing integers and a string
a = np.array(x)  ## what happens?
b = np.array(range(10), float)  ## a range object is accepted too
c = np.arange(5, dtype=float)  ## array counterpart of range(5)
d = np.arange(2,4, 0.5, dtype=float)  ## start, stop, step; the step need not be an integer
np.ones(10)  ## array of the given length filled with 1.0
## array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
np.zeros(4)  ## array of the given length filled with 0.0
## array([0., 0., 0., 0.])

slicing and indexing

slicing/indexing examples

a1 = np.array([1.0, 2, 3, 4, 5, 6])
a1[1]  ## indexing starts at 0, so this is the second element
## 2.0
a1[:-3]  ## everything except the last three elements
## array([1., 2., 3.])
b1 = a1  ## plain assignment: b1 is another name for the same array
c1 = a1.copy()  ## .copy() makes an independent array
b1[1] = 23
a1[1]  ## the change made through b1 is visible through a1 ...
## 23.0
c1[1]  ## ... but the copy is unaffected
## 2.0

Multi-dimensional arrays

examples

nested = [[1, 2, 3], [4, 5, 6]]
a = np.array(nested, float)  ## a list of lists becomes a 2-D array
nested[0][2]  ## nested lists need one pair of brackets per level
## 3
a[0,2]  ## arrays take all indices in one bracket: row 0, column 2
## 3.0
a
## array([[1., 2., 3.],
##        [4., 5., 6.]])
a.shape  ## (rows, columns)
## (2, 3)

slicing and reshaping multi-dimensional arrays

examples

a = np.array([[1, 2, 3], [4, 5, 6]], float)
a[1, :]     ## row index 1
## array([4., 5., 6.])
a[:, 2]     ## column index 2
## array([3., 6.])
## last row, last two columns; slicing on both axes keeps the result 2-D
a[-1:, -2:] ## slicing rows and columns
## array([[5., 6.]])

reshaping

An array can be reshaped using the reshape(t) method, where we specify a tuple t that gives the new dimensions of the array.

a = np.array(range(10), float)  ## 1-D array holding 0.0 .. 9.0
## reshape to 5 rows by 2 columns; the values are laid out row by row
a = a.reshape((5,2))
print(a)
## [[0. 1.]
##  [2. 3.]
##  [4. 5.]
##  [6. 7.]
##  [8. 9.]]

flattening an array

.flatten() converts an array with a given shape to a 1-D array:

a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(a)
## [[1 2 3]
##  [4 5 6]
##  [7 8 9]]
## the rows are laid end to end, in order, to form the 1-D result
print(a.flatten())
## [1 2 3 4 5 6 7 8 9]

zero/one arrays

## `a` is the 3x3 array from the flattening example above
b = np.ones_like(a)  ## array of ones shaped like a
b.fill(33)  ## overwrite every element of b in place with 33

identity matrices


## np.identity(n) builds the n x n identity matrix
print(np.identity(4, dtype=float))
## [[1. 0. 0. 0.]
##  [0. 1. 0. 0.]
##  [0. 0. 1. 0.]
##  [0. 0. 0. 1.]]
## np.eye also takes a diagonal offset k; with k = -1 the ones sit on the
## first diagonal below the main one
print(np.eye(4, k = -1, dtype=int))
## [[0 0 0 0]
##  [1 0 0 0]
##  [0 1 0 0]
##  [0 0 1 0]]

array mathematics

a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[10, 11,12], [13, 14, 15], [16, 17, 18]])
## concatenate takes a tuple of arrays; here the rows of b are stacked
## below the rows of a
print(np.concatenate((a,b)))
## [[ 1  2  3]
##  [ 4  5  6]
##  [ 7  8  9]
##  [10 11 12]
##  [13 14 15]
##  [16 17 18]]

array operators

## arithmetic operators act element by element on arrays of equal shape
## (a and b are the 3x3 arrays defined in the previous example)
print(a+b)
## [[11 13 15]
##  [17 19 21]
##  [23 25 27]]
print(a*b)  ## element-wise product, not a matrix product
## [[ 10  22  36]
##  [ 52  70  90]
##  [112 136 162]]
print(a**b)  ## each element of a raised to the matching element of b
## [[                 1               2048             531441]
##  [          67108864         6103515625       470184984576]
##  [    33232930569601   2251799813685248 150094635296999121]]

adding arrays and scalars

## combining an array with a scalar applies the scalar to every element
print(a + 1)
## [[ 2  3  4]
##  [ 5  6  7]
##  [ 8  9 10]]
print(a/2)  ## / gives floats even though a holds integers
## [[0.5 1.  1.5]
##  [2.  2.5 3. ]
##  [3.5 4.  4.5]]
print(a ** 3)
## [[  1   8  27]
##  [ 64 125 216]
##  [343 512 729]]

more math functions

## numpy math functions such as np.sin work element by element
## (the argument is in radians)
print(np.sin(a))
## [[ 0.84147098  0.90929743  0.14112001]
##  [-0.7568025  -0.95892427 -0.2794155 ]
##  [ 0.6569866   0.98935825  0.41211849]]
## with no arguments these methods reduce over all elements of the array
print(a.sum())
## 45
print(a.prod())
## 362880
print(a.mean())
## 5.0