8.3 RNN in Keras for Text Data (NLP)

import os

import numpy as np
np.random.seed(123)
print("NumPy:{}".format(np.__version__))

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize']=15,10
print("Matplotlib:{}".format(mpl.__version__))

import tensorflow as tf
tf.set_random_seed(123)
print("TensorFlow:{}".format(tf.__version__))

from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation
from keras.losses import mean_squared_error as k_mse
from keras.backend import sqrt as k_sqrt
import keras.backend as K
import keras
print("Keras:{}".format(keras.__version__))
NumPy:1.13.1
Matplotlib:2.1.0
TensorFlow:1.4.1

Using TensorFlow backend.

Keras:2.0.9

DATASETSLIB_HOME = os.path.join(os.path.expanduser('~'),'dl-ts','datasetslib')
import sys
if DATASETSLIB_HOME not in sys.path:
    sys.path.append(DATASETSLIB_HOME)
%reload_ext autoreload
%autoreload 2
import datasetslib

from datasetslib import util as dsu
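# datasetslib (loaded from DATASETSLIB_HOME above) provides the dataset helpers used below;
# datasets_root is the directory where downloaded datasets are stored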
datasetslib.datasets_root = os.path.join(os.path.expanduser('~'),'datasets')

Text Generation with Text8 Data in Keras

Load and Prepare Text8 Data

from datasetslib.text8 import Text8
text8 = Text8()
text8.load_data(clip_at=5000) # downloads data, converts words to ids, converts files to a list of ids
print('Train:', text8.part['train'][0:5])
#print(text8.part['test'][0:5])
#print(text8.part['valid'][0:5])
print('Vocabulary Length = ',text8.vocab_len)
Already exists: /home/armando/datasets/text8/text8.zip
Train: [  8 497   7   5 116]
Vocabulary Length =  1457

def id2string(ids):
    return ' '.join([text8.id2word[x_i] for x_i in ids])
print(id2string(text8.part['train'][0:100]))
anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing
# parameters

n_x = 5 # number of input words
n_y = 1 # number of output words
n_x_vars = 1 # in case of our text, there is only 1 variable at each timestep
n_y_vars = text8.vocab_len

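# two seed sequences for text generation: 5 random word ids drawn from the first 250 ids,
# and the first 5 words of the training text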
random5 = np.random.choice(n_x * 50, n_x, replace=False)
print('Random 5 words: ',id2string(random5))
first5 = text8.part['train'][0:n_x].copy()
print('First 5 words: ',id2string(first5))
Random 5 words:  free bolshevik be n another
First 5 words:  anarchism originated as a term
# reset the jupyter buffers
tf.reset_default_graph()
keras.backend.clear_session()
# get the data
x_train, y_train = text8.seq_to_xy(seq=text8.part['train'],n_tx=n_x,n_ty=n_y)

# reshape input to be [samples, time steps, features]
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1],1)
#x_test = x_test.reshape(x_test.shape[0], x_train.shape[1], 1)
y_onehot = np.zeros(shape=[y_train.shape[0],text8.vocab_len],dtype=np.float32)
for i in range(y_train.shape[0]):
    y_onehot[i,y_train[i]]=1
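The loop above builds the one-hot targets by hand. A shorter route, assuming keras.utils.to_categorical behaves as in Keras 2.x, is sketched below; it also shows a plain-NumPy sliding-window transform that should match what text8.seq_to_xy returns (an assumption about datasetslib, not a documented contract):

# a minimal sketch, assuming seq_to_xy yields windows of n_tx ids with the next id as the target
import numpy as np
from keras.utils import to_categorical

def make_windows(seq, n_tx=5):
    # each row of x is n_tx consecutive word ids; y is the id that follows the window
    x = np.array([seq[i:i + n_tx] for i in range(len(seq) - n_tx)])
    y = np.array([seq[i + n_tx] for i in range(len(seq) - n_tx)])
    return x, y

# x_alt, y_alt = make_windows(text8.part['train'], n_tx=n_x)
# y_onehot_alt = to_categorical(y_alt, num_classes=text8.vocab_len)  # same shape as y_onehot above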


# parameters
n_epochs = 1000
batch_size=128
state_size=128
n_epochs_display=100

# create and fit the LSTM model
model = Sequential()
model.add(LSTM(units=state_size, 
               input_shape=(x_train.shape[1], x_train.shape[2]),
               return_sequences=False
              )
         )
model.add(Dense(text8.vocab_len))
model.add(Activation('softmax'))

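# softmax over the vocabulary with categorical cross-entropy matches the one-hot targets built earlier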
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

#random5 = np.random.choice(n_x * 100, n_x, replace=False)
print('Random 5 words: ',id2string(random5))
#first5 = text8.part['train'][0:n_x].copy()
print('First 5 words: ',id2string(first5))

print('\nLet\'s train and predict now:\n')
for j in range(n_epochs // n_epochs_display):
    model.fit(x_train, y_onehot, epochs=n_epochs_display, batch_size=batch_size,verbose=0)

    # generate text
    # buffers for the 10 generated word ids (integer dtype so they index the vocabulary cleanly)
    y_pred_r5 = np.empty([10], dtype=np.int32)
    y_pred_f5 = np.empty([10], dtype=np.int32)

    x_test_r5 = random5.copy()
    x_test_f5 = first5.copy()
    # let us generate text of 10 words after feeding 5 words
    for i in range(10):
        for x,y in zip([x_test_r5,x_test_f5],[y_pred_r5,y_pred_f5]):
            x_input = x.copy()
            x_input = x_input.reshape(-1, n_x, n_x_vars)
            y_pred = model.predict(x_input)[0]
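            # greedy decoding: pick the single most probable next word id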
            y_pred_id = np.argmax(y_pred)
            y[i]=y_pred_id
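            # slide the input window: drop the oldest word and append the prediction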
            x[:-1] = x[1:]
            x[-1] = y_pred_id
    print('Epoch: ',((j+1) * n_epochs_display)-1)
    print('  Random5 prediction:',id2string(y_pred_r5))
    print('  First5 prediction:',id2string(y_pred_f5))
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, 128)               66560     
_________________________________________________________________
dense_1 (Dense)              (None, 1457)              187953    
_________________________________________________________________
activation_1 (Activation)    (None, 1457)              0         
=================================================================
Total params: 254,513
Trainable params: 254,513
Non-trainable params: 0
_________________________________________________________________
Random 5 words:  free bolshevik be n another
First 5 words:  anarchism originated as a term

Let's train and predict now:

Epoch:  99
  Random5 prediction: anarchistic anarchistic wrote wrote wrote wrote wrote wrote wrote wrote
  First5 prediction: self nature nature war than than than than than than
Epoch:  199
  Random5 prediction: anarchistic anarchistic wrote wrote wrote wrote wrote wrote wrote wrote
  First5 prediction: self i nature french french french french french french french
Epoch:  299
  Random5 prediction: anarchistic anarchistic wrote wrote wrote wrote wrote wrote wrote wrote
  First5 prediction: term i revolutionary revolutionary french french french french french french
Epoch:  399
  Random5 prediction: anarchistic anarchistic amongst wrote wrote wrote wrote wrote wrote wrote
  First5 prediction: term i revolutionary revolutionary french french french french french french
Epoch:  499
  Random5 prediction: tolstoy anarchistic amongst wrote wrote wrote wrote wrote wrote wrote
  First5 prediction: term i revolutionary revolutionary french french french french french french
Epoch:  599
  Random5 prediction: tolstoy anarchistic true wrote wrote wrote wrote wrote wrote wrote
  First5 prediction: term i revolutionary revolutionary french french french french french french
Epoch:  699
  Random5 prediction: tolstoy anarchistic true tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy
  First5 prediction: term i revolutionary revolutionary had french french french french french
Epoch:  799
  Random5 prediction: tolstoy anarchistic tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy
  First5 prediction: term i revolutionary revolutionary french french french french french french
Epoch:  899
  Random5 prediction: tolstoy anarchistic tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy
  First5 prediction: term i revolutionary revolutionary had french french french french french
Epoch:  999
  Random5 prediction: tolstoy anarchistic tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy tolstoy
  First5 prediction: term i revolutionary revolutionary had french french french french french
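The parameter counts in the summary can be checked by hand: the LSTM layer has four gates, each with input weights, recurrent weights, and a bias, and the Dense layer maps the 128-unit state onto the 1457-word vocabulary. A quick verification, assuming the standard Keras LSTM parameterization:

state_size, n_features, vocab_len = 128, 1, 1457

# LSTM: 4 gates x (input weights + recurrent weights + bias)
lstm_params = 4 * (state_size * n_features + state_size * state_size + state_size)
# Dense: a weight per (state unit, word) pair plus a bias per word
dense_params = state_size * vocab_len + vocab_len

print(lstm_params)                  # 66560, as reported by model.summary()
print(dense_params)                 # 187953
print(lstm_params + dense_params)   # 254513 trainable parameters in total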

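The generated sequences quickly collapse into repetition ('wrote wrote wrote ...', 'french french french ...') because the loop always takes the argmax. Sampling from the softmax output with a temperature is a common way to get more varied text; a minimal sketch is shown below (the sample_word helper is hypothetical, not part of the chapter's code):

import numpy as np

def sample_word(probs, temperature=1.0):
    # rescale the predicted distribution by a temperature and draw a word id from it
    probs = np.asarray(probs, dtype=np.float64)
    logits = np.log(probs + 1e-8) / temperature
    probs = np.exp(logits) / np.sum(np.exp(logits))
    return np.random.choice(len(probs), p=probs)

# inside the generation loop, replace np.argmax(y_pred) with, for example:
# y_pred_id = sample_word(y_pred, temperature=0.8)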
results matching ""

    No results matching ""