pip3 install -r requirements.txt
to install all the necessary libraries. Then, to set up nltk
, make a new directory with mkdir nltk_data
, cd
into it, and then run python3 -m nltk.downloader
. You should see a window like this; select all packages as shown in the screenshot below. Then run ngrok http 5000
in the directory your code is in.{ "loves me": [ "do you want some food", "you're so nice", "i got you some food", "I like your hair", "You looked nice today", "Let's dance", "I spent time on this for you", "i got this for you", "heyyyyyyy", "i got you pizza" ], "loves me not": [ "I didn't have the time", "Can you get your own food", "You'll have to get your own food", "Do it yourself", "i can't", "next time", "i'm sorry", "you up", "hey", "wyd", "k", "idk man", "cool" ] }
open_file
to save the data from data.json
as a variable data
import re
import json
import random

import numpy as np
import tensorflow as tf
import tflearn
from flask import Flask, request
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from twilio.twiml.messaging_response import MessagingResponse


def open_file(file):
    """Load and return the parsed JSON contents of *file*.

    Args:
        file: Path to a JSON file (here, the ``data.json`` training data).

    Returns:
        The deserialized JSON object — a dict mapping each label to a list
        of raw training phrases.
    """
    # Explicit encoding avoids platform-dependent default-codec surprises.
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)


data = open_file('data.json')
print(data)
data
# Initialize the lemmatizer once at module level; it is reused for both the
# training vocabulary and incoming messages.
lemma = WordNetLemmatizer()


def tokenize_and_stem_text(text):
    """Lower-case and lemmatize every token in *text* (an iterable of words).

    Returns a new list of the lemmatized forms, in the original order.
    """
    stemmed = []
    for word in text:
        stemmed.append(lemma.lemmatize(word.lower()))
    return stemmed
# The possible classification labels, taken from the JSON keys
# ("loves me" / "loves me not").
binary_categories = list(data.keys())
# Flat list of every token seen in the training phrases; filled by
# read_training_data, then lemmatized to become the bag-of-words vocabulary.
training_words = []
# Will hold one (token_list, label) tuple per training phrase.
json_data = []
json_data
to hold tuples of words from the sentence and also the label name. The training_words
list will contain all the unique stemmed words from the training data JSON and binary_categories
def read_training_data(data):
    """Tokenize every training phrase and record it with its label.

    Side effects: extends the module-level ``training_words`` vocabulary
    list and appends ``(tokens, label)`` tuples to the module-level
    ``json_data`` list.

    Args:
        data: Mapping of label -> list of raw training phrases.

    Returns:
        The (shared, mutated) ``json_data`` list of (tokens, label) tuples.
    """
    for label, phrases in data.items():
        for text in phrases:
            # Expand contractions ("can't" -> "cannot") so tokens match the
            # expanded forms kept in the vocabulary.
            for word in text.split():
                if word.lower() in contractions:
                    text = text.replace(word, contractions[word.lower()])
            # Keep only letters, apostrophes and spaces (raw string so the
            # regex reads literally).
            text = re.sub(r"[^a-zA-Z' ]+", ' ', text)
            # Tokenize once and reuse; the original tokenized the same
            # cleaned text twice per phrase.
            tokens = word_tokenize(text)
            training_words.extend(tokens)
            json_data.append((tokens, label))
    return json_data
json_data
returned is a list of words from each sentence and either "loves me" or "loves me not"; for example, one element of that list is (["do", "you", "want", "some", "food"], "loves me")
# Map of common English contractions to their expanded forms, used by
# read_training_data to normalize phrases before tokenizing.  This list
# does not cover every possible contraction — extend as needed.
contractions = {
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "how'd": "how did",
    "how's": "how is",
    "i'd": "I had",
    "i'll": "I will",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "let's": "let us",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that had",
    "that's": "that is",
    "there's": "there is",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "what'll": "what will",
    "what's": "what is",
    "when's": "when is",
    "where'd": "where did",
    "where's": "where is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you had",
    "you'll": "you will",
    "you're": "you are",
}
read_training_data
# Populate json_data / training_words from the raw phrases, then lemmatize
# the collected vocabulary.  The original called read_training_data TWICE
# (duplicating every training example) and lemmatized training_words
# *before* it was filled, so the vocabulary was never actually stemmed.
json_data = read_training_data(data)
print(json_data)
training_words = tokenize_and_stem_text(training_words)
vector
here. We loop through the words in the phrase, stemming them and comparing with each word in the vocabulary. If the sentence has a word in our training data or vocabulary, 1
is appended to the vector, signaling which label the word belongs to. If not, a 0
# Convert each (tokens, label) pair into a bag-of-words vector paired with
# a one-hot label row.
training = []
for token_words, label in json_data:
    # Lemmatize the phrase's tokens; a set gives O(1) membership tests
    # against the vocabulary instead of a linear scan per word.
    stemmed = {lemma.lemmatize(word.lower()) for word in token_words}
    # 1 where the vocabulary word occurs in this phrase, else 0.
    bag_vector = [1 if word in stemmed else 0 for word in training_words]
    # One-hot encode the label.
    output_row = [0] * len(binary_categories)
    output_row[binary_categories.index(label)] = 1
    training.append([bag_vector, output_row])
training
to a numpy
array so TensorFlow can process it as well, and split it into two variables: data
has the bag of words and labels
# Split the training pairs into inputs (bag vectors) and targets (one-hot
# labels).  dtype=object is required: each row pairs a len(vocabulary)
# bag with a len(binary_categories) one-hot row, so the nested lists are
# ragged and NumPy >= 1.24 refuses to build the array implicitly.
training = np.array(training, dtype=object)
data = list(training[:, 0])
labels = list(training[:, 1])
input_data
input layer is for inputting or feeding data to a network, and the input to the network has size len(data[0])
for the length of our encoded bag of words and labels.softmax
# Clear any existing TF1 graph so reruns (e.g. in a notebook) start fresh.
tf.reset_default_graph()
# Input layer: one slot per vocabulary word in the bag-of-words encoding.
net = tflearn.input_data(shape=[None, len(data[0])])
# Single hidden layer of 32 units.
net = tflearn.fully_connected(net, 32)
# Output layer sized to the label one-hot; softmax because the
# categories are mutually exclusive.
net = tflearn.fully_connected(net, len(labels[0]), activation='softmax')
# Regression layer applies tflearn's default optimizer and loss.
net = tflearn.regression(net)
fit
method begins training and applies the gradient descent algorithm, a common first-order optimization deep learning algorithm. n_epoch
is the number of times the network will see all the data and batch_size
# Wrap the network in a trainable DNN model.
model = tflearn.DNN(net)
# fit runs gradient-descent training: n_epoch full passes over the data,
# sliced into mini-batches of 16, printing accuracy as it goes.
model.fit(data, labels, n_epoch=100, batch_size=16, show_metric=True)
def clean_for_tf(text):
    """Encode *text* as a bag-of-words vector over ``training_words``.

    Args:
        text: Raw message string.

    Returns:
        np.ndarray of 0/1 flags, one per vocabulary entry, marking which
        vocabulary words appear in the lemmatized input.
    """
    input_words = tokenize_and_stem_text(word_tokenize(text))
    vector = [0] * len(training_words)
    # Map each vocabulary word to all of its positions once, replacing the
    # original O(len(input) * len(vocabulary)) nested scan.  Duplicates in
    # training_words are preserved: every matching slot is set, as before.
    positions = {}
    for ind, word in enumerate(training_words):
        positions.setdefault(word, []).append(ind)
    for input_word in input_words:
        for ind in positions.get(input_word, ()):
            vector[ind] = 1
    return np.array(vector)
# Classify a sample message: predict returns one softmax row per input;
# argmax picks the index of the most probable category.  The original
# contained a literal INSERT-TEXT-HERE placeholder, which is not valid
# Python.
sample_text = "i got you pizza"  # replace with any message to classify
tensor = model.predict([clean_for_tf(sample_text)])
print(binary_categories[np.argmax(tensor)])
predict
app = Flask(__name__)


@app.route("/sms", methods=['POST'])
def sms():
    """Twilio webhook: classify the inbound SMS body and reply with the label."""
    resp = MessagingResponse()
    # Default to '' so a request without a Body field doesn't crash on
    # .lower() (request.values.get returns None for missing keys).
    inbMsg = request.values.get('Body', '').lower().strip()
    tensor = model.predict([clean_for_tf(inbMsg)])
    resp.message(
        f'The message {inbMsg!r} corresponds to '
        f'{binary_categories[np.argmax(tensor)]!r}.')
    return str(resp)