Commit 26fce1d6 authored by Vermillion's avatar Vermillion
Browse files

keras implemented, install tensorflow and keras (probably in a virtual...

keras implemented, install tensorflow and keras (probably in a virtual environment, you can read that from the page) and run classifier.py; commented out printing in arr_to_class function of pandas_stuff.py
parent b6c12de4
# LSTM with Dropout for sequence classification in the IMDB dataset
import json
import os.path
import numpy as np
import pandas_stuff as pd
import keras.preprocessing.text as kpt
from keras.models import Sequential, model_from_json
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
# NOTE(review): assigning __path__ on a plain module is unusual (it normally
# exists only on packages) — presumably a leftover absolute-path annotation;
# confirm anything actually reads it before keeping.
__path__ = "C:\\txtcls\\project.dir\\classifier.py"
top_words = 5000  # vocabulary size: only the 5000 most frequent words get indices
max_length = 1000  # every text sequence is truncated/padded to this many indices
def make_dict(df, row, outfile='dict.json'):
    """Build (or reload) the word -> index dictionary for text column `row`.

    When `outfile` already exists its cached JSON mapping is returned and the
    dataframe is never touched. Otherwise a Tokenizer is fitted on the column,
    its word_index is written to `outfile`, and that mapping is returned.
    """
    # Fast path: reuse the previously saved dictionary.
    if os.path.exists(outfile):
        with open(outfile, 'r') as f:
            return json.load(f)
    # Slow path: fit a tokenizer on the raw text and persist its vocabulary.
    texts = df[row]
    tokenizer = Tokenizer(num_words=top_words)  # keep the top_words most frequent words
    tokenizer.fit_on_texts(texts)
    with open(outfile, 'w') as f:
        json.dump(tokenizer.word_index, f)  # cache the word-index mapping
    return tokenizer.word_index
def text_to_indices(text, dictionary):
    """Translate raw `text` into a list of vocabulary indices.

    BUGFIX: the previous version did `dictionary[word]`, which raised
    KeyError for any word absent from the fitted vocabulary (easily hit
    when indexing a posting the tokenizer never saw). Unknown words and
    words ranked >= top_words now both map to 0, the out-of-vocabulary
    marker already used for rare words.
    """
    indices = []
    for word in kpt.text_to_word_sequence(text):
        idx = dictionary.get(word, 0)  # 0 = out-of-vocabulary marker
        indices.append(idx if idx < top_words else 0)
    return indices
# Returns the vectorized job posting and expected output from the dataframe df.
def get_from_frame(df, row, cols, elem):
    """Fetch sample `elem` as a (fixed-length index vector, label row) pair."""
    texts = df[row]            # raw text column
    labels = df[cols].values   # expected outputs
    dictionary = make_dict(df, row)
    # Vectorize the single requested posting into dictionary indices.
    vec = np.asarray(text_to_indices(texts[elem], dictionary))
    # Truncate over-long sequences, then left-pad with zeros to max_length.
    if len(vec) > max_length:
        vec = np.resize(vec, max_length)
    vec = np.pad(vec, pad_width=(max_length - len(vec), 0), mode='constant')
    # Hand back the (x, y) pair at index elem.
    return (vec, labels[elem])
def ML_lstm(df, row, cols, model=None):
    """Train (or continue training) an LSTM text classifier.

    df    -- dataframe holding raw text and one-hot label columns
    row   -- name of the text column
    cols  -- list of one-hot label column names (one per class)
    model -- optional pre-built keras model to continue training;
             when None a fresh network is constructed

    Returns the fitted keras model.
    """
    x = df[row]               # raw text
    y = df[cols].values       # one-hot targets, shape (n_samples, len(cols))
    dictionary = make_dict(df, row)
    # Convert raw text data into (dictionary-)indexed sequences.
    x = np.asarray([text_to_indices(text, dictionary) for text in x])
    # Hold out 10% of the data for final evaluation.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.1, shuffle=True)
    # Truncate/pad every input sequence to a fixed length.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_test = sequence.pad_sequences(x_test, maxlen=max_length)
    # Build the network when none was supplied.
    vector_len = 32  # embedding dimensionality
    if model is None:  # BUGFIX: identity check, not `== None`
        model = Sequential()
        model.add(Embedding(top_words, vector_len, input_length=max_length))
        model.add(Dropout(0.2))
        model.add(LSTM(100))
        model.add(Dropout(0.2))
        # Softmax over one unit per label column (was hard-coded 5).
        model.add(Dense(len(cols), activation='softmax'))
        # BUGFIX: binary_crossentropy treats each of the softmax outputs as an
        # independent binary label and reports misleadingly high accuracy;
        # categorical_crossentropy is the matching loss for one-hot softmax.
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    model.fit(x_train, y_train, epochs=2, batch_size=64)
    # Final evaluation of the model on the held-out split.
    scores = model.evaluate(x_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1] * 100))
    return model
# fix random seed for reproducibility
np.random.seed(7)

model = None
y_cols = ["hs_ged", "associates", "bachelors", "masters", "phd_prof"]
df = pd.load_df("training.csv")

# Offer to reload a previously trained network when both model files exist.
if os.path.exists('model.json') and os.path.exists('model.h5'):
    loading = input("Model found, load and test? y/n: ")
    if loading in ['y', 'Y', 'yes']:
        # load previously-trained model (architecture + weights)
        with open('model.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        model = model_from_json(loaded_model_json)
        model.load_weights('model.h5')
# BUGFIX: train whenever no model was loaded — previously either declining the
# load prompt or having no saved files (depending on how the else bound) left
# `model` as None and crashed at model.predict below.
if model is None:
    model = ML_lstm(df, 'body', y_cols)  # let's try it

# now we can run some tests interactively
print("Model generated / loaded")
while 1:
    val = input("Input a number from 0~8000 or so: ")
    if not val.isdigit():
        print("non-integer value, exiting...")
        break
    val = int(val)
    x, y = get_from_frame(df, "body", y_cols, val)
    x = np.asarray([x])  # model expects a leading batch dimension
    pred = model.predict(x)  # get model's prediction of input sample
    print("Element #%d actual y value = " % val, y)
    print("Element #%d predicted y value = " % val, pred)

# save everything
should_save = input("Save network? (y/n): ")
if should_save in ['y', 'Y', 'yes']:
    model_json = model.to_json()
    with open('model.json', 'w') as json_file:
        json_file.write(model_json)
    model.save_weights('model.h5')
    print('Keras network model saved.')
This diff is collapsed.
File added
{"class_name": "Sequential", "config": {"name": "sequential_1", "layers": [{"class_name": "Embedding", "config": {"name": "embedding_1", "trainable": true, "batch_input_shape": [null, 1000], "dtype": "float32", "input_dim": 5000, "output_dim": 32, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 1000}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.2, "noise_shape": null, "seed": null}}, {"class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 100, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 1}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.2, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 5, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, 
"activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "keras_version": "2.2.4", "backend": "tensorflow"}
\ No newline at end of file
......@@ -33,7 +33,7 @@ def nd_arr_to_class(nd_arr):
classes.append(num)
num_classes.add(num)
if num > 6:
print(row)
#print(row)
num_bad += 1
print("Num Classes", len(num_classes))
print("Num Bad", num_bad)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment