# predict_companies.py

# -*- coding: utf-8 -*-
"""predict_erwin.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1RFp77w0sjKF_xJFnXSXIdNJJLcqKLxDo
"""
import argparse

# Command-line interface. Both options are mandatory: the script cannot do
# anything useful without a text to classify and a model to load, so let
# argparse reject a missing option up front with a usage message.
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--string', required=True,
                    help='Company name to be predicted')
parser.add_argument('-p', '--load', required=True,
                    help='Loaded model')

args = parser.parse_args()

string = args.string
loaded_model = args.load

# Working copy of the input text; it is cleaned and encoded further below.
s = string

import tensorflow as tf

# Load the saved Keras model from the location given by the -p/--load option.
model = tf.keras.models.load_model(loaded_model)

from sklearn.feature_extraction.text import CountVectorizer
import re
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Rebuild the word -> integer-id vocabulary from the bookings CSV.
# NOTE(review): presumably this has to mirror the preprocessing used when the
# model was trained so that the ids line up -- confirm against the training code.
df = pd.read_csv('/content/drive/MyDrive/bookings_Erwin.csv', low_memory=False)

# Rows without a booking account are discarded.
df.dropna(subset=["booking_account"], inplace=True)


def _clean_description(text):
    """Lower-case *text*, drop digits, and turn non-word runs into one space."""
    text = re.sub(r'[0-9]', '', text.lower())
    return re.sub(r'[\W_]+', ' ', text)


# One pass over the column instead of three row-wise DataFrame.apply calls;
# the same Python regexes are applied in the same order.
df['description'] = df['description'].map(_clean_description)

# min_df=1 keeps every term, exactly like the former min_df=0, but is also
# accepted by scikit-learn releases that validate an integer min_df as >= 1.
# lowercase=False because the text has already been lower-cased above.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer.fit(df['description'])
corpus = vectorizer.vocabulary_

def text_to_num(s):
    """Translate a space-separated string into a list of vocabulary ids.

    The string is split on single space characters. Each piece that is a key
    of the module-level ``corpus`` mapping is replaced by its value; every
    other piece is replaced by 0.
    """
    return [corpus.get(token, 0) for token in s.split(' ')]

# Distinct booking accounts in sorted order; the position of the model's
# largest output is looked up in this array.
# NOTE(review): presumably this matches the label order used for training -- confirm.
labels = df["booking_account"].unique()
labels.sort()

# Length of the id sequence handed to the model (pad_sequences pads or
# truncates to this many entries).
num_words = 5

## cleaning texts
# Same three steps that are applied to the 'description' column: lower-case,
# drop digits, replace runs of non-word characters/underscores with a space.
s = s.lower()
s = re.sub(r'[0-9]', '', s)
# re.sub returns a new string; the result has to be assigned back, otherwise
# punctuation stays in the query and its words miss the vocabulary.
s = re.sub(r'[\W_]+', ' ', s)

# Encode the words as vocabulary ids.
s = text_to_num(s)

# Add a leading batch axis of size 1, then pad/truncate to num_words ids.
data = np.expand_dims(s, axis=0)
data = pad_sequences(data, maxlen=num_words)

hist = model.predict(data)

# Report the label at the index of the highest model output.
print('Prediction:', labels[np.argmax(hist)])