Chatbot for reading PDF files using Python
In one of my articles, I explained how to read pdf files and return a tokenised corpus of the pdf. Here, I have used horticulture data as my pdf.
Moving on to the next steps, we will need the following pieces overall:
Importing useful libraries
import random
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
A function that reads the PDF file with OCR and returns its text as a list of sentence tokens.
def get_pdf_data(user_resp):
    """Run OCR over the horticulture PDF and return its text as sentences.

    Every page of the PDF is rendered to ``page_<n>.jpg`` in the current
    working directory, the image is passed through Tesseract, and the
    recognised text of all pages is concatenated and split into sentences
    with ``nltk.sent_tokenize``.

    Args:
        user_resp: The user's query. It is not used by this function; the
            parameter is kept so that existing callers keep working.

    Returns:
        A list of sentence strings extracted from the PDF.
    """
    PDF_file = "horticulturae-03-00030-v2.pdf"
    # 500 is the rendering resolution (dpi) handed to pdf2image.
    pages = convert_from_path(
        PDF_file, 500, poppler_path=r'C:\Program Files (x86)\poppler-0.68.0\bin'
    )
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

    page_texts = []
    for image_counter, page in enumerate(pages, start=1):
        filename = "page_" + str(image_counter) + ".jpg"
        page.save(filename, 'JPEG')
        # Open the saved page inside a context manager so the file handle is
        # released as soon as OCR for that page is finished.
        with Image.open(filename) as image:
            text = str(pytesseract.image_to_string(image))
        # Re-join words that were hyphenated across a line break.
        page_texts.append(text.replace('-\n', ''))

    # Join once instead of growing a string with repeated "+=".
    corpus = ''.join(page_texts)
    return nltk.sent_tokenize(corpus)
Clean the data and make it usable.
# Translation table that deletes every ASCII punctuation character.
# LemNormalize needs it; without this definition the first call raises NameError.
remove_punct_dict = str.maketrans('', '', string.punctuation)


def LemNormalize(corpus):
    """Lower-case *corpus*, drop punctuation and split it into word tokens.

    Despite the name, no lemmatisation is performed: the text is only
    lower-cased, stripped of ASCII punctuation and tokenised with
    ``nltk.word_tokenize``.

    Args:
        corpus: The text to normalise.

    Returns:
        A list of word tokens.
    """
    return nltk.word_tokenize(corpus.lower().translate(remove_punct_dict))
#Greeting Inputs
GREETING_INPUTS = ["hi", "hello", "hola", "greetings", "wassup", "hey"]
# Greeting responses back to the user
GREETING_RESPONSES = ["howdy", "hi", "hey", "what's good", "hello", "hey there"]


def greeting(sentence):
    """Return a random greeting if *sentence* contains a greeting word.

    The sentence is split on whitespace. Each word is lower-cased and has
    leading/trailing punctuation removed before it is compared with
    ``GREETING_INPUTS``, so inputs such as "Hi!" or "hello," are recognised.

    Args:
        sentence: The text typed by the user.

    Returns:
        A randomly chosen entry of ``GREETING_RESPONSES`` when a greeting
        word is found, otherwise ``None``.
    """
    for word in sentence.split():
        if word.lower().strip(string.punctuation) in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)
    return None
Create a text similarity detection function that matches the user inputs and returns similar sentences.
Here I have used TF-IDF vectors and cosine similarity scores for matching the data with user-input
def response(user_response):
    """Return the PDF sentence that is most similar to the user's query.

    The sentences returned by ``get_pdf_data`` and the query are turned into
    TF-IDF vectors with a shared vocabulary; the query vector is then
    compared with every corpus sentence using cosine similarity.

    Args:
        user_response: The user's query.

    Returns:
        The best-matching sentence from the PDF, or the apology
        "I apologize, I don't understand." when the PDF yields no sentences
        or no sentence has a non-zero similarity to the query.
    """
    user_response = user_response.lower()
    no_match = "I apologize, I don't understand."

    sent_tokens = get_pdf_data(user_response)
    if not sent_tokens:
        return no_match

    # Vectorise corpus and query together so both use the same vocabulary.
    # A new list is built so the list returned by get_pdf_data is not mutated.
    documents = list(sent_tokens) + [user_response]
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(documents)

    # Compare the query (last row) with the corpus rows only. Excluding the
    # query's own row means it can never be returned as its own answer,
    # which could happen with "second largest of all scores" when scores tie.
    vals = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
    idx = vals.argmax()

    # A score of 0 means no corpus sentence shares any term with the query.
    if vals[idx] == 0:
        return no_match
    return sent_tokens[idx]
A loop that reads the user's input and prints the bot's replies.
# Interactive chat loop: read a line, answer it, and stop on "bye",
# "thanks" or "thank you".
flag = True
print("GrassBot: Hi! I will answer your queries.Please Ask. If you want to exit, type Bye!")
while flag:
    try:
        # Surrounding whitespace is dropped so that e.g. "bye " still exits.
        user_response = input().strip().lower()
    except EOFError:
        # End of input (e.g. piped stdin ran out): leave as if the user said bye.
        user_response = 'bye'

    if user_response == 'bye':
        flag = False
        print("GrassBot: Chat with you later !")
    elif user_response in ('thanks', 'thank you'):
        flag = False
        print("GrassBot: You are welcome !")
    else:
        # Call greeting() once: it picks its reply at random, so a second
        # call would make a different (wasted) choice.
        greeting_reply = greeting(user_response)
        if greeting_reply is not None:
            print("GrassBot: " + greeting_reply)
        else:
            print("GrassBot: " + response(user_response))
Sample output :
GrassBot: Hi! I will answer your queries.Please Ask. If you want to exit, type Bye!
Hi
GrassBot: howdy
what are the Ions that contribute to soil salinity ?
GrassBot: Ions that contribute to soil salinity include Cl-, SO,?-, HCO,-, Nat, Ca2+, Mg?t,
and, rarely, NO; or Kt.
what are nutritional effects of salinity ?
GrassBot: Nutritional Effects
SALINITY ANDCATIONNUTRITION The major nutritional effects of salinity are
those associated with cation nutrition.
thanks
GrassBot: You are welcome !
I have tried to keep this as simple as possible as a starting point. There are many optimisations we could make here, and I would love to hear your suggestions in the comments :)
Thanks!