def compare_documents(documentA, documentB):
    """Return the TF-IDF based cosine similarity of two documents.

    Both documents are echoed to stdout (best effort) before they are
    vectorised, so calling this function writes to stdout.

    Approach taken from:
    https://stackoverflow.com/questions/8897593/similarity-between-two-text-documents

    Dependencies (http://scikit-learn.org/stable/install.html):
        pip install -U scikit-learn numpy scipy
        from sklearn.feature_extraction.text import TfidfVectorizer

    Args:
        documentA: First document, in a form TfidfVectorizer accepts.
        documentB: Second document, in a form TfidfVectorizer accepts.

    Returns:
        Entry [0][1] of the pairwise product of the two TF-IDF row vectors,
        i.e. the similarity score between documentA and documentB.

    Raises:
        Whatever TfidfVectorizer.fit_transform raises is propagated
        unchanged (presumably ValueError when no vocabulary can be built,
        e.g. for two empty documents -- confirm against scikit-learn docs).
    """
    for label, document in (("documentA", documentA), ("documentB", documentB)):
        try:
            # The value sits inside an explicit argument tuple so that a
            # tuple-valued document is formatted as one value rather than
            # being unpacked as several format arguments.
            print("compare_documents, %s: '%s'" % (label, document))
        except Exception as err:
            # Deliberately best effort: a document that cannot be printed
            # (e.g. an encoding problem on stdout) must not stop the
            # comparison itself.
            print("compare_documents, Could not print %s, err: '%s'" % (label, err))
        sys.stdout.flush()

    vectorizer = TfidfVectorizer(min_df=1, decode_error='ignore')
    tfidf = vectorizer.fit_transform([documentA, documentB])

    # .dot() is a true matrix product for both sparse matrices and sparse
    # arrays (where `*` is element-wise), and .toarray() replaces the `.A`
    # shortcut that newer SciPy releases no longer provide.
    pairwise = tfidf.dot(tfidf.T).toarray()
    return pairwise[0][1]