spacepaste

  1.  
  2. def compare_documents(documentA, documentB):
  3. """
  4. Dsc: computes cosine text similarity from tf idf of given document.
  5. Inf - source: https://stackoverflow.com/questions/8897593/similarity-between-two-text-documents
  6. Dependencies: http://scikit-learn.org/stable/install.html
  7. from sklearn.feature_extraction.text import TfidfVectorizer
  8. pip install -U scikit-learn numpy scipy
  9. """
  10. try:
  11. print("compare_documents, documentA: '%s'" % documentA)
  12. except Exception as err:
  13. print("compare_documents, Could not print documentA, err: '%s'" % err)
  14. sys.stdout.flush()
  15. try:
  16. print("compare_documents, documentB: '%s'" % documentB)
  17. except Exception as err:
  18. print("compare_documents, Could not print documentB, err: '%s'" % err)
  19. sys.stdout.flush()
  20. input = [documentA, documentB]
  21. vect = TfidfVectorizer(min_df=1, decode_error='ignore')
  22. tfidf = vect.fit_transform(input)
  23. sim_matrix = (tfidf * tfidf.T).A
  24. return sim_matrix[0][1]
  25.