spacepaste

  1.  
'''
Check a file of new links (new.txt) against an existing file of links
(existing.txt) and write any links not already seen to a date-stamped
output file (output_DD_MM_YYYY.txt). existing.txt is then updated,
ready to be checked against next time.

On the first run, load your existing database of links (your first
batch of links) into new.txt, not existing.txt: links go through a
normalization ('cleaning') step, and existing.txt is populated with
the cleaned form.
'''
  9. import codecs
  10. import time
  11. from urlparse import urlparse
  12. def strip_bom(lines):
  13. ''' Return a list of strings with the leading BOM_UTF8 removed. '''
  14. return [ line if not line.startswith(codecs.BOM_UTF8) else line[3:] for line in lines]
  15. def load_existing():
  16. ''' Return a list of existing URLs with leading BOM_UTF8 removed. '''
  17. with open('existing.txt') as f:
  18. return strip_bom(f.readlines())
  19. def load_new():
  20. ''' Return a list of new URLs that have been normalized. '''
  21. with open('new.txt', 'r') as f:
  22. lines = strip_bom(f.readlines())
  23. lines = [line.rstrip('\n') for line in lines]
  24. urls = [urlparse(line).netloc+'/\n' for line in lines]
  25. urls = [url if not url.startswith('www.') else url[4:] for url in urls]
  26. return urls
  27. def log_new_links(links):
  28. ''' Write unique new links to a file. '''
  29. date = time.strftime("%d_%m_%Y")
  30. with open("output_" + date + ".txt", "w") as f:
  31. f.writelines(links)
  32. def append_to_existing(lines):
  33. ''' Append unique new links to existing links. '''
  34. # this appends the newly found links to the existing database of links ready for next time.
  35. with open('existing.txt', 'a') as f:
  36. f.writelines(lines)
  37. def main():
  38. new, existing = load_new(), load_existing()
  39. links = set(new).difference(existing)
  40. log_new_links(links)
  41. append_to_existing(links)
  42. main()
  43. print 'done'
  44.