-
- '''
- The following simply checks a file of links (new.txt) against an existing file of links (existing.txt)
- and outputs any unique links that are found (output.txt).
-
- The existing.txt file is then updated ready to be checked against next time.
-
- You should load your existing database links (basically your first batch of links) into the new.txt file first and not the existing.txt
- file as the links go through a 'cleaning process.' Then the existing.txt file will populate with the 'cleaned links'
- '''
- import codecs
- import time
- from urlparse import urlparse
-
def strip_bom(lines):
    ''' Return the given lines with any leading UTF-8 BOM removed. '''
    bom = codecs.BOM_UTF8
    cleaned = []
    for line in lines:
        if line.startswith(bom):
            # Drop the 3-byte BOM marker some editors prepend.
            cleaned.append(line[len(bom):])
        else:
            cleaned.append(line)
    return cleaned
-
def load_existing():
    ''' Return the list of URLs already recorded in existing.txt,
    with any leading UTF-8 BOM removed. '''
    with open('existing.txt') as infile:
        raw_lines = infile.readlines()
    return strip_bom(raw_lines)
-
def load_new():
    ''' Return the URLs listed in new.txt, normalized for comparison.

    Each line is reduced to its host name (scheme, path and query are
    dropped), a leading 'www.' is stripped, and a trailing '/' plus
    newline is appended so entries match the existing.txt line format.
    '''
    with open('new.txt', 'r') as f:
        lines = strip_bom(f.readlines())
    # Strip '\r' as well as '\n' so files with Windows (CRLF) line
    # endings do not leave a stray carriage return inside the netloc.
    lines = [line.rstrip('\r\n') for line in lines]
    # NOTE(review): urlparse yields an empty netloc for lines without a
    # scheme (e.g. 'example.com/path') -- assumes every input line
    # includes a scheme; confirm against the data source.
    urls = [urlparse(line).netloc+'/\n' for line in lines]
    urls = [url if not url.startswith('www.') else url[4:] for url in urls]
    return urls
-
def log_new_links(links):
    ''' Persist the newly discovered links to a date-stamped file,
    overwriting any previous output for the same date. '''
    stamp = time.strftime("%d_%m_%Y")
    filename = "output_" + stamp + ".txt"
    with open(filename, "w") as outfile:
        for link in links:
            outfile.write(link)
-
def append_to_existing(lines):
    ''' Add the newly found links to existing.txt so they are part of
    the known database on the next run. '''
    with open('existing.txt', 'a') as outfile:
        for line in lines:
            outfile.write(line)
-
def main():
    ''' Diff the new links against the existing database, log the
    unique ones to the date-stamped output file, and append them to
    existing.txt for the next run. '''
    new, existing = load_new(), load_existing()
    # Sort so the output file and existing.txt are deterministic across
    # runs -- set iteration order is arbitrary, which previously made
    # the line order vary from run to run.
    links = sorted(set(new).difference(existing))

    log_new_links(links)
    append_to_existing(links)
-
if __name__ == '__main__':
    # Guard so importing this module does not trigger the file
    # processing as a side effect.
    main()
    # Parenthesized form works in both Python 2 and Python 3 for a
    # single argument.
    print('done')
-