-
- '''
- The following simply checks a file of links (new.txt) against an existing file of links (existing.txt)
- and outputs any unique links that are found (output.txt).
-
- The existing.txt file is then updated ready to be checked against next time.
-
- You should load your existing database links (basically your first batch of links) into the new.txt file first and not the existing.txt
- file as the links go through a 'cleaning process.' Then the existing.txt file will populate with the 'cleaned links'
- '''
- import codecs
- import time
- from urlparse import urlparse
-
def strip_bom(lines):
    ''' Return the given lines with any leading UTF-8 BOM removed. '''
    bom = codecs.BOM_UTF8
    cleaned = []
    for line in lines:
        if line.startswith(bom):
            # Drop the 3-byte BOM marker some editors prepend.
            cleaned.append(line[len(bom):])
        else:
            cleaned.append(line)
    return cleaned
-
def load_existing():
    ''' Return the list of URLs already recorded in existing.txt,
    with any leading UTF-8 BOM removed. '''
    with open('existing.txt') as infile:
        raw_lines = infile.readlines()
    return strip_bom(raw_lines)
-
def load_new():
    ''' Return the URLs listed in new.txt, normalized for comparison.

    Each line is reduced to its host name (scheme, path and query are
    dropped), a leading 'www.' is stripped, and a trailing '/' plus
    newline is appended so entries match the existing.txt line format.
    '''
    with open('new.txt', 'r') as f:
        lines = strip_bom(f.readlines())
    # Strip '\r' as well as '\n' so files with Windows (CRLF) line
    # endings do not leave a stray carriage return inside the netloc.
    lines = [line.rstrip('\r\n') for line in lines]
    # NOTE(review): urlparse yields an empty netloc for lines without a
    # scheme (e.g. 'example.com/path') -- assumes every input line
    # includes a scheme; confirm against the data source.
    urls = [urlparse(line).netloc+'/\n' for line in lines]
    urls = [url if not url.startswith('www.') else url[4:] for url in urls]
    return urls
-
def log_new_links(links):
    ''' Persist the newly discovered links to a date-stamped file,
    overwriting any previous output for the same date. '''
    stamp = time.strftime("%d_%m_%Y")
    filename = "output_" + stamp + ".txt"
    with open(filename, "w") as outfile:
        for link in links:
            outfile.write(link)
-
def append_to_existing(lines):
    ''' Add the newly found links to existing.txt so they are part of
    the known database on the next run. '''
    with open('existing.txt', 'a') as outfile:
        for line in lines:
            outfile.write(line)
-
def main():
    ''' Diff the new links against the existing database, log the
    unique ones to the date-stamped output file, and append them to
    existing.txt for the next run. '''
    new, existing = load_new(), load_existing()
    # Sort so the output file and existing.txt are deterministic across
    # runs -- set iteration order is arbitrary, which previously made
    # the line order vary from run to run.
    links = sorted(set(new).difference(existing))

    log_new_links(links)
    append_to_existing(links)
-
if __name__ == '__main__':
    # Guard so importing this module does not trigger the file
    # processing as a side effect.
    main()
    # Parenthesized form works in both Python 2 and Python 3 for a
    # single argument.
    print('done')
-