"""Scrape sheriff-sale listings, download the notice-of-sale PDFs, pull the
property address out of each PDF and (optionally) geocode it to GeoJSON.

This is Python 2 code: it relies on ``urllib2``, ``cStringIO`` and
``urllib.urlencode``/``urllib.urlopen``.
"""
import copy
import json
import os
import tempfile
import urllib
import urllib2
from cStringIO import StringIO

import requests
from bs4 import BeautifulSoup
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pypdfocr.pypdfocr import PyPDFOCR

# File that accumulates every property seen so far (spelling kept as-is: it is
# the on-disk name existing data already uses).
_PARSED_FILE_NAME = 'sherrifSalePropertiesParsed.json'

# Phrase that must be present in the extracted PDF text before an address can
# be located.
_ADDRESS_MARKER = 'The property is commonly known as'

# Byte and unicode string types on Python 2 (the same type twice on Python 3).
_STRING_TYPES = (type(''), type(u''))


def _pdf_key(item):
    """Return the record's 'pdf' link if it is usable as an identity key.

    Returns None for anything that is not a dict with a string 'pdf' value, so
    callers can fall back to plain equality for such records.
    """
    if isinstance(item, dict):
        link = item.get('pdf')
        if isinstance(link, _STRING_TYPES):
            return link
    return None


def _ensure_dir(path):
    """Create *path* (and parents) unless it already is a directory."""
    if not os.path.isdir(path):
        os.makedirs(path)
    return path


def get_sheriff_sale_data(url):
    """Fetch *url* and return one dict per 5-cell row of the sales table.

    Each dict has the keys 'date', 'property_type', 'city', 'notice_of_sale',
    'pdf' (href of the link in the 4th cell) and 'supporting_docs' (hrefs of
    all links in the 5th cell).

    Raises IndexError if the page contains no <th> element.
    """
    page = requests.get(url)
    # No parser is named here; behaviour is left exactly as before.
    soup = BeautifulSoup(page.text)

    # NOTE(review): the arguments are positional, so 'Sale date' is presumably
    # bound to find_all's `recursive` parameter rather than acting as a text
    # filter -- i.e. this takes the first <th> on the page. Confirm against the
    # bs4 documentation before "fixing" it.
    header_cell = soup.find_all('th', '', 'Sale date')[0]
    rows = header_cell.parent.parent.find_all('tr')[1:]  # drop the header row

    properties = []
    for row in rows:
        cells = row.find_all('td')
        if len(cells) != 5:
            continue
        notice_link = cells[3].find('a')
        if notice_link is None:
            # Without a PDF link the rest of the pipeline has nothing to
            # download, so skip the row instead of crashing on None.get().
            continue
        properties.append({
            'date': cells[0].text,
            'property_type': cells[1].text,
            'city': cells[2].text,
            'notice_of_sale': cells[3].text,
            'pdf': notice_link.get('href'),
            'supporting_docs': [a.get('href') for a in cells[4].find_all('a')],
        })
    return properties


def get_json_data(fileName):
    """Return the JSON content of *fileName*, or [] if it is missing/unreadable.

    A file that exists but cannot be read or decoded is reported on stdout and
    treated as empty.
    """
    if not os.path.isfile(fileName):
        return []
    with open(fileName, 'r') as json_file:
        try:
            return json.load(json_file)
        except (ValueError, IOError) as e:
            print("could not load %s, treating it as empty: %s"
                  % (fileName, repr(e)))
            return []


def remove_filedata_from_properties(properties):
    """Drop, in place, every property that is already in the parsed-data file.

    This prevents re-downloading PDFs. A property counts as already known when
    a stored record has the same 'pdf' link (stored records may have gained an
    'address' key since they were scraped, so whole-dict equality alone would
    miss them) or, failing that, when it equals a stored record.
    """
    file_data = get_json_data(_PARSED_FILE_NAME)
    known_links = set(_pdf_key(item) for item in file_data)
    known_links.discard(None)
    properties[:] = [
        item for item in properties
        if _pdf_key(item) not in known_links and item not in file_data
    ]


def update_json_with_downloaded_pdfs(properties):
    """Merge *properties* into the parsed-data file and rewrite it.

    A property whose 'pdf' link matches a stored record is merged into that
    record (new keys/values win), so adding an 'address' updates the record
    instead of appending a near-duplicate. Properties without a usable 'pdf'
    link are appended unless an equal record is already stored.
    """
    file_data = get_json_data(_PARSED_FILE_NAME)

    index_by_link = {}
    for position, stored in enumerate(file_data):
        link = _pdf_key(stored)
        if link is not None and link not in index_by_link:
            index_by_link[link] = position

    for item in properties:
        link = _pdf_key(item)
        if link is not None and link in index_by_link:
            position = index_by_link[link]
            merged = dict(file_data[position])
            merged.update(item)
            file_data[position] = merged
        elif item not in file_data:
            file_data.append(item)
            if link is not None:
                index_by_link[link] = len(file_data) - 1

    with open(_PARSED_FILE_NAME, 'w+') as json_file:
        json_file.write(json.dumps(file_data))


def save_json(properties):
    """Write *properties* as JSON to 'latestProperties.json'."""
    with open('latestProperties.json', 'w+') as json_file:
        json_file.write(json.dumps(properties))


def make_pdf_dir():
    """Return the absolute path of a 'pdfs' directory, creating it if needed.

    Tries ./pdfs first; if that cannot be created, falls back to 'pdfs' inside
    the system temp folder. An already-existing directory is not an error.

    Raises OSError if the fallback directory cannot be created either.
    """
    try:
        return _ensure_dir(os.path.abspath('pdfs'))
    except OSError:
        print("error with making pdfs dir in this folder, trying tempfolder instead: %s" % tempfile.gettempdir())
        return _ensure_dir(
            os.path.abspath(os.path.join(tempfile.gettempdir(), 'pdfs')))


def download_property_pdfs(properties, siteUrl, dir):
    """Download each property's PDF (siteUrl + property['pdf']) into *dir*.

    The local file name is the last '/'-separated segment of the link. Progress
    is printed after every file. Download or write errors propagate.
    """
    total = len(properties)
    for i, item in enumerate(properties):
        link = item['pdf']
        name = link.split('/')[-1]
        response = urllib2.urlopen(urllib2.Request(siteUrl + link))
        try:
            payload = response.read()
        finally:
            response.close()
        with open(os.path.join(dir, name), 'wb+') as pdf_file:
            pdf_file.write(payload)
        print("PERCENT DONE DOWNLOADING PDFs: %s%%" % str(float(i+1)/total*100))


def convert_pdf(path):
    """Return the text pdfminer extracts from the PDF at *path*.

    Page selection is passed to PDFPage.get_pages as range(10) with maxpages=5.
    """
    rsrcmgr = PDFResourceManager()
    output = StringIO()
    try:
        device = TextConverter(rsrcmgr, output, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, range(10), maxpages=5,
                                              password='', caching=True,
                                              check_extractable=False):
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed, as before.
        return output.getvalue()
    finally:
        output.close()


def _address_from_text(text):
    """Build 'line, line' from the 3rd and 5th lines after 'commonly known'.

    Only the 150 characters starting at the phrase are considered. Raises
    ValueError if the phrase is absent and IndexError if there are too few
    lines in that window.
    """
    start = text.index('commonly known')
    window = text[start:start + 150].split('\n')
    return window[2].strip() + ', ' + window[4].strip()


def get_all_PDF_addresses(properties, dir):
    """Add an 'address' key to every property that does not have one yet.

    For each such property the PDF named after its 'pdf' link is read from
    *dir*. If pdfminer's text lacks the marker phrase, the PDF is run through
    OCR and the resulting '<name>_ocr.pdf' is read instead. Failures are
    printed and the property is left without an address; processing continues
    with the next one.
    """
    total = len(properties)
    for i, item in enumerate(properties):
        if 'address' in item:
            continue
        try:
            name = item['pdf'].split('/')[-1]
            print("%s %s" % (dir, name))
            path = os.path.abspath(os.path.join(dir, name))

            # Use PDFminer first; only fall back to OCR when the marker phrase
            # is missing from the extracted text.
            text = convert_pdf(path)
            if _ADDRESS_MARKER not in text:
                do_PDF_OCR(path)
                # name[:-4] strips the 4-character extension (e.g. '.pdf').
                ocr_name = name[:-4] + '_ocr.pdf'
                text = convert_pdf(os.path.abspath(os.path.join(dir, ocr_name)))

            if _ADDRESS_MARKER in text:
                item['address'] = _address_from_text(text)
            else:
                print('OCR failed')
        except Exception as e:
            # Deliberate best-effort: one unreadable PDF must not stop the run.
            print("\n Error during parsing %s" % repr(e))
        print("\nPERCENT DONE: %s%%" % str(float(i+1)/total*100))


def do_PDF_OCR(path):
    """Run PyPDFOCR on the PDF at *path* (prints the path first)."""
    script = PyPDFOCR()
    print(path)
    script.go([path])


# NOTE(review): plain-http endpoint without an API key; presumably the current
# Google Geocoding API expects https and a key -- confirm before relying on it.
googleGeocodeUrl = 'http://maps.googleapis.com/maps/api/geocode/json?'


def get_coordinates(query, from_sensor=False):
    """Geocode *query* and return (latitude, longitude), or (None, None).

    The first entry of the response's 'results' list is used. A response with
    no results (or no 'results' key) yields (None, None).
    """
    query = query.encode('utf-8')
    params = {
        'address': query,
        'sensor': "true" if from_sensor else "false"
    }
    url = googleGeocodeUrl + urllib.urlencode(params)
    json_response = urllib.urlopen(url)
    try:
        response = json.loads(json_response.read())
    finally:
        json_response.close()

    results = response.get('results')
    if results:
        location = results[0]['geometry']['location']
        latitude, longitude = location['lat'], location['lng']
        print("%s %s %s" % (query, latitude, longitude))
    else:
        latitude, longitude = None, None
        print("%s <no results>" % query)
    return latitude, longitude


def convertDictToGeoJSON(properties, fileName='geoJsonSheriffs.geojson'):
    """Geocode every property with an 'address' and write Features to a file.

    Properties without an address, whose address raises UnicodeDecodeError
    during geocoding, or that geocode to no result are counted as misses.
    Coordinates are stored as [longitude, latitude].

    NOTE(review): the file receives a bare JSON list of Feature objects, not a
    FeatureCollection; left unchanged in case consumers depend on it.
    """
    features = []
    hits = 0
    misses = 0
    for item in properties:
        if 'address' not in item:
            misses += 1
            continue
        try:
            # get_coordinates returns (lat, lng); reverse to [lng, lat].
            coords = list(get_coordinates(item['address']))[::-1]
        except UnicodeDecodeError:
            misses += 1
            continue
        if None in coords or 'null' in coords:
            misses += 1
            continue
        features.append({
            'type': 'Feature',
            "geometry": {
                "type": "Point",
                "coordinates": coords
            },
            "properties": {
                "name": item['address'] + ' --- \n' + item['notice_of_sale'] + ' --- \n' + item['date']
            },
        })
        hits += 1
    print("SUCCESS with errors possible: %d, FAIL: %d" % (hits, misses))
    with open(fileName, 'w+') as json_file:
        json_file.write(json.dumps(features))


def main():
    """Scrape the listing, download new PDFs, then parse addresses from them."""
    mainUrl = 'http://www.oregonsheriffs.com/'
    wacoUrl = mainUrl + 'sales-washington.htm'
    web_properties = get_sheriff_sale_data(wacoUrl)

    # Work on a copy so the scraped list itself stays intact.
    properties_needing_downloaded = copy.deepcopy(web_properties)
    remove_filedata_from_properties(properties_needing_downloaded)

    # PDFs live under the temp folder; make sure the directory exists before
    # anything is written into it.
    dirPath = _ensure_dir(
        os.path.abspath(os.path.join(tempfile.gettempdir(), 'pdfs')))
    download_property_pdfs(properties_needing_downloaded, mainUrl, dirPath)
    update_json_with_downloaded_pdfs(properties_needing_downloaded)

    file_properties = get_json_data(_PARSED_FILE_NAME)
    try:
        get_all_PDF_addresses(file_properties, dirPath)
    finally:
        # Persist whatever addresses were found even if parsing was cut short.
        update_json_with_downloaded_pdfs(file_properties)


if __name__ == '__main__':
    main()