#!/usr/bin/env python2
"""Scrape Oregon sheriff sale listings, extract property addresses from the
notice-of-sale PDFs (OCRing scanned ones), and geocode them to GeoJSON."""
import copy
import json
import os
import sys
import tempfile
import urllib
import urllib2
from cStringIO import StringIO

import requests
from bs4 import BeautifulSoup
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pypdfocr.pypdfocr import PyPDFOCR

# On-disk record of properties already scraped and parsed.
parsedJsonFileName = 'sherrifSalePropertiesParsed.json'

def get_sheriff_sale_data(url):
    """Scrape the sheriff-sale listings table into a list of property dicts."""
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    properties = []
    # Find the listings table via its "Sale date" header cell, then walk up
    # to the table and collect its rows, dropping the header row.
    saleDateHeader = soup.find_all('th', text='Sale date')[0]
    propertyTableRows = saleDateHeader.parent.parent.find_all('tr')
    propertyTableRows = propertyTableRows[1:]
    for tr in propertyTableRows:
        tds = tr.find_all('td')
        if len(tds) == 5:
            prop = {}
            prop['date'] = tds[0].text
            prop['property_type'] = tds[1].text
            prop['city'] = tds[2].text
            prop['notice_of_sale'] = tds[3].text
            prop['pdf'] = tds[3].find('a').get('href')
            prop['supporting_docs'] = [a.get('href') for a in tds[4].find_all('a')]
            properties.append(prop)
    return properties
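
# A sketch of the dict shape returned above, assuming the live table keeps its
# five-column layout (the field values here are illustrative, not real data):
#
#   {'date': u'8/15/2014', 'property_type': u'Residential', 'city': u'Hillsboro',
#    'notice_of_sale': u'Notice of Sale ...', 'pdf': u'pdfs/8-15Herscovitz.pdf',
#    'supporting_docs': [u'pdfs/8-15Herscovitz-aff.pdf']}
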
def get_json_data(fileName):
    """Load a JSON list from fileName, returning [] if it is missing or unparsable."""
    fileData = []
    if os.path.isfile(fileName):
        with open(fileName, 'r') as jsonFile:
            try:
                fileData = json.load(jsonFile)
            except ValueError:
                pass  # empty or corrupt file; start fresh
    return fileData

def remove_filedata_from_properties(properties):
    """Remove properties already recorded on disk so their PDFs are not
    re-downloaded. Mutates the list in place."""
    fileData = get_json_data(parsedJsonFileName)
    for item in fileData:
        if item in properties:
            properties.remove(item)

def update_json_with_downloaded_pdfs(properties):
    """Merge the given properties into the on-disk JSON record, skipping duplicates."""
    fileData = get_json_data(parsedJsonFileName)
    for item in properties:
        if item not in fileData:
            fileData.append(item)
    with open(parsedJsonFileName, 'w') as jsonFile:
        json.dump(fileData, jsonFile)

def save_json(properties):
    with open('latestProperties.json', 'w') as jsonFile:
        json.dump(properties, jsonFile)

def make_pdf_dir():
    """Create a pdfs/ directory here, falling back to the system temp dir."""
    try:
        dirPath = os.path.abspath('pdfs')
        os.makedirs(dirPath)
    except OSError:
        print "error making pdfs dir in this folder, trying temp folder instead: %s" % tempfile.gettempdir()
        dirPath = os.path.abspath(os.path.join(tempfile.gettempdir(), 'pdfs'))
        os.makedirs(dirPath)
    return dirPath

def download_property_pdfs(properties, siteUrl, dirPath):
    """Download each property's notice-of-sale PDF into dirPath."""
    for i, prop in enumerate(properties):
        pdfUrl = siteUrl + prop['pdf']
        name = prop['pdf'].split('/')[-1]
        res = urllib2.urlopen(urllib2.Request(pdfUrl))
        with open(os.path.join(dirPath, name), 'wb') as pdf:
            pdf.write(res.read())
        print "PERCENT DONE DOWNLOADING PDFs: %.1f%%" % (float(i + 1) / len(properties) * 100)

def convert_pdf(path):
    """Extract text from the first few pages of a PDF using pdfminer."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, range(10), maxpages=5,
                                      password='', caching=True,
                                      check_extractable=False):
            interpreter.process_page(page)
    device.close()
    text = retstr.getvalue()
    retstr.close()
    return text
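
# Example usage (the path is hypothetical; any downloaded notice PDF works):
#
#   samplePath = os.path.join(tempfile.gettempdir(), 'pdfs', '8-15Herscovitz.pdf')
#   print convert_pdf(samplePath)
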
def get_all_PDF_addresses(properties, dirPath):
    """Pull the street address out of each property's PDF, OCRing scanned
    PDFs that have no text layer."""
    for i, prop in enumerate(properties):
        if 'address' in prop:
            continue  # already parsed on a previous run
        try:
            name = prop['pdf'].split('/')[-1]
            nameOCR = name[:-4] + '_ocr.pdf'
            path = os.path.abspath(os.path.join(dirPath, name))
            # First try pdfminer on the PDF's own text layer.
            text = convert_pdf(path)
            # If the marker phrase is missing, the PDF is likely a scan:
            # OCR it and re-extract from the OCRed copy.
            if 'The property is commonly known as' not in text:
                do_PDF_OCR(path)
                path = os.path.abspath(os.path.join(dirPath, nameOCR))
                text = convert_pdf(path)
                if 'The property is commonly known as' not in text:
                    print 'OCR failed'
                    sys.exit(0)
            # The address sits just after the marker phrase, spread over a
            # couple of lines; grab the street and city/state/zip lines.
            idx = text.index('commonly known')
            tsplit = text[idx:idx + 150].split('\n')
            addr1 = tsplit[2].strip()
            addr2 = tsplit[4].strip()
            prop['address'] = addr1 + ', ' + addr2
        except Exception as e:
            print "\nError during parsing: %s" % repr(e)
        print "\nPERCENT DONE: %.1f%%" % (float(i + 1) / len(properties) * 100)
def do_PDF_OCR(path):
    """Run pypdfocr over the PDF at path."""
    script = PyPDFOCR()
    print path
    script.go([path])
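
# pypdfocr writes its searchable output next to the input as <name>_ocr.pdf,
# which is why get_all_PDF_addresses retries with nameOCR after calling this,
# e.g. (hypothetical): do_PDF_OCR('/tmp/pdfs/notice.pdf') -> /tmp/pdfs/notice_ocr.pdf
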
googleGeocodeUrl = 'http://maps.googleapis.com/maps/api/geocode/json?'

def get_coordinates(query, from_sensor=False):
    """Geocode a free-form address string via the Google Geocoding API."""
    query = query.encode('utf-8')
    params = {
        'address': query,
        'sensor': 'true' if from_sensor else 'false',
    }
    url = googleGeocodeUrl + urllib.urlencode(params)
    response = json.load(urllib.urlopen(url))
    if response['results']:
        location = response['results'][0]['geometry']['location']
        latitude, longitude = location['lat'], location['lng']
        print query, latitude, longitude
    else:
        latitude, longitude = None, None
        print query, "<no results>"
    return latitude, longitude
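
# Example call (the address and returned values are illustrative only):
#
#   lat, lng = get_coordinates('123 Main St, Hillsboro, OR 97123')
#   # prints: 123 Main St, Hillsboro, OR 97123 45.52 -122.98
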
def convertDictToGeoJSON(properties, fileName='geoJsonSheriffs.geojson'):
    """Geocode each property's address and write the results to a GeoJSON file."""
    geoList = []
    hits = 0
    misses = 0
    for prop in properties:
        if 'address' not in prop:
            misses += 1
            continue
        try:
            # Reverse (lat, lng) because GeoJSON wants [longitude, latitude].
            coords = list(get_coordinates(prop['address']))[::-1]
        except UnicodeDecodeError:
            misses += 1
            continue
        if None in coords:
            misses += 1
            continue
        geoDict = {
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                'coordinates': coords,
            },
            'properties': {
                'name': prop['address'] + ' --- \n' + prop['notice_of_sale'] + ' --- \n' + prop['date'],
            },
        }
        geoList.append(geoDict)
        hits += 1
    print "SUCCESS (geocoding errors still possible): %d, FAIL: %d" % (hits, misses)
    with open(fileName, 'w') as jsonFile:
        # Wrap the features in a FeatureCollection so the output is valid GeoJSON.
        json.dump({'type': 'FeatureCollection', 'features': geoList}, jsonFile)
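
# A sketch of one feature as written above; the coordinates are invented and
# shown in the [longitude, latitude] order GeoJSON requires:
#
#   {"type": "Feature",
#    "geometry": {"type": "Point", "coordinates": [-122.98, 45.52]},
#    "properties": {"name": "123 Main St, Hillsboro, OR 97123 --- \n..."}}
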
if __name__ == '__main__':
    mainUrl = 'http://www.oregonsheriffs.com/'
    wacoUrl = mainUrl + 'sales-washington.htm'
    web_properties = get_sheriff_sale_data(wacoUrl)
    # Work on a copy so only not-yet-recorded properties get downloaded.
    properties_needing_download = copy.deepcopy(web_properties)
    remove_filedata_from_properties(properties_needing_download)
    # Reuse the pdfs dir in the temp folder; make_pdf_dir() creates it fresh.
    dirPath = os.path.abspath(os.path.join(tempfile.gettempdir(), 'pdfs'))
    download_property_pdfs(properties_needing_download, mainUrl, dirPath)
    update_json_with_downloaded_pdfs(properties_needing_download)
    file_properties = get_json_data(parsedJsonFileName)
    try:
        get_all_PDF_addresses(file_properties, dirPath)
    finally:
        # Persist whatever addresses were parsed, even if extraction bailed early.
        update_json_with_downloaded_pdfs(file_properties)