#!/usr/bin/env python2
"""Scrape Oregon sheriff sale listings, extract property addresses from the
notice-of-sale PDFs (OCRing scanned ones), and geocode them to GeoJSON."""
import copy
import json
import os
import sys
import tempfile
import urllib
import urllib2
from cStringIO import StringIO

import requests
from bs4 import BeautifulSoup
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pypdfocr.pypdfocr import PyPDFOCR

# On-disk record of properties already scraped and parsed.
parsedJsonFileName = 'sherrifSalePropertiesParsed.json'

def get_sheriff_sale_data(url):
    """Scrape the sheriff-sale listings table into a list of property dicts."""
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    properties = []
    # Find the listings table via its "Sale date" header cell, then walk up
    # to the table and collect its rows, dropping the header row.
    saleDateHeader = soup.find_all('th', text='Sale date')[0]
    propertyTableRows = saleDateHeader.parent.parent.find_all('tr')
    propertyTableRows = propertyTableRows[1:]
    for tr in propertyTableRows:
        tds = tr.find_all('td')
        if len(tds) == 5:
            prop = {}
            prop['date'] = tds[0].text
            prop['property_type'] = tds[1].text
            prop['city'] = tds[2].text
            prop['notice_of_sale'] = tds[3].text
            prop['pdf'] = tds[3].find('a').get('href')
            prop['supporting_docs'] = [a.get('href') for a in tds[4].find_all('a')]
            properties.append(prop)
    return properties
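
# A sketch of the dict shape returned above, assuming the live table keeps its
# five-column layout (the field values here are illustrative, not real data):
#
#   {'date': u'8/15/2014', 'property_type': u'Residential', 'city': u'Hillsboro',
#    'notice_of_sale': u'Notice of Sale ...', 'pdf': u'pdfs/8-15Herscovitz.pdf',
#    'supporting_docs': [u'pdfs/8-15Herscovitz-aff.pdf']}
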
def get_json_data(fileName):
    """Load a JSON list from fileName, returning [] if it is missing or unparsable."""
    fileData = []
    if os.path.isfile(fileName):
        with open(fileName, 'r') as jsonFile:
            try:
                fileData = json.load(jsonFile)
            except ValueError:
                pass  # empty or corrupt file; start fresh
    return fileData

def remove_filedata_from_properties(properties):
    """Remove properties already recorded on disk so their PDFs are not
    re-downloaded. Mutates the list in place."""
    fileData = get_json_data(parsedJsonFileName)
    for item in fileData:
        if item in properties:
            properties.remove(item)

def update_json_with_downloaded_pdfs(properties):
    """Merge the given properties into the on-disk JSON record, skipping duplicates."""
    fileData = get_json_data(parsedJsonFileName)
    for item in properties:
        if item not in fileData:
            fileData.append(item)
    with open(parsedJsonFileName, 'w') as jsonFile:
        json.dump(fileData, jsonFile)

def save_json(properties):
    with open('latestProperties.json', 'w') as jsonFile:
        json.dump(properties, jsonFile)

def make_pdf_dir():
    """Create a pdfs/ directory here, falling back to the system temp dir."""
    try:
        dirPath = os.path.abspath('pdfs')
        os.makedirs(dirPath)
    except OSError:
        print "error making pdfs dir in this folder, trying temp folder instead: %s" % tempfile.gettempdir()
        dirPath = os.path.abspath(os.path.join(tempfile.gettempdir(), 'pdfs'))
        os.makedirs(dirPath)
    return dirPath

def download_property_pdfs(properties, siteUrl, dirPath):
    """Download each property's notice-of-sale PDF into dirPath."""
    for i, prop in enumerate(properties):
        pdfUrl = siteUrl + prop['pdf']
        name = prop['pdf'].split('/')[-1]
        res = urllib2.urlopen(urllib2.Request(pdfUrl))
        with open(os.path.join(dirPath, name), 'wb') as pdf:
            pdf.write(res.read())
        print "PERCENT DONE DOWNLOADING PDFs: %.1f%%" % (float(i + 1) / len(properties) * 100)

def convert_pdf(path):
    """Extract text from the first few pages of a PDF using pdfminer."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, range(10), maxpages=5,
                                      password='', caching=True,
                                      check_extractable=False):
            interpreter.process_page(page)
    device.close()
    text = retstr.getvalue()
    retstr.close()
    return text
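
# Example usage (the path is hypothetical; any downloaded notice PDF works):
#
#   samplePath = os.path.join(tempfile.gettempdir(), 'pdfs', '8-15Herscovitz.pdf')
#   print convert_pdf(samplePath)
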
def get_all_PDF_addresses(properties, dirPath):
    """Pull the street address out of each property's PDF, OCRing scanned
    PDFs that have no text layer."""
    for i, prop in enumerate(properties):
        if 'address' in prop:
            continue  # already parsed on a previous run
        try:
            name = prop['pdf'].split('/')[-1]
            nameOCR = name[:-4] + '_ocr.pdf'
            path = os.path.abspath(os.path.join(dirPath, name))
            # First try pdfminer on the PDF's own text layer.
            text = convert_pdf(path)
            # If the marker phrase is missing, the PDF is likely a scan:
            # OCR it and re-extract from the OCRed copy.
            if 'The property is commonly known as' not in text:
                do_PDF_OCR(path)
                path = os.path.abspath(os.path.join(dirPath, nameOCR))
                text = convert_pdf(path)
                if 'The property is commonly known as' not in text:
                    print 'OCR failed'
                    sys.exit(0)
            # The address sits just after the marker phrase, spread over a
            # couple of lines; grab the street and city/state/zip lines.
            idx = text.index('commonly known')
            tsplit = text[idx:idx + 150].split('\n')
            addr1 = tsplit[2].strip()
            addr2 = tsplit[4].strip()
            prop['address'] = addr1 + ', ' + addr2
        except Exception as e:
            print "\nError during parsing: %s" % repr(e)
        print "\nPERCENT DONE: %.1f%%" % (float(i + 1) / len(properties) * 100)
def do_PDF_OCR(path):
    """Run pypdfocr over the PDF at path."""
    script = PyPDFOCR()
    print path
    script.go([path])
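
# pypdfocr writes its searchable output next to the input as <name>_ocr.pdf,
# which is why get_all_PDF_addresses retries with nameOCR after calling this,
# e.g. (hypothetical): do_PDF_OCR('/tmp/pdfs/notice.pdf') -> /tmp/pdfs/notice_ocr.pdf
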
googleGeocodeUrl = 'http://maps.googleapis.com/maps/api/geocode/json?'

def get_coordinates(query, from_sensor=False):
    """Geocode a free-form address string via the Google Geocoding API."""
    query = query.encode('utf-8')
    params = {
        'address': query,
        'sensor': 'true' if from_sensor else 'false',
    }
    url = googleGeocodeUrl + urllib.urlencode(params)
    response = json.load(urllib.urlopen(url))
    if response['results']:
        location = response['results'][0]['geometry']['location']
        latitude, longitude = location['lat'], location['lng']
        print query, latitude, longitude
    else:
        latitude, longitude = None, None
        print query, "<no results>"
    return latitude, longitude
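
# Example call (the address and returned values are illustrative only):
#
#   lat, lng = get_coordinates('123 Main St, Hillsboro, OR 97123')
#   # prints: 123 Main St, Hillsboro, OR 97123 45.52 -122.98
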
def convertDictToGeoJSON(properties, fileName='geoJsonSheriffs.geojson'):
    """Geocode each property's address and write the results to a GeoJSON file."""
    geoList = []
    hits = 0
    misses = 0
    for prop in properties:
        if 'address' not in prop:
            misses += 1
            continue
        try:
            # Reverse (lat, lng) because GeoJSON wants [longitude, latitude].
            coords = list(get_coordinates(prop['address']))[::-1]
        except UnicodeDecodeError:
            misses += 1
            continue
        if None in coords:
            misses += 1
            continue
        geoDict = {
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                'coordinates': coords,
            },
            'properties': {
                'name': prop['address'] + ' --- \n' + prop['notice_of_sale'] + ' --- \n' + prop['date'],
            },
        }
        geoList.append(geoDict)
        hits += 1
    print "SUCCESS (geocoding errors still possible): %d, FAIL: %d" % (hits, misses)
    with open(fileName, 'w') as jsonFile:
        # Wrap the features in a FeatureCollection so the output is valid GeoJSON.
        json.dump({'type': 'FeatureCollection', 'features': geoList}, jsonFile)
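
# A sketch of one feature as written above; the coordinates are invented and
# shown in the [longitude, latitude] order GeoJSON requires:
#
#   {"type": "Feature",
#    "geometry": {"type": "Point", "coordinates": [-122.98, 45.52]},
#    "properties": {"name": "123 Main St, Hillsboro, OR 97123 --- \n..."}}
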
if __name__ == '__main__':
    mainUrl = 'http://www.oregonsheriffs.com/'
    wacoUrl = mainUrl + 'sales-washington.htm'
    web_properties = get_sheriff_sale_data(wacoUrl)
    # Work on a copy so only not-yet-recorded properties get downloaded.
    properties_needing_download = copy.deepcopy(web_properties)
    remove_filedata_from_properties(properties_needing_download)
    # Reuse the pdfs dir in the temp folder; make_pdf_dir() creates it fresh.
    dirPath = os.path.abspath(os.path.join(tempfile.gettempdir(), 'pdfs'))
    download_property_pdfs(properties_needing_download, mainUrl, dirPath)
    update_json_with_downloaded_pdfs(properties_needing_download)
    file_properties = get_json_data(parsedJsonFileName)
    try:
        get_all_PDF_addresses(file_properties, dirPath)
    finally:
        # Persist whatever addresses were parsed, even if extraction bailed early.
        update_json_with_downloaded_pdfs(file_properties)