import copy
import json
import os
import sys
import tempfile
import urllib
import urllib2

import requests
from bs4 import BeautifulSoup

from cStringIO import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

from pypdfocr.pypdfocr import PyPDFOCR

def get_sheriff_sale_data(url):
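	"""Scrape the sheriff-sale listing table at `url` into a list of property dicts
	(date, property_type, city, notice_of_sale, pdf link, supporting_docs links)."""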
	r = requests.get(url)
	soup = BeautifulSoup(r.text, 'html.parser')

	properties = []
	# locate the results table via its 'Sale date' header cell, then walk up to the table rows
	propertyTableRows = soup.find_all('th', text='Sale date')[0].parent.parent.find_all('tr')
	propertyTableRows = propertyTableRows[1:]  # drop the table header row

	for tr in propertyTableRows:
		tdSibs = tr.find_all('td')
		if len(tdSibs) == 5:
			prop = {}
			prop['date'] = tdSibs[0].text
			prop['property_type'] = tdSibs[1].text
			prop['city'] = tdSibs[2].text
			prop['notice_of_sale'] = tdSibs[3].text
			prop['pdf'] = tdSibs[3].find('a').get('href')
			prop['supporting_docs'] = [a.get('href') for a in tdSibs[4].find_all('a')]
			properties.append(prop)
	return properties
	
def remove_filedata_from_properties(properties):
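	"""Remove from `properties` (in place) any entry already saved in the parsed-JSON file."""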
	fileName = 'sherrifSalePropertiesParsed.json'
	fileData = get_json_data(fileName)
	for item in fileData:
		# to avoid re-downloading PDFs, drop properties already recorded in the file
		if item in properties:
			properties.remove(item)

def get_json_data(fileName):
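	"""Load and return the JSON list stored in `fileName`, or [] if the file is missing or unreadable."""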
	fileData=[]
	#open the existing file if it exists
	if os.path.isfile(fileName):
		with open(fileName, 'r') as jsonFile:
			try:
				fileData = json.load(jsonFile)
			except ValueError:
				# corrupt or empty JSON; fall back to an empty list
				pass
	return fileData

def update_json_with_downloaded_pdfs(properties):
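	"""Append any not-yet-recorded properties to the parsed-JSON file on disk."""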
	fileName ='sherrifSalePropertiesParsed.json'
	fileData = get_json_data(fileName)
	for item in properties:
		# merge: only append entries that are not already recorded in the file
		if item not in fileData:
			fileData.append(item)
	with open(fileName, 'w+') as jsonFile:
		jsonFile.write(json.dumps(fileData))

def save_json(properties):
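	"""Write the given properties list to latestProperties.json."""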
	fileName ='latestProperties.json'
	with open(fileName, 'w+') as jsonFile:
		jsonFile.write(json.dumps(properties))
			

def make_pdf_dir():
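	"""Create a 'pdfs' directory in the working folder, falling back to the system temp dir."""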
	try:
		dirPath = os.path.abspath('pdfs')
		os.makedirs(dirPath)
	except OSError:
		print "error making pdfs dir in this folder, trying temp folder instead: %s" % tempfile.gettempdir()
		dirPath = os.path.abspath(os.path.join(tempfile.gettempdir(), 'pdfs'))
		os.makedirs(dirPath)
	return dirPath
	
def download_property_pdfs(properties, siteUrl, dir):	
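	"""Download each property's notice-of-sale PDF from siteUrl into the given directory."""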
	for i, prop in enumerate(properties):
		pdfUrl = siteUrl + prop['pdf']
		name = prop['pdf'].split('/')[-1]
		res = urllib2.urlopen(pdfUrl)
		with open(os.path.join(dir, name), 'wb+') as pdf:
			pdf.write(res.read())
		print "PERCENT DONE DOWNLOADING PDFs: %s%%" % str(float(i + 1) / len(properties) * 100)


def convert_pdf(path):
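	"""Extract text from the first pages of a PDF with pdfminer and return it as a utf-8 string."""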
	rsrcmgr = PDFResourceManager()
	retstr = StringIO()
	laparams = LAParams()
	device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	fp = open(path, 'rb')
	# only the first few pages are needed to find the address block
	for page in PDFPage.get_pages(fp, range(10),
								  maxpages=5, password='',
								  caching=True, check_extractable=False):
		interpreter.process_page(page)
	fp.close()
	device.close()
	text = retstr.getvalue()
	retstr.close()
	return text

def get_all_PDF_addresses(properties, dir):
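	"""Extract each property's street address from its downloaded PDF (running OCR on
	scanned PDFs) and store it under the 'address' key."""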
	for i, prop in enumerate(properties):
		if 'address' in prop:
			# already parsed on a previous run
			continue
		try:
			pdfUrl = prop['pdf']
			name = pdfUrl.split('/')[-1]
			nameNoExt = name[:-4]
			nameOCR = nameNoExt + '_ocr.pdf'
			print dir, name
			path = os.path.abspath(os.path.join(dir, name))
			# use pdfminer to extract the text
			text = convert_pdf(path)
			# if pdfminer didn't find the marker string, the PDF is likely scanned; try OCR
			if 'The property is commonly known as' not in text:
				do_PDF_OCR(path)
				path = os.path.abspath(os.path.join(dir, nameOCR))
				text = convert_pdf(path)
				if 'The property is commonly known as' not in text:
					print 'OCR failed'
					sys.exit(0)
			# the address follows the marker, split across two lines of the extracted text
			idx = text.index('commonly known')
			tsplit = text[idx:idx + 150].split('\n')
			addr1 = tsplit[2].strip()
			addr2 = tsplit[4].strip()
			properties[i]['address'] = addr1 + ', ' + addr2
		except Exception as e:
			print "\n Error during parsing %s" % repr(e)
		print "\nPERCENT DONE: %s%%" % str(float(i+1)/len(properties)*100)

def do_PDF_OCR(path):
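	"""Run pypdfocr on the PDF at `path`, producing a *_ocr.pdf next to it."""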
	script = PyPDFOCR()
	print path
	script.go([path])

	

googleGeocodeUrl = 'http://maps.googleapis.com/maps/api/geocode/json?'

def get_coordinates(query, from_sensor=False):
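	"""Geocode a free-text address with the Google Geocoding API; returns
	(latitude, longitude), or (None, None) if there are no results."""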
	query = query.encode('utf-8')
	params = {
		'address': query,
		'sensor': "true" if from_sensor else "false"
	}
	url = googleGeocodeUrl + urllib.urlencode(params)
	json_response = urllib.urlopen(url)
	response = json.loads(json_response.read())
	if response['results']:
		location = response['results'][0]['geometry']['location']
		latitude, longitude = location['lat'], location['lng']
		print query, latitude, longitude
	else:
		latitude, longitude = None, None
		print query, "<no results>"
	return latitude, longitude

	
	
def convertDictToGeoJSON(properties, fileName='geoJsonSheriffs.geojson'):
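	"""Geocode each property's address and write a list of GeoJSON Point features to `fileName`."""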
	geoList = []
	hits = 0
	misses = 0
	for prop in properties:
		if 'address' not in prop:
			misses += 1
			continue
		try:
			# GeoJSON expects [longitude, latitude], so reverse the (lat, lng) pair
			coords = list(get_coordinates(prop['address']))[::-1]
		except UnicodeDecodeError:
			misses += 1
			continue
		if None in coords:
			misses += 1
			continue
		geoDict = {}
		geoDict['type'] = 'Feature'
		geoDict["geometry"] = {
			"type": "Point",
			"coordinates": coords
		}
		geoDict["properties"] = {
			"name": prop['address'] + ' --- \n' + prop['notice_of_sale'] + ' --- \n' + prop['date']
		}
		geoList.append(geoDict)
		hits += 1
	print "geocoded: %d, skipped: %d" % (hits, misses)
	with open(fileName, 'w+') as jsonFile:
		jsonFile.write(json.dumps(geoList))
	

if __name__ == '__main__':
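	# pipeline: scrape the listing page, download any new PDFs, record them,
	# then parse addresses out of every recorded PDF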
	mainUrl = 'http://www.oregonsheriffs.com/'
	wacoUrl = mainUrl + 'sales-washington.htm'
	web_properties = get_sheriff_sale_data(wacoUrl)
	#
	properties_needing_downloaded = copy.deepcopy(web_properties)
	remove_filedata_from_properties(properties_needing_downloaded)
	
	#dirPath = make_pdf_dir()
	dirPath = os.path.abspath(os.path.join(tempfile.gettempdir(), 'pdfs'))
	download_property_pdfs(properties_needing_downloaded, mainUrl, dirPath)
	update_json_with_downloaded_pdfs(properties_needing_downloaded)
	file_properties = get_json_data('sherrifSalePropertiesParsed.json')
	try:
		get_all_PDF_addresses(file_properties, dirPath)
	finally:
		update_json_with_downloaded_pdfs(file_properties)