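"""Scrape the Oregon sheriff-sale listings, download each notice-of-sale PDF,
extract the property address from the PDF text (falling back to OCR via
pypdfocr), geocode the addresses, and write the results out as GeoJSON."""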

from bs4 import BeautifulSoup
import requests
import urllib2
#url = raw_input("Enter a website to extract the URL's from: ")
import tempfile
import os
import sys
import json
import copy
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
from pypdfocr.pypdfocr import PyPDFOCR
import urllib
def get_sheriff_sale_data(url):
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data)
    properties = []
    # find the 'Sale date' header cell, then walk up to its table and grab every row
    propertyTableRows = soup.find_all('th', text='Sale date')[0].parent.parent.find_all('tr')
    propertyTableRows = propertyTableRows[1:]  # drop the table header row
    for tr in propertyTableRows:
        tdSibs = tr.find_all('td')
        if len(tdSibs) == 5:
            prop = {}
            prop['date'] = tdSibs[0].text
            prop['property_type'] = tdSibs[1].text
            prop['city'] = tdSibs[2].text
            prop['notice_of_sale'] = tdSibs[3].text
            prop['pdf'] = tdSibs[3].find('a').get('href')
            prop['supporting_docs'] = [a.get('href') for a in tdSibs[4].find_all('a')]
            properties.append(prop)
    return properties
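# Shape of each returned dict (field values here are hypothetical; the 'pdf'
# href is relative to the site root):
#   {'date': u'...', 'property_type': u'...', 'city': u'...',
#    'notice_of_sale': u'...', 'pdf': u'pdfs/8-15Herscovitz.pdf',
#    'supporting_docs': [u'...']}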
def remove_filedata_from_properties(properties):
    fileData = []
    fileName = 'sherrifSalePropertiesParsed.json'
    # open the existing file if it exists
    if os.path.isfile(fileName):
        with open(fileName, 'r') as jsonFile:
            try:
                fileData = json.load(jsonFile)
            except ValueError:  # file exists but isn't valid JSON
                pass
    for item in fileData:
        # to prevent re-downloading PDFs, remove any current properties that were already in the file
        if item in properties:
            properties.remove(item)
def get_json_data(fileName):
    fileData = []
    # open the existing file if it exists
    if os.path.isfile(fileName):
        with open(fileName, 'r') as jsonFile:
            try:
                fileData = json.load(jsonFile)
            except ValueError:  # file exists but isn't valid JSON
                pass
    return fileData
def update_json_with_downloaded_pdfs(properties):
    fileName = 'sherrifSalePropertiesParsed.json'
    fileData = get_json_data(fileName)
    for item in properties:
        # record each property so its PDF isn't re-downloaded on the next run
        if item not in fileData:
            fileData.append(item)
    with open(fileName, 'w+') as jsonFile:
        jsonFile.write(json.dumps(fileData))
def save_json(properties):
    fileName = 'latestProperties.json'
    with open(fileName, 'w+') as jsonFile:
        jsonFile.write(json.dumps(properties))
def make_pdf_dir():
    dirPath = os.path.abspath('pdfs')
    try:
        os.makedirs(dirPath)
    except OSError:
        print "error making pdfs dir in this folder, trying temp folder instead: %s" % tempfile.gettempdir()
        dirPath = os.path.abspath(os.path.join(tempfile.gettempdir(), 'pdfs'))
        if not os.path.isdir(dirPath):
            os.makedirs(dirPath)
    return dirPath
def download_property_pdfs(properties, siteUrl, dirPath):
    for i, prop in enumerate(properties):
        pdfUrl = prop['pdf']
        name = pdfUrl.split('/')[-1]
        pdfUrl = siteUrl + pdfUrl
        rq = urllib2.Request(pdfUrl)
        res = urllib2.urlopen(rq)
        with open(os.path.join(dirPath, name), 'wb+') as pdf:
            pdf.write(res.read())
        print "PERCENT DONE DOWNLOADING PDFs: %s%%" % str(float(i+1)/len(properties)*100)
def convert_pdf(path):
    """Extract text from a PDF with pdfminer."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    #
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, range(10),
                                  maxpages=5, password='',
                                  caching=True, check_extractable=False):
        interpreter.process_page(page)
    #
    fp.close()
    device.close()
    #
    text = retstr.getvalue()
    retstr.close()
    return text
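# Usage sketch (path hypothetical): text = convert_pdf('/tmp/pdfs/notice.pdf')
# Reading stops after five pages (maxpages=5); the address block is assumed to
# appear early in these notices, so the whole file never needs parsing.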
def get_all_PDF_addresses(properties, dirPath):
    for i, prop in enumerate(properties):
        if 'address' in prop:  # already parsed on a previous run
            continue
        try:
            pdfUrl = prop['pdf']
            name = pdfUrl.split('/')[-1]
            #
            nameNoExt = name[:-4]
            nameOCR = nameNoExt + '_ocr.pdf'
            print dirPath, name
            path = os.path.abspath(os.path.join(dirPath, name))
            # use PDFminer to extract the text
            text = convert_pdf(path)
            # if the PDFminer text didn't find our string, try doing OCR
            if not 'The property is commonly known as' in text:
                do_PDF_OCR(path)
                path = os.path.abspath(os.path.join(dirPath, nameOCR))
                text = convert_pdf(path)
                if not 'The property is commonly known as' in text:
                    print 'OCR failed'
                    sys.exit(0)
            # the address sits on the lines just after 'commonly known as'
            idx = text.index('commonly known')
            tsplit = text[idx:idx+150].split('\n')
            addr1 = tsplit[2].strip()
            addr2 = tsplit[4].strip()
            properties[i]['address'] = addr1 + ', ' + addr2
        except Exception as e:
            print "\nError during parsing %s" % repr(e)
        print "\nPERCENT DONE: %s%%" % str(float(i+1)/len(properties)*100)
def do_PDF_OCR(path):
    script = PyPDFOCR()
    print path
    script.go([path])
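# pypdfocr writes its result alongside the input as <name>_ocr.pdf, which is
# the filename get_all_PDF_addresses reconstructs (nameNoExt + '_ocr.pdf')
# before re-running convert_pdf on the OCR'd copy.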
googleGeocodeUrl = 'http://maps.googleapis.com/maps/api/geocode/json?'

def get_coordinates(query, from_sensor=False):
    query = query.encode('utf-8')
    params = {
        'address': query,
        'sensor': "true" if from_sensor else "false"
    }
    url = googleGeocodeUrl + urllib.urlencode(params)
    json_response = urllib.urlopen(url)
    response = json.loads(json_response.read())
    if response['results']:
        location = response['results'][0]['geometry']['location']
        latitude, longitude = location['lat'], location['lng']
        print query, latitude, longitude
    else:
        latitude, longitude = None, None
        print query, "<no results>"
    return latitude, longitude
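# Usage sketch (address and coordinates hypothetical):
#   lat, lng = get_coordinates(u'123 Main St, Hillsboro, OR')
#   # -> (45.52, -122.99), or (None, None) when the geocoder finds nothing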
def convertDictToGeoJSON(properties, fileName='geoJsonSheriffs.geojson'):
    geoList = []
    hits = 0
    misses = 0
    for prop in properties:
        if not 'address' in prop:
            misses += 1
            continue
        try:
            # reverse (lat, lng) to (lng, lat) -- GeoJSON expects longitude first
            coords = list(get_coordinates(prop['address']))[::-1]
        except UnicodeDecodeError:
            misses += 1
            continue
        if None in coords or 'null' in coords:
            misses += 1
            continue
        geoDict = {}
        geoDict['type'] = 'Feature'
        geoDict["geometry"] = {
            "type": "Point",
            "coordinates": coords
        }
        geoDict["properties"] = {
            "name": prop['address'] + ' --- \n' + prop['notice_of_sale'] + ' --- \n' + prop['date']
        }
        geoList.append(geoDict)
        hits += 1
    print "SUCCESS with errors possible: %d, FAIL: %d" % (hits, misses)
    with open(fileName, 'w+') as jsonFile:
        jsonFile.write(json.dumps(geoList))
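# Sketch of one emitted feature (values hypothetical):
#   {"type": "Feature",
#    "geometry": {"type": "Point", "coordinates": [-122.99, 45.52]},
#    "properties": {"name": "123 Main St, Hillsboro --- \n<notice> --- \n<date>"}}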
if __name__ == '__main__':
    mainUrl = 'http://www.oregonsheriffs.com/'
    wacoUrl = mainUrl + 'sales-washington.htm'
    web_properties = get_sheriff_sale_data(wacoUrl)
    #
    properties_needing_downloaded = copy.deepcopy(web_properties)
    remove_filedata_from_properties(properties_needing_downloaded)
    #dirPath = make_pdf_dir()
    dirPath = os.path.abspath(os.path.join(tempfile.gettempdir(), 'pdfs'))
    if not os.path.isdir(dirPath):
        os.makedirs(dirPath)
    download_property_pdfs(properties_needing_downloaded, mainUrl, dirPath)
    update_json_with_downloaded_pdfs(properties_needing_downloaded)
    file_properties = get_json_data('sherrifSalePropertiesParsed.json')
    try:
        get_all_PDF_addresses(file_properties, dirPath)
    finally:
        # save whatever parsed successfully even if address extraction dies midway
        update_json_with_downloaded_pdfs(file_properties)