spacepaste

  1.  
  2. from splinter.browser import Browser
  3. browser = Browser('zope.testbrowser')
  4. realProxies = []
  5. #Loop through all pages, any big number will do because loop cancels when finds no proxies
  6. for i in range(1,100):
  7. #Urls < 10 have prefix of 0
  8. if i <= 9:
  9. i = '0%d' % i
  10. #Build URL String
  11. url = 'http://www.samair.ru/proxy/proxy-%s%s' % (i, '.htm')
  12. browser.visit(url)
  13. #Grab the table cell which the proxies are in
  14. proxies = browser.find_by_css("#main_content .proxylist .box_text .tablelist tr td")
  15. for proxy in proxies:
  16. #Proxies are sometimes thrown in with other crap, this weeds out the other crap
  17. if not ":" in proxy.text or not "." in proxy.text or "o" in proxy.text: continue
  18. realProxies.append(proxy.text)
  19. #If there were under 50 table cells on this page, then it's the last page, so break the loop
  20. if len(proxies) < 50:
  21. break
  22. #Firefox RAM usage seems to built up over time, so every 5 pages, restart firefox
  23. if int(i) % 5 == 0:
  24. browser.quit()
  25. browser = Browser()
  26. browser.quit()
  27. file = open('proxies.txt','w')
  28. file.write('\n'.join(realProxies))
  29. print str(len(realProxies)) + ' Added to proxies.txt'
  30. --------------------------
  31. Traceback (most recent call last):
  32. File "/Users/Jake/Desktop/python/01/NewPythonProject/src/test.py", line 25, in <module>
  33. if not ":" in proxy.text or not "." in proxy.text or "o" in proxy.text: continue
  34. TypeError: argument of type 'NoneType' is not iterable
  35. None
  36.