-
# Scrape the paginated proxy list on samair.ru and write every "ip:port"
# entry found to proxies.txt, one per line.

from splinter.browser import Browser

# Driver used for every browser instance, including the periodic restarts,
# so the scrape does not silently switch drivers part-way through.
DRIVER_NAME = 'zope.testbrowser'

# A full page holds at least this many table cells; fewer means the last page.
CELLS_PER_FULL_PAGE = 50

# The browser is restarted after this many pages to keep memory usage down.
PAGES_PER_BROWSER = 5

browser = Browser(DRIVER_NAME)

realProxies = []

# Loop through all pages; any big upper bound will do because the loop
# stops by itself once it reaches the last (short) page.
for page in range(1, 100):
    # Page numbers below 10 are zero-padded in the URL (proxy-01.htm, ...).
    url = 'http://www.samair.ru/proxy/proxy-%02d.htm' % page

    browser.visit(url)

    # Grab the table cells which the proxies are in.
    proxies = browser.find_by_css(
        "#main_content .proxylist .box_text .tablelist tr td")

    for proxy in proxies:
        # Read the text once: a cell without any direct text yields None,
        # which previously crashed the substring tests below with
        # "TypeError: argument of type 'NoneType' is not iterable".
        text = proxy.text
        if not text:
            continue

        # Proxies are mixed in with other cells (headers, country names, ...).
        # A real entry contains ':' and '.', and never the letter 'o'.
        if ":" not in text or "." not in text or "o" in text:
            continue

        realProxies.append(text)

    # If there were under 50 table cells on this page, then it's the last
    # page, so stop.
    if len(proxies) < CELLS_PER_FULL_PAGE:
        break

    # Browser memory usage seems to build up over time, so restart it
    # every few pages.
    if page % PAGES_PER_BROWSER == 0:
        browser.quit()
        browser = Browser(DRIVER_NAME)

browser.quit()

# The with-block guarantees the file is flushed and closed.
with open('proxies.txt', 'w') as out_file:
    out_file.write('\n'.join(realProxies))

# Parenthesised single-argument form prints identically on Python 2 and 3.
print(str(len(realProxies)) + ' Added to proxies.txt')
-
-
-
-
- --------------------------
-
-
-
- Traceback (most recent call last):
- File "/Users/Jake/Desktop/python/01/NewPythonProject/src/test.py", line 25, in <module>
- if not ":" in proxy.text or not "." in proxy.text or "o" in proxy.text: continue
- TypeError: argument of type 'NoneType' is not iterable
- None
-