[英]Python - make script loop until condition met and use a different proxy address for each loop
我是菜鳥的定義。 我對python幾乎一無所知,正在尋求幫助。 我只能讀足夠多的代碼來更改變量以適合我的需求,但是當我要做某些原始代碼沒有要求的事情時,我迷路了。
因此,這是一筆交易,我找到了一個craigslist(CL)標記腳本,該腳本最初搜索所有CL站點並標記了包含特定關鍵字的帖子(編寫該腳本以標記所有提及科學論的帖子)。
我將其更改為僅在我的常規區域中搜索CL網站(15個網站,而不是437個網站),但它仍在查找已更改的特定關鍵字。 我想自動標記連續對CL進行垃圾郵件的人員,並且由於我在CL上進行了大量的業務(從分類到發布),因此很難進行分類。
我希望腳本循環運行,直到再也找不到滿足條件的帖子為止,並且在每次循環后更換一個代理服務器。我會在腳本中的某個位置放入代理IP地址。
我期待您的答復。
這是我更改的代碼:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
from twill.commands import * # gives us go()
# Craigslist subdomains to scan (Northern California region).
# Fixed: 'reno' was listed twice in the original, causing that area to be
# scanned (and its postings flagged) twice per run.
areas = ['sfbay', 'chico', 'fresno', 'goldcountry', 'humboldt', 'mendocino', 'modesto', 'monterey', 'redding', 'reno', 'sacramento', 'siskiyou', 'stockton', 'yubasutter']
def expunge(url, area):
    """Fetch a Craigslist search-results page and flag every listed posting as spam.

    url  -- full search URL to scrape
    area -- area name (unused here; kept for interface compatibility)
    """
    raw = urllib.urlopen(url).read()
    # Result links live on the first line that follows the <hr> separator.
    listing_line = raw[raw.index('<hr>'):].split('\n')[0]
    links = []
    for piece in listing_line.split('href="')[1:-1]:
        if '<font size="-1">' in piece:
            links.append(piece[:piece.index('">')])
    for link in links:
        # The numeric posting id sits between the final '/' and '.html'
        # (e.g. 34235235252).
        pid = link[link.rfind('/')+1:link.index('.html')]
        go('https://post.craigslist.org/flag?flagCode=15&postingID=' + pid)  # flag as spam
print 'Checking ' + str(len(areas)) + ' areas...'
for area in ['http://' + a + '.craigslist.org/' for a in areas]:
ujam = area + 'search/?query=james+"916+821+0590"+&catAbb=hhh'
udre = area + 'search/?query="DRE+%23+01902542+"&catAbb=hhh'
try:
jam = urllib.urlopen(ujam).read()
dre = urllib.urlopen(udre).read()
except:
print 'tl;dr error for ' + area
if 'Found: ' in jam:
print 'Found results for "James 916 821 0590" in ' + area
expunge(ujam, area)
print 'All "James 916 821 0590" listings marked as spam for area'
if 'Found: ' in dre:
print 'Found results for "DRE # 01902542" in ' + area
expunge(udre, area)
print 'All "DRE # 01902542" listings marked as spam for area'
您可以這樣創建一個常量循環:
# Illustrative skeleton of an endless loop: runs until the exit
# condition becomes true ('condition' is a placeholder to fill in).
while True:
    if condition :
        break
Itertools提供了一些用於迭代http://docs.python.org/2/library/itertools.html的技巧
值得注意的是,請查看itertools.cycle
(這些都是作為正確方向的指針。您可以使用一個,另一個甚至兩個來制定解決方案)
我對您的代碼做了一些更改。 在我看來,函數expunge
已經遍歷頁面中的所有結果,因此我不確定您需要執行哪個循環,但是有一個示例,您可以檢查最后是否找到結果,但是沒有中斷的循環。
不知道如何更改代理/ ip。
順便說一句,'reno' 在你的 areas 列表中出現了兩次。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
from twill.commands import go
# Craigslist subdomains to scan (Northern California region; the duplicate
# 'reno' from the asker's list has been removed -- 14 entries).
areas = ['sfbay', 'chico', 'fresno', 'goldcountry', 'humboldt',
         'mendocino', 'modesto', 'monterey', 'redding', 'reno',
         'sacramento', 'siskiyou', 'stockton', 'yubasutter']
# URL-encoded search strings ('+' for spaces, '%23' for '#').
queries = ['james+"916+821+0590"','"DRE+%23+01902542"']
def expunge(url, area):
    """Open a Craigslist search-results page and flag each posting on it as spam.

    url  -- full search URL to scrape
    area -- area name (unused; kept for interface compatibility)
    """
    html = urllib.urlopen(url).read()
    # Only the first line after the <hr> separator carries the result links.
    line = html[html.index('<hr>'):].split('\n')[0]
    hits = [frag[:frag.index('">')]
            for frag in line.split('href="')[1:-1]
            if '<font size="-1">' in frag]
    for hit in hits:
        # Posting id is the number between the last '/' and '.html'.
        num = hit[hit.rfind('/')+1:hit.index('.html')]
        flag_url = 'https://post.craigslist.org/flag?flagCode=15&postingID=' + num
        go(flag_url)  # submit the spam flag via twill
print 'Checking ' + str(len(areas)) + ' areas...'
for area in areas:
for query in queries:
qurl = 'http://' + area + '.craigslist.org/search/?query=' + query + '+&catAbb=hhh'
try:
q = urllib.urlopen(qurl).read()
except:
print 'tl;dr error for {} in {}'.format(query, area)
break
if 'Found: ' in q:
print 'Found results for {} in {}'.format(query, area)
expunge(qurl, area)
print 'All {} listings marked as spam for area'.format(query)
elif 'Nothing found for that search' in q:
print 'No results for {} in {}'.format(query, area)
break
else:
break
我進行了一些更改...不確定它們的運行狀況如何,但是沒有出現任何錯誤。 如果發現任何錯誤或遺漏的東西,請告訴我。 - 謝謝
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib, urllib2
from twill.commands import go
# NOTE(review): urllib2.install_opener() REPLACES the single global opener on
# every call, so after this sequence only the LAST handler (proxy5,
# 198.154.114.118:3128) is actually used by urllib2 requests -- the first four
# installs have no lasting effect.  To rotate proxies per request, keep the
# openers in a list and call opener.open(url) explicitly instead.
proxy = urllib2.ProxyHandler({'https': '108.60.219.136:8080'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
proxy2 = urllib2.ProxyHandler({'https': '198.144.186.98:3128'})
opener2 = urllib2.build_opener(proxy2)
urllib2.install_opener(opener2)
proxy3 = urllib2.ProxyHandler({'https': '66.55.153.226:8080'})
opener3 = urllib2.build_opener(proxy3)
urllib2.install_opener(opener3)
proxy4 = urllib2.ProxyHandler({'https': '173.213.113.111:8080'})
opener4 = urllib2.build_opener(proxy4)
urllib2.install_opener(opener4)
proxy5 = urllib2.ProxyHandler({'https': '198.154.114.118:3128'})
opener5 = urllib2.build_opener(proxy5)
urllib2.install_opener(opener5)

# Craigslist subdomains to scan (Northern California region).
areas = ['sfbay', 'chico', 'fresno', 'goldcountry', 'humboldt',
         'mendocino', 'modesto', 'monterey', 'redding', 'reno',
         'sacramento', 'siskiyou', 'stockton', 'yubasutter']
# URL-encoded search strings ('+' for spaces, '%23' for '#').
queries = ['james+"916+821+0590"','"DRE+%23+01902542"']
def expunge(url, area):
    """Fetch a Craigslist search-results page and flag each posting three ways.

    url  -- full search URL to scrape
    area -- area name (unused; kept for interface compatibility)

    The urllib2.urlopen calls go through the globally installed proxy opener.
    """
    page = urllib.urlopen(url).read()
    # Result links live on the first line after the <hr> separator.
    page = page[page.index('<hr>'):].split('\n')[0]
    page = [i[:i.index('">')] for i in page.split('href="')[1:-1] if '<font size="-1">' in i]
    for u in page:
        num = u[u.rfind('/')+1:u.index('.html')]  # numeric posting id (like 34235235252)
        # Fixed: each urlopen call below already performs the flag request.
        # The original then passed the *response objects* to twill's go(),
        # which expects a URL string -- that would raise and re-send the
        # request.  The go() calls are dropped.
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=15&postingID=' + num)  # flagCode 15 = spam
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=28&postingID=' + num)  # flagCode 28 -- meaning unverified, TODO confirm
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=16&postingID=' + num)  # flagCode 16 -- meaning unverified, TODO confirm
print 'Checking ' + str(len(areas)) + ' areas...'
for area in areas:
for query in queries:
qurl = 'http://' + area + '.craigslist.org/search/?query=' + query + '+&catAbb=hhh'
try:
q = urllib.urlopen(qurl).read()
except:
print 'tl;dr error for {} in {}'.format(query, area)
break
if 'Found: ' in q:
print 'Found results for {} in {}'.format(query, area)
expunge(qurl, area)
print 'All {} listings marked as spam for {}'.format(query, area)
print ''
print ''
elif 'Nothing found for that search' in q:
print 'No results for {} in {}'.format(query, area)
print ''
print ''
break
else:
break
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib, urllib2
from twill.commands import go
# NOTE(review): urllib2.install_opener() REPLACES the single global opener on
# every call, so after this sequence only the LAST handler (proxy5,
# 198.154.114.118:3128) is actually used by urllib2 requests -- the first four
# installs have no lasting effect.  To rotate proxies per request, keep the
# openers in a list and call opener.open(url) explicitly instead.
proxy = urllib2.ProxyHandler({'https': '108.60.219.136:8080'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
proxy2 = urllib2.ProxyHandler({'https': '198.144.186.98:3128'})
opener2 = urllib2.build_opener(proxy2)
urllib2.install_opener(opener2)
proxy3 = urllib2.ProxyHandler({'https': '66.55.153.226:8080'})
opener3 = urllib2.build_opener(proxy3)
urllib2.install_opener(opener3)
proxy4 = urllib2.ProxyHandler({'https': '173.213.113.111:8080'})
opener4 = urllib2.build_opener(proxy4)
urllib2.install_opener(opener4)
proxy5 = urllib2.ProxyHandler({'https': '198.154.114.118:3128'})
opener5 = urllib2.build_opener(proxy5)
urllib2.install_opener(opener5)

# Single Craigslist subdomain to scan.
areas = ['capecod']
# Rental-related keywords to search for (plain words, no URL escaping needed).
queries = ['rent','rental','home','year','falmouth','lease','credit','tenant','apartment','bedroom','bed','bath']
def expunge(url, area):
    """Fetch a Craigslist search-results page and flag each posting three ways.

    url  -- full search URL to scrape
    area -- area name (unused; kept for interface compatibility)

    The urllib2.urlopen calls go through the globally installed proxy opener.
    """
    page = urllib.urlopen(url).read()
    # Result links live on the first line after the <hr> separator.
    page = page[page.index('<hr>'):].split('\n')[0]
    page = [i[:i.index('">')] for i in page.split('href="')[1:-1] if '<font size="-1">' in i]
    for u in page:
        num = u[u.rfind('/')+1:u.index('.html')]  # numeric posting id (like 34235235252)
        # Fixed: each urlopen call below already performs the flag request.
        # The original then passed the *response objects* to twill's go(),
        # which expects a URL string -- that would raise and re-send the
        # request.  The go() calls are dropped.
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=15&postingID=' + num)  # flagCode 15 = spam
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=28&postingID=' + num)  # flagCode 28 -- meaning unverified, TODO confirm
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=16&postingID=' + num)  # flagCode 16 -- meaning unverified, TODO confirm
print 'Checking ' + str(len(areas)) + ' areas...'
for area in areas:
for query in queries:
qurl = 'http://' + area + '.craigslist.org/search/?query=' + query + '+&catAbb=hhh'
try:
q = urllib.urlopen(qurl).read()
except:
print 'tl;dr error for {} in {}'.format(query, area)
break
if 'Found: ' in q:
print 'Found results for {} in {}'.format(query, area)
expunge(qurl, area)
print 'All {} listings marked as spam for {}'.format(query, area)
print ''
print ''
elif 'Nothing found for that search' in q:
print 'No results for {} in {}'.format(query, area)
print ''
print ''
break
else:
break
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.