[英]python web-scraping ==> Google search
我需要幫助,因為我被困住了。 所以我實際上是在 Web-scraper 項目上,問題是我無法獲得搜索詞的正確“a”標簽,我只能得到www.google.com或 URL 之一用途是:
url_dorked ="https://www.google.com/search?q={}&sources=lnms&tbs=nws".format(dork_used)
當我在瀏覽器上嘗試 URL 時,他向我顯示了正確的頁面。 但是python腳本只接收google主頁的標簽。
這是完整的腳本:
#!/bin/bash
# -*- coding: utf-8 -*-
#Codded by Naylor From Exploit-Zone
#Join-us ! ==> https://forum.exploit-zone.eu/
#Create a folder nammed ==> Dork
from urllib.request import *
import os
from bs4 import BeautifulSoup
"""
http ==> 80
https ==> 443
----------
200 ==> OK
400 ==> Bad request
403 ==> Forbidden
404 ==> Not found
"""
def checker():
dork_used = ""
url_dorked ="https://www.google.co.ve/?gws_rd=cr&ei=DgBqVpWJMoPA-gHy25fACg#q={}".format(dork_used)
dorks = open("Dork/{}.txt".format(txtdork_path),"r")
list_dorks = []
dorks_lines = dorks.readlines()
tot_dorks = len(dorks_lines)
tot_dorks -= 1
for line in dorks_lines:
list_dorks.append(line)
print("\t{}\n (--) Has been charged\n".format(line))
print("\n(--) All {} dorks charged\n".format(tot_dorks))
dorks.close()
choosen_dork = int(input("Witch line do you want to use ? (write a number between 0 and {})\n>".format(tot_dorks)))
if choosen_dork >= 0 and choosen_dork <= tot_dorks:
pass
else:
print("The choosen number is to big !")
choosen_dork = int(input("Witch line do you want to use ? (write a number between 0 and {})\n>".format(tot_dorks)))
dork_used = str(list_dorks[choosen_dork])
print("\n(--) Selected Dorks ==> {}".format(dork_used))
req_fbypss = Request(url_dorked, headers = {'User-Agent': 'Mozilla/5.0'}) #user-agent to bypass Anti-crawl)
init_google = urlopen(req_fbypss)#init connection
print("(--) Google connection response ==> {}\n".format(init_google.code))
html_google = init_google.read() #read response of init
html_decoded = html_google.decode("utf-8")#like ISO-8859-1
soup = BeautifulSoup(html_google, "html.parser") #start an html html_parser
result_link = soup.findAll('a')
for i in result_link:
print(i,"\n")
"""
with open("dork_{}.txt".format(choosen_dork),"a") as f:
for result in result_1:
f.write(result)
"""
print("\n\n\welcome\n\n")
print("here Are the available dork file :\n")
dork_list = str(os.listdir('Dork/.'))
print("=> {}\n".format(dork_list))
txtdork_path = str(input("Enter dork file's name (without '.txt'/'[]'/''')\n>"))
check_file = os.path.isfile("Dork/{}.txt".format(txtdork_path))
if check_file == True:
print("\n(--) {} as been selected".format(txtdork_path))
else:
print("\nWrong name!\n (write only the name of the .txt file like : Google dork 2019)\n the .txt file have to be on the Dork folder\n\n")
exit()
checker()
腳本還沒有完成,它只是一個開發版本。
我的研究結果是:
- 要么有反爬蟲,用戶代理沒有繞過他。
- 要么是 URL 的問題,我必須修改她。
謝謝你幫我^^
def checker():
dork_used = ""
url_dorked ="https://www.google.co.ve/?gws_rd=cr&ei=DgBqVpWJMoPA-gHy25fACg#q={}".format(dork_used)
當url_dorked
被賦值時, dork_used
是一個空字符串,因此q=
是空的。 你是故意的嗎?
另外,我認為它應該是&q={}
,而不是#q={}
。
我按照您的建議將代碼替換為:
#!/bin/bash
# -*- coding: utf-8 -*-
#Codded by Naylor From Exploit-Zone
#Join-us ! ==> https://forum.exploit-zone.eu/
#Create a folder nammed ==> Dork
from urllib.request import *
import os
from bs4 import BeautifulSoup
"""
http ==> 80
https ==> 443
----------
200 ==> OK
400 ==> Bad request
403 ==> Forbidden
404 ==> Not found
"""
def checker():
dorks = open("Dork/{}.txt".format(txtdork_path),"r")
list_dorks = []
dorks_lines = dorks.readlines()
tot_dorks = len(dorks_lines)
tot_dorks -= 1
for line in dorks_lines:
list_dorks.append(line)
print("\t{}\n (--) Has been charged\n".format(line))
print("\n(--) All {} dorks charged\n".format(tot_dorks))
dorks.close()
choosen_dork = int(input("Witch line do you want to use ? (write a number between 0 and {})\n>".format(tot_dorks)))
if choosen_dork >= 0 and choosen_dork <= tot_dorks:
pass
else:
print("The choosen number is to big !")
choosen_dork = int(input("Witch line do you want to use ? (write a number between 0 and {})\n>".format(tot_dorks)))
dork_used = str(list_dorks[choosen_dork])
print("\n(--) Selected Dorks ==> {}".format(dork_used))
url_dorked ="https://www.google.com/search?q={}&sources=lnms&tbs=nws".format(dork_used)
req_fbypss = Request(url_dorked, headers = {'User-Agent': 'Mozilla/5.0'}) #user-agent to bypass Anti-crawl)
init_google = urlopen(req_fbypss)#init connection
print("(--) Google connection response ==> {}\n".format(init_google.code))
html_google = init_google.read() #read response of init
html_decoded = html_google.decode("ISO-8859-1")#like utf-8
soup = BeautifulSoup(html_google, "html.parser") #start an html html_parser
result_link = soup.findAll('a')
for i in result_link:
print(i,"\n")
"""
with open("dork_{}.txt".format(choosen_dork),"a") as f:
for result in result_1:
f.write(result)
"""
print("\n\n\tWelcome\n\n")
print("here Are the available dork file :\n")
dork_list = str(os.listdir('Dork/.'))
print("=> {}\n".format(dork_list))
txtdork_path = str(input("Enter dork file's name (without '.txt'/'[]'/''')\n>"))
check_file = os.path.isfile("Dork/{}.txt".format(txtdork_path))
if check_file == True:
print("\n(--) {} as been selected".format(txtdork_path))
else:
print("\nWrong name!\n (write only the name of the .txt file like : Google dork 2019)\n the .txt file have to be on the Dork folder\n\n")
exit()
checker()
所以我更換了 url_dorked 因為我放的那個是錯誤的,我忘記更換了^^
我改變了 url_dorked 的位置,把他放在 dork_used 的選擇之后
他現在要求使用 ISO-8859-1 而不是 utf-8 進行解碼,但仍然無法正常工作:/
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.