I am working on a task in which I have to get details from a web page using python request library. I have successfully got the details from the page but it has a button "Show Details" which fetches more details using ajax call, now I need to fetch those extra details also. can anyone help me in doing so? here's the link to website:- http://ipindiaonline.gov.in/tmrpublicsearch/frmmain.aspx and a screen shot of website here:
Here's my code which i have made.
from bs4 import BeautifulSoup
import requests,json
def returnJson(wordmark, page_class):
url = "http://ipindiaonline.gov.in/tmrpublicsearch/frmmain.aspx"
r_init = requests.get(url)
soup = BeautifulSoup(r_init.text, 'html.parser')
event_validation = soup.find("input", attrs={"name" : "__EVENTVALIDATION"})['value']
view_state = soup.find("input", attrs={"name" : "__VIEWSTATE"})['value']
search_type = 'WM'
postdata = {
'ctl00$ContentPlaceHolder1$DDLFilter' : '0',
'ctl00$ContentPlaceHolder1$DDLSearchType' : search_type,
'ctl00$ContentPlaceHolder1$TBWordmark' : wordmark,
'ctl00$ContentPlaceHolder1$TBClass' : page_class,
'__EVENTVALIDATION' : event_validation,
"__EVENTTARGET" : "ctl00$ContentPlaceHolder1$BtnSearch",
"__VIEWSTATE" : view_state,
}
r = requests.post(url, data=postdata)
return r
def scraping(r):
soup = BeautifulSoup(r.text, 'html.parser')
counter=len(soup.findAll('tr',attrs={'class':'row'}))
counter+=len(soup.findAll('tr',attrs={'class':'alt'}))
wordmark_idvalue='ContentPlaceHolder1_MGVSearchResult_lblsimiliarmark_'
proprietor_idvalue='ContentPlaceHolder1_MGVSearchResult_LblVProprietorName_'
applicationno_idvalue='ContentPlaceHolder1_MGVSearchResult_lblapplicationnumber_'
class_idvalue='ContentPlaceHolder1_MGVSearchResult_lblsearchclass_'
status_idvalue='ContentPlaceHolder1_MGVSearchResult_Label6_'
words_list=[]
for i in range(0,counter):
words_dict={}
row=soup.find('span',attrs={'id':(wordmark_idvalue+str(i))})
words_dict['Wordmark']=row.text
row=soup.find('span',attrs={'id':(proprietor_idvalue+str(i))})
words_dict['Proprietor']=row.text
row=soup.find('span',attrs={'id':(applicationno_idvalue+str(i))})
words_dict['Application Number']=row.text
row=soup.find('span',attrs={'id':(class_idvalue+str(i))})
words_dict['Class ']=row.text
row=soup.find('span',attrs={'id':(status_idvalue+str(i))})
words_dict['Status']=row.text
words_list.append(words_dict)
return words_list
def showDetails(wordmark, page_class):
if(len(wordmark)>2 and page_class.isalnum()==1):
var=json.dumps(scraping(returnJson(wordmark, page_class)))
return var
else:
print("Please Enter Valid Parameters\n")
showDetails('AIWA','2')
You need to create another POST request using information from the first POST request. The following shows how the Goods & Services Description
could be extracted from the returned data:
from operator import itemgetter
from bs4 import BeautifulSoup
import requests,json
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
def get_input_value(soup, name):
return soup.find("input", attrs={"name" : name})['value']
def returnJson(wordmark, page_class):
url = "http://ipindiaonline.gov.in/tmrpublicsearch/frmmain.aspx"
r_init = requests.get(url)
soup = BeautifulSoup(r_init.text, 'html.parser')
postdata = {
"ctl00$ContentPlaceHolder1$DDLFilter" : "0",
"ctl00$ContentPlaceHolder1$DDLSearchType" : "WM",
"ctl00$ContentPlaceHolder1$TBWordmark" : wordmark,
"ctl00$ContentPlaceHolder1$TBClass" : page_class,
"__EVENTVALIDATION" : get_input_value(soup, "__EVENTVALIDATION"),
"__EVENTTARGET" : "ctl00$ContentPlaceHolder1$BtnSearch",
"__VIEWSTATE" : get_input_value(soup, "__VIEWSTATE"),
}
r = requests.post(url, headers=headers, data=postdata)
return r
def scraping(r):
soup = BeautifulSoup(r.content, 'html.parser')
counter = len(soup.find_all('tr', attrs={'class':'row'}))
counter += len(soup.find_all('tr', attrs={'class':'alt'}))
words_list = []
fields = [
("Wordmark", "ContentPlaceHolder1_MGVSearchResult_lblsimiliarmark_{}"),
("Proprietor", "ContentPlaceHolder1_MGVSearchResult_LblVProprietorName_{}"),
("Application Number", "ContentPlaceHolder1_MGVSearchResult_lblapplicationnumber_{}"),
("Class", "ContentPlaceHolder1_MGVSearchResult_lblsearchclass_{}"),
("Status", "ContentPlaceHolder1_MGVSearchResult_Label6_{}"),
]
for index in range(0, counter):
words_dict = {}
for key, field in fields:
words_dict[key] = soup.find('span', attrs={'id' : field.format(index)}).text
print("Wordmark: {}".format(words_dict["Wordmark"]))
# Construct a POST request for the Show Details panel
# Locate matching 'Show details' link
span = soup.find('span', attrs={'id' : fields[0][1].format(index)})
a = span.find_next('a', class_='LnkshowDetails')
lnk_show_details = a['href'].split("'")[1]
data = {
"__EVENTTARGET" : lnk_show_details,
"__VIEWSTATE" : get_input_value(soup, "__VIEWSTATE"),
"__VIEWSTATEENCRYPTED" : "",
"__EVENTVALIDATION" : get_input_value(soup, "__EVENTVALIDATION"),
"__ASYNCPOST" : "true",
}
url = "http://ipindiaonline.gov.in/tmrpublicsearch" + soup.form["action"].strip(".")
r_details = requests.post(url, headers=headers, data=data)
html = b''.join(itemgetter(7, 8)(r_details.content.split(b"|")))
soup_details = BeautifulSoup(html, "html.parser")
details = {}
for tr in soup_details.find_all('tr'):
row = [td.text for td in tr.find_all('td')] # Note: Journal No and Used since would need more work
details[row[0]] = row[1]
# Copy description
desc = 'Goods & Services Description'
words_dict[desc] = details[desc]
words_list.append(words_dict)
return words_list
def showDetails(wordmark, page_class):
if len(wordmark) > 2 and page_class.isalnum() == 1:
var = json.dumps(scraping(returnJson(wordmark, page_class)))
return var
else:
print("Please Enter Valid Parameters\n")
print(showDetails('AIWA','2'))
This would display:
Wordmark: AIWA
Wordmark: AIWACEM
Wordmark: AIWAPRIME (LABEL)
[{"Wordmark": "AIWA", "Proprietor": "AIWA CO. LTD.", "Application Number": "683935", "Class": "2", "Status": "Registered", "Goods & Services Description": "PAINTS, VARNISHES, LACQUERS, PRESERVATIVES AGAINST RUST AND AGAINST DESTRIORATION OF WOOD, COLOURING MATTERS, DYESTUFFS, MORDANTS, NATURAL RESINS, METALS IN FOIL AND POWDER FROM FOR PAINTERS AND DECORATORS."}, {"Wordmark": "AIWACEM ", "Proprietor": "AMITA B. MEHTA", "Application Number": "1108415", "Class": "2", "Status": "Registered", "Goods & Services Description": "waterproof cement paint."}, {"Wordmark": "AIWAPRIME (LABEL)", "Proprietor": "AMITA B. MEHTA", "Application Number": "1165809", "Class": "2", "Status": "Registered", "Goods & Services Description": "WATER BASED CEMENT PRIMER INCLUDED IN CLASS 2."}]
Note: The data returned contains other fields which are separated by the |
character. The HTML for the details also happens to contain this character so it is necessary to extract fields 7
and 8
to get just the HTML.
I wouldn't attempt this using BeautifulSoup. However, I think you could do it with Selenium ( see https://selenium-python.readthedocs.io/ .)
Using its facilities you can click the 'Show Details' button of your choice, then wait for the requested information to appear in the right panel, and then pick up the requested information more or less as you would using BeautifulSoup from that panel.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.