I've created a script using the requests library to get the tabular content available on a webpage. When I visit that site manually using this link, I see a page on which I need to hit the AGREE button first in order to see the tabular content.
Once again this is the website link
I tried to observe the traffic closely in the Network section of Chrome dev tools and mimicked the same requests using the script below to access the content. However, all I get is the following, whereas I'm supposed to get the tabular content in some JSON-like format according to dev tools.
Output I'm getting:
b'\n\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\n\n\n{}'
Expected output (truncated):
{T:{"Columns":[{"tradeQuantity":"1125000","quantityAsString":"1125000",
I've tried with:
import json
import requests
# Asker's original attempt. As reported in the question, it prints a
# whitespace-padded `{}` instead of the expected `{T:{"Columns":[...]}}` table.

# URL of the trade-activity results page (first request target).
start_url = 'https://finra-markets.morningstar.com/BondCenter/BondTradeActivitySearchResult.jsp?'
# URL of the search endpoint (second request target).
link = 'https://finra-markets.morningstar.com/bondSearch.jsp'
# Query-string parameters appended to start_url.
qsp = {
'ticker': 'C679131',
'startdate': '10/03/2019',
'enddate': '10/03/2020'
}
# Form fields sent in the body of the first POST.
payload = {
'postData': {'Keywords':[]},
'ticker': 'C679131',
'startDate': '',
'endDate': '',
'showResultsAs': 'B',
'debtOrAssetClass': '',
'spdsType': ''
}
# Search parameters sent with the second POST.
params = {
'count': '20',
'sortfield': 'tradeDate',
'sorttype': '2',
'start': '0',
'searchtype': 'T',
# NOTE(review): `query` is a nested dict here; because the request below uses
# json=params, it goes out as a nested JSON object inside a JSON body.
'query': {"Keywords":[{"Name":"securityId","Value":"C679131"},{"Name":"tradeDate","minValue":"10/03/2019","maxValue":"10/03/2020"}]}
}
with requests.Session() as s:
# Headers set on the session apply to every request made through it.
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
s.headers['Referer'] = 'https://finra-markets.morningstar.com/BondCenter/UserAgreement.jsp'
# First request: POST the form payload to the results page.
r = s.post(start_url,params=qsp,data=payload)
# Switch the Referer to the results page and mark the next call as an XHR.
s.headers['Referer'] = 'https://finra-markets.morningstar.com/BondCenter/BondTradeActivitySearchResult.jsp?ticker=C679131&startdate=10%2F03%2F2019&enddate=10%2F03%2F2020'
s.headers['X-Requested-With'] = 'XMLHttpRequest'
# Second request: POST the search parameters as a JSON body.
r = s.post(link,json=params)
print(r.status_code)
print(r.content)
How can I get the tabular content from that webpage using requests?
You need to make a call to:
POST https://finra-markets.morningstar.com/finralogin.jsp
while storing cookies using requests.Session(). Also, the Referer header is needed for the call to:
POST https://finra-markets.morningstar.com/bondSearch.jsp
After that, the result is not quite JSON, as pointed out by baduker; you can use a regex to extract the valid JSON part:
import requests
from urllib import parse
import json
import re
import pandas as pd
# Fetch FINRA/Morningstar bond trade activity for one security and load the
# rows into a pandas DataFrame.
host = "https://finra-markets.morningstar.com"
path = "/BondCenter/BondTradeActivitySearchResult.jsp"
qsp = {
    "ticker": "C679131",
    "startdate": "10/03/2019",
    "enddate": "10/03/2020",
}
# The results-page path (with its query string) is used twice: as the login
# redirect target and, prefixed with the host, as the Referer of the search.
results_path = f"{path}?{parse.urlencode(qsp)}"

# The Session keeps the cookies set by finralogin.jsp and sends them along
# with the bondSearch.jsp call; the context manager closes it afterwards.
with requests.Session() as s:
    s.post(f"{host}/finralogin.jsp", data={"redirectPage": results_path})
    r = s.post(
        f"{host}/bondSearch.jsp",
        headers={"Referer": f"{host}{results_path}"},
        data={
            "count": 20,
            "sortfield": "tradeDate",
            "sorttype": 2,
            "start": 0,
            "searchtype": "T",
            # `query` travels as a JSON string inside the form body.
            "query": json.dumps({
                "Keywords": [
                    {"Name": "securityId", "Value": qsp["ticker"]},
                    {
                        "Name": "tradeDate",
                        "minValue": qsp["startdate"],
                        "maxValue": qsp["enddate"],
                    },
                ]
            }),
        },
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()

# The body is `{T:{...}}` surrounded by whitespace: the bare key `T` makes it
# invalid JSON, so capture only the inner object. DOTALL lets the capture span
# line breaks should the object ever be pretty-printed.
dataReg = re.search(r"\{T:(.*)\}", r.text, re.DOTALL)
if dataReg is None:
    # e.g. the empty `{}` body returned when the session is not accepted
    raise ValueError(f"Unexpected bondSearch.jsp response: {r.text.strip()[:200]!r}")
data = json.loads(dataReg.group(1))
df = pd.DataFrame(data["Columns"])
print(df)
Output:
tradeQuantity quantityAsString timeOfExecution settlementDate tradeModifier secondModifier specialPriceIndicator ... tradeDate symbol cusip callable commissionIndicator ATSIndicator remuneration
0 1125000 1125000 11:46:02 10/2/2020 _ _ - ... 10/2/2020 None None None N N
1 60000 60000 10:23:55 10/5/2020 _ _ - ... 10/1/2020 None None None N N
2 60000 60000 10:23:54 10/5/2020 _ _ - ... 10/1/2020 None None None M M
3 200000 200000 16:27:43 10/2/2020 _ _ - ... 9/30/2020 None None None
4 200000 200000 16:27:43 10/2/2020 _ _ - ... 9/30/2020 None None None N N
5 2900000 2900000 15:39:16 10/2/2020 _ _ - ... 9/30/2020 None None None M M
6 20000 20000 12:24:48 10/2/2020 _ _ - ... 9/30/2020 None None None M M
.........
In the Chrome Developer console, in the Network tab, you can right-click and choose "headers options/Set Cookies" to quickly catch which calls are setting cookies.
The trick was to correctly mimic the request with the exact same headers and cookies. I took the cookie raw string from the Developer Tool.
Here's how to get the raw text data:
import json
from http.cookies import SimpleCookie
from urllib.parse import urlencode
import requests
# Replay the bondSearch.jsp XHR with headers and cookies copied from the
# browser's developer tools, then print the raw response text.
link = 'https://finra-markets.morningstar.com/bondSearch.jsp'
payload = {
    'count': '20',
    'sortfield': 'tradeDate',
    'sorttype': '2',
    'start': '0',
    'searchtype': 'T',
    # Serialize the nested structure to a JSON string first: form-encoding a
    # dict value would send its Python repr (single quotes), not JSON.
    'query': json.dumps({
        "Keywords": [
            {"Name": "securityId", "Value": "C679131"},
            {"Name": "tradeDate", "minValue": "10/03/2019", "maxValue": "10/03/2020"},
        ]
    }),
}

# Raw `Cookie` request header copied from the browser. These values are
# session-bound and need replacing once they expire.
cookies_raw_data = "__cfduid=db2d21a652ef313fcff3704bd87e839401602408581; qs_wsid=1CBF0E77A1169ED03A3EB86A6A8A991D; __cfruid=0ef7fb90b47b06df86311ff32918c0c9c441617d-1602408582; SessionID=1CBF0E77A1169ED03A3EB86A6A8A991D; UsrID=41151; UsrName=FINRA.QSAPIDEF@morningstar.com; Instid=FINRA; msFinraHasAgreed=true"
cookie = SimpleCookie()
cookie.load(cookies_raw_data)
# Flatten the parsed morsels into the plain name -> value dict requests takes.
cookies = {key: morsel.value for key, morsel in cookie.items()}

ref_payload = urlencode(dict(ticker="C679131", startdate="10/03/2019", enddate="10/03/2020"))
referer = f"https://finra-markets.morningstar.com/BondCenter/BondTradeActivitySearchResult.jsp?{ref_payload}"
headers = {
    "Accept": "text/plain, */*; q=0.01",
    # "br" is left out: requests only decodes Brotli when an optional Brotli
    # package is installed, so advertising it can yield an undecodable body.
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    # No Content-Length here: requests derives it from the encoded body, and a
    # hard-coded value would not match the payload anyway.
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Host": "finra-markets.morningstar.com",
    "Origin": "https://finra-markets.morningstar.com",
    "Referer": referer,
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

# `response` holds the response text (not the Response object). The timeout
# keeps the script from hanging forever on a stalled connection.
response = requests.post(
    link,
    data=payload,
    headers=headers,
    cookies=cookies,
    timeout=30,
).text
print(response.strip())
Output:
{T:{"Columns":[{"tradeQuantity":"1125000","quantityAsString":"1125000","timeOfExecution":"11:46:02","settlementDate":"10/2/2020","tradeModifier":"_","secondModifier":"_","specialPriceIndicator":"-","asOfTrade":"-","reportingParty":"B","tradeStatus":"T","reportingPartyType":"D","contraPartyType":"C","securityId":"C679131","issueIdentifier":"EXC4479862","descriptionOfIssuer":"EXELON CORP","subproductType":"Corporate Bond","couponRate":3.497,"maturityDate":"06/01/2022","price":104.576,"yield":0.584,"tradeDate":"10/2/2020","symbol":null,"cusip":null,"callable":null,"commissionIndicator":"N","ATSIndicator":" ","remuneration":"N"},{"tradeQuantity":"60000","quantityAsString":"60000","timeOfExecution":"10:23:55","settlementDate":"10/5/2020","tradeModifier":"_","secondModifier":"_","specialPriceIndicator":"-","asOfTrade":"-","reportingParty":"S","tradeStatus":"T","reportingPartyType":"D",
and so on...
The data itself is plain text that turns out to be invalid JSON. I couldn't parse it right away. After a couple of tries, I realized that the first key, T, is not enclosed in double quotes ("), so this wasn't parsing as valid JSON, but... a simple hack did the trick!
To get the JSON object use this (I'll edit this if I find a less hacky way):
# The body is wrapped as `{T:<json>}`; the bare key `T` makes it invalid JSON,
# so peel off the leading `{T:` and the trailing `}` before parsing. Check the
# wrapper first so an unexpected body (e.g. an empty `{}`) fails with a clear
# message instead of a confusing JSON decode error.
body = response.strip()
if not (body.startswith("{T:") and body.endswith("}")):
    raise ValueError(f"Unexpected bondSearch.jsp response: {body[:200]!r}")
data = json.loads(body[3:-1])
for t in data['Columns']:
    print(f"{t['descriptionOfIssuer']} - {t['tradeQuantity']} - {t['price']}")
Output:
EXELON CORP - 1125000 - 104.576
EXELON CORP - 60000 - 104.642
EXELON CORP - 60000 - 104.618
EXELON CORP - 200000 - 104.612
EXELON CORP - 200000 - 104.612
EXELON CORP - 2900000 - 104.597
EXELON CORP - 20000 - 104.6
EXELON CORP - 225000 - 104.553
EXELON CORP - 64000 - 104.581
EXELON CORP - 64000 - 104.596
EXELON CORP - 50000 - 104.553
EXELON CORP - 2100000 - 104.634
EXELON CORP - 230000 - 104.551
EXELON CORP - 97000 - 104.566
EXELON CORP - 15000 - 104.551
EXELON CORP - 342000 - 104.582
EXELON CORP - 1400000 - 104.616
EXELON CORP - 200000 - 104.501
EXELON CORP - 200000 - 104.511
EXELON CORP - 220000 - 104.397
EDIT:
To prove that even short-lived (and hard-coded) cookies are better than no data at all, here's a modified version of the script that produces a data dump for that ticker you're after.
This should work even with those damned cookies, because you're requesting archival data that's unlikely to change. So, you can fetch it, save it, and move on.
Note: if the cookies that I'm using are outdated, just replace these with whatever values you have in the Developer Tool -> XHR -> bondSearch.jsp -> Headers -> Request Headers -> Cookie:
__cfduid
qs_wsid
__cfruid
SessionID (this is always the same as qs_wsid)
The code:
import json
import time
from urllib.parse import urlencode
import requests
# Page through bondSearch.jsp for one security and dump every trade row to a
# JSON file. The search criteria live in one place so the referer, the query
# and the output filename cannot drift apart.
TICKER = "C679131"
START_DATE = "10/03/2019"
END_DATE = "10/03/2020"
PAGE_SIZE = 20
SEARCH_URL = 'https://finra-markets.morningstar.com/bondSearch.jsp'

ref_payload = urlencode(dict(ticker=TICKER, startdate=START_DATE, enddate=END_DATE))
referer = f"https://finra-markets.morningstar.com/BondCenter/BondTradeActivitySearchResult.jsp?{ref_payload}"
headers = {
    "Accept": "text/plain, */*; q=0.01",
    # "br" is left out: requests only decodes Brotli when an optional Brotli
    # package is installed, so advertising it can yield an undecodable body.
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    # No Content-Length here: requests derives it from the encoded body, which
    # changes size as the `start` offset grows.
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Host": "finra-markets.morningstar.com",
    "Origin": "https://finra-markets.morningstar.com",
    "Referer": referer,
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}
# Session cookies copied from the browser; replace them once they expire.
cookies = {
    "__cfduid": "d1820cb5f1d1e8ec40513d0f8326ce1881602492151",
    "qs_wsid": "92CD4948C2AC7FCEC0989B34B86C1ADB",
    # A stray trailing "}" was removed from this value.
    "__cfruid": "4dec9a2deb6d70c86ee5b8fa4046748994ef6254-1602492151",
    "SessionID": "92CD4948C2AC7FCEC0989B34B86C1ADB",
    "UsrID": "41151",
    "UsrName": "FINRA.QSAPIDEF@morningstar.com",
    "Instid": "FINRA",
    "msFinraHasAgreed": "true",
}

# The search criteria never change between pages, so serialize them once.
# They go out as a JSON string: form-encoding a dict value would send its
# Python repr (single quotes) instead of JSON.
query = json.dumps({
    "Keywords": [
        {"Name": "securityId", "Value": TICKER},
        {"Name": "tradeDate", "minValue": START_DATE, "maxValue": END_DATE},
    ]
})


def parse_columns(raw):
    """Return the "Columns" list from a raw bondSearch.jsp response body.

    The body is wrapped as `{T:<json>}`; the bare key `T` makes it invalid
    JSON, so the wrapper is stripped before parsing.

    Raises:
        ValueError: if the body does not have the `{T:...}` wrapper.
    """
    body = raw.strip()
    if not (body.startswith("{T:") and body.endswith("}")):
        raise ValueError(f"Unexpected bondSearch.jsp response: {body[:200]!r}")
    return json.loads(body[3:-1])["Columns"]


start_counter = 0
final_output = []
while True:
    payload = {
        'count': str(PAGE_SIZE),
        'sortfield': 'tradeDate',
        'sorttype': '2',
        'start': str(start_counter),
        'searchtype': 'T',
        'query': query,
    }
    response = requests.post(
        SEARCH_URL,
        data=payload,
        headers=headers,
        cookies=cookies,
        timeout=30,  # do not hang forever on a stalled connection
    )
    # Stop on an HTTP error rather than trying to parse an error page.
    response.raise_for_status()
    data = parse_columns(response.text)
    if not data:
        # An empty page means the previous one was the last.
        break
    print(f"Fetching data for counter {start_counter}...")
    final_output.extend(data)
    start_counter += PAGE_SIZE

with open(f"data_dump_securityID_{TICKER}.json", "w", encoding="utf-8") as d:
    json.dump(final_output, d, indent=4, sort_keys=True)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.