![](/img/trans.png)
[英]How to extract data from Json format in html using beautifulsoup
[英]Parse HTML and extract data from a tag's string (in JSON format)
我從一個 HTML 文件中提取了一個 <script> 標簽:
from bs4 import BeautifulSoup

# Parse centrimo.html; the `with` block closes the file handle once parsing
# is done, and naming the parser avoids depending on whichever one happens
# to be installed.
with open("centrimo.html") as html:
    parsed_html = BeautifulSoup(html, "html.parser")
# The parsed document is bound to `parsed_html` (the original referenced an
# undefined name here); `.script` is the first <script> tag in the document.
script_data = parsed_html.script
現在,我想從腳本標簽中包含的字符串中提取變量“sequences”、“neg_sequences”、“seqs”和“nseqs”中的信息。
<script type="text/javascript">
//@JSON_VAR data
var data = {
"version": "5.3.3",
"revision": "1667d7719daf2af1693cc039ba463bc4d2304d23",
"release": "Sun Feb 7 15:39:52 2021 -0800",
"program": "CentriMo",
"options": {
"cd": false,
"neg_sequences": true,
"noseq": false,
"mcc": false
},
"seqlen": 101,
"tested": 435,
"alphabet": {
"name": "DNA",
"like": "dna",
"ncore": 4
},
"background": [0.2788, 0.2212, 0.2212, 0.2788],
"sequences": [
"AT1G04100.1_CDS", "AT1G05860.1_CDS", "AT1G13910.1_CDS",
"AT1G21065.1_CDS", "AT1G26190.1_CDS", "AT1G32940.1_CDS",
"AT1G50575.1_CDS", "AT1G55810.1_CDS", "AT1G66430.1_CDS",
"AT1G71430.1_CDS", "AT1G77170.1_CDS", "AT1G78610.1_CDS",
"AT2G02955.1_CDS", "AT2G16280.1_CDS", "AT2G17080.1_CDS",
"AT2G19620.1_CDS", "AT2G19640.1_CDS", "AT2G30840.1_CDS",
"AT2G39450.1_CDS", "AT2G41380.1_CDS", "AT2G42580.1_CDS",
"AT3G01680.1_CDS", "AT3G05680.1_CDS", "AT3G20110.1_CDS",
"AT3G20260.1_CDS", "AT3G21360.1_CDS", "AT3G23070.1_CDS",
"AT3G23590.1_CDS", "AT3G46820.1_CDS", "AT3G48250.1_CDS",
"AT3G61200.1_CDS", "AT4G08510.1_CDS", "AT4G15070.1_CDS",
"AT4G24670.1_CDS", "AT4G25450.1_CDS", "AT4G28600.1_CDS",
"AT4G31910.1_CDS", "AT4G34810.1_CDS", "AT4G35030.3_CDS",
"AT4G37170.1_CDS", "AT4G38630.1_CDS", "AT4G39720.1_CDS",
"AT5G07340.1_CDS", "AT5G12970.1_CDS", "AT5G13470.1_CDS",
"AT5G18950.1_CDS", "AT5G22840.1_CDS", "AT5G25590.1_CDS",
"AT5G27395.1_CDS", "AT5G53370.1_CDS", "AT5G63610.1_CDS",
"AT5G64830.1_CDS", "AT5G64900.1_CDS", "AT5G67620.1_CDS"
],
"neg_sequences": [
"AT1G01600.1_CDS", "AT2G32480.1_CDS", "AT2G41740.1_CDS",
"AT3G19490.1_CDS", "AT3G24030.1_CDS", "AT3G25580.1_CDS",
"AT3G48330.1_CDS", "AT3G59220.1_CDS", "AT4G13340.1_CDS",
"AT4G33590.1_CDS", "AT5G03080.1_CDS", "AT5G23700.1_CDS",
"AT5G41010.1_CDS"
],
"motifs": [
{
"db": 2,
"id": "ath-miR419",
"alt": "MIMAT0001327",
"consensus": "CAACATCCTCAGCATTCATAA",
"len": 21,
"motif_evalue": "0.0e+000",
"motif_nsites": 20,
"n_tested": 40,
"score_threshold": 5,
"url": "http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=MIMAT0001327",
"pwm": [
[0.164036, 0.479478, 0.163749, 0.192738],
[0.479764, 0.163749, 0.192452, 0.164036]
],
"total_sites": 10,
"sites": [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
],
"neg_total_sites": 2,
"neg_sites": [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
],
"seqs": [0, 1, 13, 15, 16, 23, 27, 36, 44, 48],
"neg_seqs": [3, 10],
"peaks": [
{
"center": 0,
"fisher_log_adj_pvalue": 0
}
]
}
]
};
</script>
我試圖將對象轉換為 json 類型的對象,但出現以下錯誤,
import json
# This call raises the JSONDecodeError shown in the traceback: the tag's
# string is JavaScript source (it begins with a `//@JSON_VAR data` comment
# followed by `var data = `), not a bare JSON document.
j_script = json.loads(script_data.string)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python3.7/json/__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "/usr/lib/python3.7/json/decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/usr/lib/python3.7/json/decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 7 (char 7)
提前致謝
附注:我想要解析的完整 HTML 文件的示例可以在此處找到。
編輯:在原帖中,我提到我遇到了縮進錯誤。那是在我嘗試手動編輯 JSON 對象、刪除所有空格和“\n”字符之後發生的。雖然我認為這不會從根本上改變問題,但我仍為這個錯誤致歉。
[更新] 我參照這篇文章中的答案,調整出了如下代碼:
# Split the script text at the first '='; tmp[2] is everything after it,
# i.e. the object literal together with the ';' that ends the statement.
tmp=script_data.string.partition('=')
# Remove every occurrence of the exact sequence ';', newline, space.
# NOTE(review): presumably meant to drop the statement-ending ';' so that
# only JSON text remains; it relies on that exact whitespace following it.
j_tmp=tmp[2].replace(";\n ","")
j_script=json.loads(j_tmp)
第二行有點笨拙(我沒能把另一個答案的做法套用過來),但總體而言,它能解決問題。現在我想獲取“motifs”列表中包含的“seqs”數據。
對上面代碼中第二行的幫助將不勝感激
以下是我能想到的最佳答案。
import json

from bs4 import BeautifulSoup

# Parse the HTML file; the `with` block closes the file handle once parsing
# is done, and naming the parser avoids depending on whichever one happens
# to be installed.
with open("<HTML File>") as html:
    parsed_html = BeautifulSoup(html, "html.parser")
# `.script` is the first <script> tag in the document.
script_data = parsed_html.script

# The tag's text has the form `var data = {...};`.  Split at the first '='
# and keep what follows it: the object literal plus the terminating ';'.
tmp = script_data.string.partition('=')
# Strip surrounding whitespace, then the trailing ';', so that only the JSON
# text remains regardless of how the script happens to be indented.
j_tmp = tmp[2].strip().rstrip(";")
j_script = json.loads(j_tmp)

seq_id = j_script["sequences"]

# Loop over the motif dictionaries looking for "seqs".  If several motifs
# carry the key, `dict_tmp` / `seq_n` end up holding the last one.
for idict in j_script["motifs"]:
    if "seqs" in idict:
        dict_tmp = idict
        seq_n = dict_tmp["seqs"]
代碼仍然需要改進,但我認為它現在可以完成這項工作。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.