With Python 3.x, I'm trying to get a list of values that are in what looks to be a JSON variable.
Here's some of the HTML:
<script type="text/javascript">
var BandData = {
id: 171185318,
name: "MASS",
fan_email: null,
account_id: 365569831,
has_discounts: null,
image_id: 39000212
};
var EmbedData = {
tralbum_param: { name: "album", value: 28473799 },
show_campaign: null,
embed_info: {"exclusive_embeddable":null,"public_embeddable":"01 Dec 2011 06:09:19 GMT","no_track_preorder":false,"item_public":true}
};
var FanData = {
logged_in: false,
name: null,
image_id: null,
ip_country_code: null
};
var TralbumData = {
current: {"require_email_0":1,"new_date":"18 Jan 2017 22:59:06 GMT"},
is_preorder: null,
album_is_preorder: null,
album_release_date: "01 Dec 2017 00:00:00 GMT",
preorder_count: null,
hasAudio: true,
art_id: 3862222,
trackinfo: [{"video_featured":null,"has_lyrics":false,"file":{"mp3-128":"https://t4.bcbits.com/stream/064bc3d8bb5/mp3-128/35322674"},"is_capped":null,"sizeof_lyrics":0,"duration":143.244,"encodings_id":830008708},{"video_featured":null,"has_lyrics":false,"license_type":0}],
playing_from: "album page",
featured_track_id: 8612194,
};
Specifically, within `TralbumData`, I'm trying to get the URLs within `mp3-128` within `trackinfo`.
It's tricky for me. It looks like JSON data, but I can't quite get that to work.
So far, I'm able to at least isolate `trackinfo` in the `TralbumData` variable with a really kludgy function, but I can't quite get any further from there. Here's what I have so far to find `trackinfo` and then get the URLs within it:
def get_HTML(url):
    """Download *url* and return the ``TralbumData`` object literal as lines.

    The page is decoded as UTF-8, the text from the ``{`` following
    ``TralbumData = `` up to the first ``};`` is extracted, and that text is
    split on newlines.

    :type url: str
    :rtype: list[str]
    :raises ValueError: if the page contains no ``TralbumData = {...};`` block
    """
    # The context manager closes the connection even if reading or decoding
    # raises.
    with urllib.request.urlopen(url) as response:
        site_html = response.read().decode('utf8')

    # Non-greedy, so the match ends at the first "};" after the opening brace;
    # DOTALL lets "." run across the newlines inside the object literal.
    matches = re.search(r'TralbumData = ({.*?});', site_html, re.DOTALL)
    if matches is None:
        raise ValueError('no TralbumData block found in %s' % url)
    return matches.group(1).split("\n")
def get_trackinfo(data):
    """Print the ``trackinfo`` line of *data*, cut up around "mp3-128".

    Each entry of *data* is split on ":"; for the entry whose first field is
    ``trackinfo``, every field is split again on the text "mp3-128" and the
    resulting list is printed.

    Known limitation: splitting on ":" also cuts every URL apart right after
    its scheme, so this prints fragments rather than complete URLs.

    :type data: list[str]
    :rtype: None
    """
    for entry in data:
        fields = entry.split(":")
        if fields[0].strip() != "trackinfo":
            continue
        # A separate name for the inner pieces keeps the list being iterated
        # from being rebound mid-loop.
        for field in fields:
            print(field.split("mp3-128"))
This doesn't work because it splits on `:`, which also cuts each URL apart right after the `https` part.
I'd think there's a way (I've looked around, and the answers to similar questions here on SO get close, but not quite there) to do, say, `url = my_html['TralbumData']['trackinfo']['mp3-128']` or something similar.
Here's my solution: 1. The `get_var` function does the initial parsing, so that the JSON functions can then be used. 2. Apply `json.loads(var)` and get access to the JSON elements.
import re
import json
# Sample page fragment used as input below: four JavaScript object literals
# (BandData, EmbedData, FanData, TralbumData), each closed by a "};" line.
text = """
<script type="text/javascript">
var BandData = {
id: 171185318,
name: "MASS",
fan_email: null,
account_id: 365569831,
has_discounts: null,
image_id: 39000212
};
var EmbedData = {
tralbum_param: { name: "album", value: 28473799 },
show_campaign: null,
embed_info: {"exclusive_embeddable":null,"public_embeddable":"01 Dec 2011 06:09:19 GMT","no_track_preorder":false,"item_public":true}
};
var FanData = {
logged_in: false,
name: null,
image_id: null,
ip_country_code: null
};
var TralbumData = {
current: {"require_email_0":1,"new_date":"18 Jan 2017 22:59:06 GMT"},
is_preorder: null,
album_is_preorder: null,
album_release_date: "01 Dec 2017 00:00:00 GMT",
preorder_count: null,
hasAudio: true,
art_id: 3862222,
trackinfo: [{"video_featured":null,"has_lyrics":false,"file":{"mp3-128":"https://t4.bcbits.com/stream/064bc3d8bb5/mp3-128/35322674"},"is_capped":null,"sizeof_lyrics":0,"duration":143.244,"encodings_id":830008708},{"video_featured":null,"has_lyrics":false,"license_type":0}],
playing_from: "album page",
featured_track_id: 8612194,
};
"""
def get_var(text, var):
    """Extract the JavaScript object literal assigned to *var* and parse it.

    The block is expected to start on a line of the form ``var NAME = {`` and
    to end on a line starting with ``};``, with exactly one top-level
    ``key: value`` item per line in between. Unquoted top-level keys are
    wrapped in double quotes and a trailing comma after the last item is
    dropped, so that the result can be handed to ``json.loads``. Values are
    passed through untouched, so they must already be valid JSON (nested
    objects with unquoted keys are not supported).

    :type text: str
    :type var: str
    :rtype: dict
    :raises ValueError: if *var* is not found, its block is never closed, a
        line has no ``:`` separator, or the rebuilt text is not valid JSON
        (``json.JSONDecodeError`` is a ``ValueError`` subclass)
    """
    # Escape the name so it is matched literally, and allow any amount of
    # whitespace (including none) around "=".
    header = re.compile(r'var\s+' + re.escape(var.strip()) + r'\s*=\s*\{')

    items = []
    found = False
    closed = False
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        if not found:
            found = header.match(line) is not None
            continue
        if line.startswith('};'):
            closed = True
            break
        # Split on the first ":" only; values (URLs, dates) contain more.
        key, colon, value = line.partition(':')
        if not colon:
            raise ValueError('cannot parse line in %r: %r' % (var, line))
        key = key.strip()
        if not key.startswith('"'):
            key = '"' + key
        if not key.endswith('"'):
            key += '"'
        items.append(key + ':' + value)

    if not found:
        raise ValueError('variable %r not found' % var)
    if not closed:
        raise ValueError('block for variable %r is not terminated by "};"' % var)

    body = ''.join(items)
    # JavaScript tolerates a trailing comma after the last item; JSON does not.
    if body.endswith(','):
        body = body[:-1]
    return json.loads('{' + body + '}')
# Print the stream URL of every track. Tracks that carry no "file" entry
# (like the second one in the sample text) are skipped rather than raising.
var = get_var(text, 'TralbumData')
for track in var['trackinfo']:
    url = (track.get('file') or {}).get('mp3-128')
    if url:
        print(url)
Output:
https://t4.bcbits.com/stream/064bc3d8bb5/mp3-128/35322674
Here is a relatively straightforward solution using `json`:
import re, json, pprint, urllib.request
# Matches the whole "var TralbumData = { ... };" statement. The opening must
# begin a line (leading whitespace allowed) and the closing "};" must sit at
# the very start of a line. Group 1 is everything between the braces.
# DOTALL lets "." span newlines, MULTILINE makes "^" match at every line
# start, and VERBOSE ignores the whitespace inside the pattern itself.
regex_data = re.compile(r"""
^\s*var\s+TralbumData\s*=\s*\{(.*?)^\};
""", re.DOTALL | re.MULTILINE | re.VERBOSE)
# Matches one "key: value" line. Group 1 is the optional quote around the key
# (the same quote must close it), group 2 is the key itself (a letter followed
# by letters, digits or underscores, case-insensitive), and group 3 is the
# value without surrounding whitespace or a trailing comma.
regex_item = re.compile(r"""
^\s*([\'"]?)([a-z][a-z0-9_]*)\1\s*:\s*(.+?)\s*,?\s*$
""", re.IGNORECASE | re.VERBOSE)
def scrape(url):
    """Download *url* and return the top-level items of ``TralbumData``.

    The page is decoded as UTF-8 and searched with ``regex_data``. Every line
    of the object body that ``regex_item`` recognises as ``key: value`` is
    decoded with ``json.loads`` and stored under its key.

    Parsing is best-effort: lines that are not ``key: value`` items, and
    values that are not valid JSON, are skipped. If the page has no
    ``TralbumData`` block at all, an empty dict is returned.

    :type url: str
    :rtype: dict
    """
    # The context manager closes the connection even if reading or decoding
    # raises.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf8')

    found = regex_data.search(html)
    if found is None:
        return {}

    result = {}
    # Group 1 is only the text between the braces, so the "var ... = {" header
    # and the closing "};" are never offered to the item pattern.
    for line in found.group(1).splitlines():
        item = regex_item.match(line)
        if item is None:
            continue
        key, value = item.group(2, 3)
        try:
            result[key] = json.loads(value)
        except json.JSONDecodeError:
            # Deliberate: a value that is not plain JSON (for example a line
            # ending in a JavaScript comment) is left out of the result.
            continue
    return result
# Example run: fetch a release page (needs network access) and pretty-print
# the dict that scrape() returns for it.
tralbumdata = scrape('https://studiomdhr.bandcamp.com/releases')
pprint.pprint(tralbumdata)
This assumes that the layout of the `TralbumData` object in the JavaScript code has each of its top-level `key: value` items on a separate line. It also assumes that all lower-level JavaScript objects have string keys, as this is required by the JSON format. (Note that lines ending in a comment cannot be parsed, because JSON doesn't support comments at all.)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.