[英]Pull variable value from javascript source using BeautifulSoup4 Python
I'm newbie in python programming. 我是python编程的新手。 I'm learning beautifulsoup to scrap website. 我正在学习beautifulsoup来废弃网站。
I want to extract and store the value of "stream" to my variable. 我想提取“流”的值并将其存储到变量中。
My Python code as follows : 我的Python代码如下:
import bs4 as bs #Importing BeautifulSoup4 Python Library.
import urllib.request
import requests
import json
import re
headers = {'User-Agent':'Mozilla/5.0'}
url = "http://thoptv.com/partners/mhdTVlive/Core.php?level=1200&channel=Dsports_HD"
page = requests.get(url)
soup = bs.BeautifulSoup(page.text,"html.parser")
pattern = re.compile('var stream = (.*?);')
scripts = soup.find_all('script')
for script in scripts:
if(pattern.match(str(script.string))):
data = pattern.match(script.string)
links = json.loads(data.groups()[0])
print(links)
This is the source javascript code to get the stream url value. 这是获取流url值的源JavaScript代码。
https://content.jwplatform.com/libraries/oncyToRO.js'>if( navigator.userAgent.match(/android/i)|| navigator.userAgent.match(/webOS/i)|| navigator.userAgent.match(/iPhone/i)|| navigator.userAgent.match(/iPad/i)|| navigator.userAgent.match(/iPod/i)|| navigator.userAgent.match(/BlackBerry/i)|| navigator.userAgent.match(/Windows Phone/i)) {var stream = " http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs . AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw ";}else{var stream = " http://hd.simiptv.com:8080//index.m3u8?key=VIoVSsGRLRouHWGNo1epzX&exp=932213423&domain=thoptv.stream&id=461 ";}jwplayer("THOPTVPlayer").setup({"title": 'thoptv.stream',"stretching":"exactfit","width": "100%","file": none,"height": "100%","skin": "seven","autostart": "true","logo": {"file":" https://i.imgur.com/EprI2uu.png ","margin":"-0", "position":"top-left","hide":"false","lin https://content.jwplatform.com/libraries/oncyToRO.js'> if(navigator.userAgent.match(/ android / i)|| navigator.userAgent.match(/ webOS / i)|| navigator.userAgent.match (/ iPhone / i)|| navigator.userAgent.match(/ iPad / i)|| navigator.userAgent.match(/ iPod / i)|| navigator.userAgent.match(/ BlackBerry / i)|| navigator.userAgent .match(/ Windows手机/ I)){风险流=“ http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs 。AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw “;} else {var stream =” http://hd.simiptv.com:8080//index.m3u8?key=VIoVSsGRLRouHWGNo1epzX&==player213 “) ({“ title”:'thoptv.stream',“ stretching”:“ exfitfit”,“ width”:“ 100%”,“ file”:无,“ height”:“ 100%”,“ skin”:“七个“,” autostart“:” true“,”徽标“:{” file“:” https://i.imgur.com/EprI2uu.png “,” margin“:”-0“,” position“:” top -left“,” hide“:” false“,” lin k":" http://mhdtvlive.co.in "},"androidhls": true,});jwplayer("THOPTVPlayer").onError(function(){jwplayer().load({file:" http://content.jwplatform.com/videos/7RtXk3vl-52qL9xLP.mp4 ",image:" http://content.jwplatform.com/thumbs/7RtXk3vl-480.jpg "});jwplayer().play();});jwplayer("THOPTVPlayer").onComplete(function(){window.location = window.location.href;});jwplayer("THOPTVPlayer").onPlay(function(){clearTimeout(theTimeout);}); k“:” http://mhdtvlive.co.in “},” androidhls“:true,}); jwplayer(” THOPTVPlayer“)。onError(function(){jwplayer()。load({file:” http: //content.jwplatform.com/videos/7RtXk3vl-52qL9xLP.mp4“,image :” http://content.jwplatform.com/thumbs/7RtXk3vl-480.jpg “}); jwplayer()。play();} ); jwplayer(“ THOPTVPlayer”)。onComplete(function(){window.location = window.location.href;}); jwplayer(“ THOPTVPlayer”)。onPlay(function(){clearTimeout(theTimeout);});;
I need to extract the url from stream. 我需要从流中提取URL。
var stream = " http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs . AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw ";} VAR流= “ http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw。”;}
Rather then thinking complicated with regex, if the link is the only dynamically changing part, you can split
the string with some known separating tokens. 如果链接是唯一动态变化的部分,则不必考虑正则表达式的复杂性,而可以使用一些已知的分隔标记来split
字符串。
x = """
https://content.jwplatform.com/libraries/oncyToRO.js'>if( navigator.userAgent.match(/android/i)|| navigator.userAgent.match(/webOS/i)|| navigator.userAgent.match(/iPhone/i)|| navigator.userAgent.match(/iPad/i)|| navigator.userAgent.match(/iPod/i)|| navigator.userAgent.match(/BlackBerry/i)|| navigator.userAgent.match(/Windows Phone/i)) {var stream = "http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw";}else{var stream = "http://hd.simiptv.com:8080//index.m3u8?key=VIoVSsGRLRouHWGNo1epzX&exp=932213423&domain=thoptv.stream&id=461";}jwplayer("THOPTVPlayer").setup({"title": 'thoptv.stream',"stretching":"exactfit","width": "100%","file": none,"height": "100%","skin": "seven","autostart": "true","logo": {"file":"https://i.imgur.com/EprI2uu.png","margin":"-0", "position":"top-left","hide":"false","link":"http://mhdtvlive.co.in"},"androidhls": true,});jwplayer("THOPTVPlayer").onError(function(){jwplayer().load({file:"http://content.jwplatform.com/videos/7RtXk3vl-52qL9xLP.mp4",image:"http://content.jwplatform.com/thumbs/7RtXk3vl-480.jpg"});jwplayer().play();});jwplayer("THOPTVPlayer").onComplete(function(){window.location = window.location.href;});jwplayer("THOPTVPlayer").onPlay(function(){clearTimeout(theTimeout);});
"""
left1, right1 = x.split("Phone/i)) {var stream =")
left2, right2 = right1.split(";}else")
print(left2)
# "http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=ibxIPxc6rkq1yIUJb4RlEV&pxe=1504146411&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw"
pattern.match()
matches the pattern from the beginning of the string. pattern.match()
从字符串的开头匹配模式。 Try using pattern.search()
instead - it will match anywhere within the string. 尝试改用pattern.search()
-它会匹配字符串中的任何位置。
Change your for loop to this: 将您的for循环更改为此:
for script in scripts:
data = pattern.search(script.text)
if data is not None:
stream_url = data.groups()[0]
print(stream_url)
You can also get rid of the surrounding quotes by changing the regex pattern to: 您还可以通过将正则表达式模式更改为以下内容来消除周围的引号:
pattern = re.compile('var stream = "(.*?)";')
so that the double quotes are not included in the group. 因此,双引号不会包含在该组中。
You might also have noticed that there are two possible stream
variables depending on the accessing user agent. 您可能还已经注意到,取决于正在访问的用户代理,有两个可能的stream
变量。 For tablet like devices the first would be appropriate, while all other user agents should use the second stream
. 对于类似平板电脑的设备,第一个将是合适的,而所有其他用户代理应使用第二个stream
。 You can use pattern.findall()
to get all of them: 您可以使用pattern.findall()
来获取所有这些信息:
>>> pattern.findall(script.text)
['"http://ssrigcdnems01.cdnsrv.jio.com/jiotv.live.cdn.jio.com/Dsports_HD/Dsports_HD_800.m3u8?jct=LEurobVVelOhbzOZ6EkTwr&pxe=1571716053&st=AQIC5wM2LY4SfczRaEwgGl4Dyvly_3HihdlD_Oduojk5Kxs.*AAJTSQACMDIAAlNLABQtNjUxNDEwODczODgxNzkyMzg5OQACUzEAAjYw*"', '"http://hd.simiptv.com:8080//index.m3u8?key=vaERnLJswnWXM8THmfvDq5&exp=944825312&domain=thoptv.stream&id=461"']
this code works for me 该代码对我有用
import bs4 as bs #Importing BeautifulSoup4 Python Library.
import urllib.request
import requests
import json
headers = {'User-Agent':'Mozilla/5.0'}
url = "http://thoptv.com/partners/mhdTVlive/Core.php?
level=1200&channel=Dsports_HD"
page = requests.get(url)
soup = bs.BeautifulSoup(page.text,"html.parser")
scripts = soup.find_all('script')
out = list()
for c, i in enumerate(scripts): #go over list
text = i.text
if(text[:2] == "if"): #if the (if) comes first
for count, t in enumerate(text): # then we have reached the correct item in the list
if text[count] == "{" and text[count + 1] == "v" and text[count + 5] == "s": # and if this is here that stream is set
tmp = text[count:] # add this to the tmp varible
break # and end
co = 0
for m in tmp: #loop over the results from prev. result
if m == "\"" and co == 0: #if string is starting
co = 1 #set count to "true" 1
elif m == "\"" and co == 1: # if it is ending stop
print(''.join(out)) #results
break
elif co == 1:
# as long as we are looping over the rigth string
out.append(m) #add to out list
pass
result = ''.join(out) #set result
it basicly filters the string manuely. 它基本上是手动过滤字符串。
but if we use user1767754 method (brilliant by the way) we will end up something like this: 但是,如果我们使用user1767754方法(顺便说一句),我们将得到如下结果:
import bs4 as bs #Importing BeautifulSoup4 Python Library.
import urllib.request
import requests
import json
headers = {'User-Agent':'Mozilla/5.0'}
url = "http://thoptv.com/partners/mhdTVlive/Core.php?level=1200&channel=Dsports_HD"
page = requests.get(url)
soup = bs.BeautifulSoup(page.text,"html.parser")
scripts = soup.find_all('script')
x = scripts[3].text
left1, right1 = x.split("Phone/i)) {var stream =")
left2, right2 = right1.split(";}else")
print(left2)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.