[英]Extracting element values from HTML using google sheet script
I have an HTML saved in a cell on the google sheet.我有一个 HTML 保存在谷歌表格的一个单元格中。 Now I would like to extract element values from it.
现在我想从中提取元素值。 Can anyone please guide?
任何人都可以请指导吗?
Here is the sample HTML that I am working with:这是我正在使用的示例 HTML:
<div class="test"><a href="/this-is-page-url" class="cc_a_a"><div data-react-toolbox="card" class="new_test"><div style="background-image:url('https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png')" class="new_class" title="this is image"><div class="last"></div></div><div class="new_2"><div class="title_test"><div class="card_title">Title Goes Here</div></div></div><div class="for_text"><p>test goes here</p></div><div class="for_date"><p>Jan 1, 2020</p></div></div></a></div>
I would like to extract:我想提取:
Here is the sample code I am using to try to extract the href value.我试图提取 href 值的示例代码。 Unfortunately, I have no idea how to extract the other elements.
不幸的是,不知道我该怎么做其他元素。
// Attempt to read the href of every <a> element in an HTML string by building a DOM element.
// Placeholder standing in for the HTML string shown above.
var variable_for_cell_with_HTML = "MY_HTML_GOES_HERE_FROM_ABOVE";
var myurl = variable_for_cell_with_HTML;
// `document` is referenced here; this is the identifier named in the ReferenceError quoted below.
var doc = document.createElement("html");
// NOTE(review): `rawHTML` is not declared anywhere in this snippet; the HTML string is held in `myurl`.
doc.innerHTML = rawHTML;
var links = doc.getElementsByTagName("a")
var urls = [];
for (var i=0; i<links.length; i++) {
// NOTE(review): Array.prototype.push returns the new array length, so B7 is set to a count rather than the href.
SpreadsheetApp.getActive().getSheetByName('mysheet').getRange('B7').setValue(urls.push(links[i].getAttribute("href")));
}
Getting ERROR得到错误
ReferenceError: document is not defined
If you're trying to extract specific HTML elements from a given URL, you can follow this general format:如果您尝试从给定的 URL 中提取特定的 HTML 元素,则可以遵循以下通用格式:
=importxml(A8,"//div[@class='class of desired div']//h3[@class='class of desired h3 element']")
Where A8 is a cell with the web link to the HTML, and where the div
or h3
are the tags encompassing your desired result from the page.其中 A8 是一个单元格,其中 web 链接到 HTML,其中
div
或h3
是包含页面所需结果的标签。 This is just one example extracting a specific h3
from a specific div
, but you could leave off the [@class==]
stuff to just return all the h3
elements within the prior div
.这只是从特定
div
中提取特定h3
的一个示例,但您可以省略[@class==]
内容,只返回前一个div
中的所有h3
元素。
I'm sure this could be applied to your specific case as well.我相信这也适用于您的具体情况。
It's only HTML after it's loaded into the browser.加载到浏览器后,它只是 html。 Before that it's just a string.
在此之前它只是一个字符串。 Use standard javascript string methods
使用标准的 javascript 字符串方法
Something like this regex will get you close to the href URL:像这样的正则表达式会让你接近href:url:
/href="([^"]{1,})"/g
This will get you close to the background URL:这将使您接近背景 url:
https:\/\/[^&]{1,}
Regex Tester正则表达式测试器
This is the html file for my regex tester.这是我的正则表达式测试器的 html 文件。 I wrote it a long time ago so it's probably a bit neophyte....ish?
我很久以前写的,所以它可能有点新手....ish?
<!DOCTYPE html>
<html>
<head>
<base target="_top">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
</head>
<script>
// On page load, fetch the last saved text, pattern and flags from the server
// and use them to pre-fill the form.
$(function(){
  // Ticks the checkbox when the saved value is 'yes' (case-insensitive), clears it otherwise.
  function applyFlag(selector, saved){
    var isOn=saved.toLowerCase()==='yes';
    $(selector).prop('checked',isOn);
  }

  // Success handler: rObj carries the text, pattern, g, i and m values returned by the server.
  function populate(rObj){
    $('#text').val(rObj.text);
    $('#pattern').val(rObj.pattern);
    $('#results').css('background','white');
    applyFlag('#set_g',rObj.g);
    applyFlag('#set_i',rObj.i);
    applyFlag('#set_m',rObj.m);
  }

  google.script.run
    .withSuccessHandler(populate)
    .getLastTextPatternFlags();
});
/**
 * Runs the regular expression from the #pattern box (with the flags returned by
 * getFlags()) against the contents of the #text box and writes the outcome to #results.
 *
 * The #results box is turned yellow while working and white again when finished.
 * - Invalid pattern: the error is logged to the console and #results shows
 *   'Check Error in Console Log'; no search is attempted.
 * - No match: #results shows 'No Results'.
 * - Otherwise: one line per entry of the match array, formatted as 'result[i]= value'.
 */
function findData(){
  var $results=$('#results');
  $results.css('background','yellow');
  $results.val('');
  var text=$('#text').val();
  var pattern=$('#pattern').val();
  var flags=getFlags();
  var regex;
  try{
    regex=new RegExp(pattern,flags);
  }
  catch(e){
    console.error(e);
    $results.css('background','white');
    $results.val('Check Error in Console Log');
    // Stop here: continuing with an undefined regex would match the empty
    // string and overwrite the error message above.
    return;
  }
  // String.prototype.match returns null when nothing matches.
  var result=text.match(regex);
  var rsltLog='';
  if(result){
    var lines=[];
    for(var i=0;i<result.length;i++){
      lines.push('result[' + i + ']= ' + result[i]);
    }
    rsltLog=lines.join('\n');
  }
  // Guard the length lookup so a null result does not throw before the UI is updated.
  var count=result ? result.length : 0;
  console.log('module: %s pattern: %s regex: %s flags: %s result: %s length: %s','findData()',pattern,regex,flags,rsltLog,count);
  $results.val(result ? rsltLog : "No Results");
  $results.css('background','white');
}
/**
 * Builds the RegExp flag string from the g / i / m checkboxes.
 * @returns {string} The letters of the checked boxes, in the order g, i, m.
 */
function getFlags(){
  var boxes=[
    {flag:'g',checked:$('#set_g').is(':checked')},
    {flag:'i',checked:$('#set_i').is(':checked')},
    {flag:'m',checked:$('#set_m').is(':checked')}
  ];
  return boxes
    .filter(function(box){return box.checked;})
    .map(function(box){return box.flag;})
    .join('');
}
/**
 * Sends the contents of the #text box to the server-side saveText function.
 * The box is yellow while the call is pending and white once it succeeds.
 */
function saveText(){
  var markSaved=function(){
    $('#text').css('background','white');
  };
  $('#text').css('background','yellow');
  var currentText=$('#text').val();
  google.script.run.withSuccessHandler(markSaved).saveText(currentText);
}
/**
 * Sends the contents of the #pattern box to the server-side savePattern function.
 * The box is yellow while the call is pending and white once it succeeds.
 */
function savePattern(){
  var markSaved=function(){
    $('#pattern').css('background','white');
  };
  $('#pattern').css('background','yellow');
  var currentPattern=$('#pattern').val();
  google.script.run.withSuccessHandler(markSaved).savePattern(currentPattern);
}
/**
 * Sends the state of the g / i / m checkboxes to the server-side saveFlags function
 * as an object whose g, i and m properties are each 'yes' or 'no'.
 * The #results box is yellow while the call is pending and white once it succeeds.
 */
function saveFlags(){
  $('#results').css('background','yellow');
  var flagObj={
    g:$('#set_g').is(':checked') ? 'yes' : 'no',
    i:$('#set_i').is(':checked') ? 'yes' : 'no',
    m:$('#set_m').is(':checked') ? 'yes' : 'no'
  };
  var markSaved=function(){
    $('#results').css('background','white');
  };
  google.script.run.withSuccessHandler(markSaved).saveFlags(flagObj);
}
console.log('My Code');
</script>
<style>
.btns{margin:2px 2px 2px 0;}
#container{width:100%;}
</style>
<body>
<div id='container'>
TEXT <input class="btns" type="button" value="Save Text" onClick="saveText();" />
<br /><textarea id="text" placeholder="Enter the text to be searched" rows="4" cols="60"></textarea>
<br />PATTERN <input class="btns" type="button" value="Save Pattern" onClick="savePattern();" />
<br /><textarea id="pattern" placeholder="Enter the regex search expression" rows="4" cols="60"></textarea>
<br />RESULTS
<br /><textarea id="results" rows="4" cols="60"></textarea>
<br /><input type="button" value="Search" onClick="findData();" /> <input class="hostcontrol" type="button" value="Close" onClick="google.script.host.close();" />
g <input id="set_g" type="checkbox" />
i <input id="set_i" type="checkbox" />
m <input id="set_m" type="checkbox" />
<input type="button" value="Save Flags" onClick="saveFlags();" />
<p>Don't leave extra carriage returns in search pattern textbox.</p>
</div>
</body>
</html>
And this is the GS code for it:这是它的GS代码:
/**
 * Adds a 'My Tools' menu to the spreadsheet UI with a single 'Regex Tool'
 * item that invokes showRegexDialog.
 */
function onOpen(){
  var toolsMenu=SpreadsheetApp.getUi().createMenu('My Tools');
  var withRegexItem=toolsMenu.addItem('Regex Tool', 'showRegexDialog');
  withRegexItem.addToUi();
}
/**
 * Opens the 'RegexTester' HTML file as an 800x500 modeless dialog
 * titled 'Regex Tester'.
 */
function showRegexDialog(){
  var dialogHtml=HtmlService.createHtmlOutputFromFile('RegexTester')
    .setWidth(800)
    .setHeight(500);
  var spreadsheetUi=SpreadsheetApp.getUi();
  spreadsheetUi.showModelessDialog(dialogHtml, 'Regex Tester');
}
/**
 * Reads the data range of the 'Input' sheet and turns it into an object,
 * using the first column of each row as the key and the second as the value.
 * The object is also written to the Apps Script log.
 * @returns {Object} Map of first-column values to second-column values.
 */
function getLastTextPatternFlags(){
  var rows=SpreadsheetApp.getActive()
    .getSheetByName('Input')
    .getDataRange()
    .getValues();
  var saved={};
  rows.forEach(function(row){
    saved[row[0]]=row[1];
  });
  Logger.log(saved);
  return saved;
}
/**
 * Stores txt in the second column of every 'Input' sheet row whose first
 * column equals 'text', then writes the whole data range back.
 * @param {string} txt Text to store.
 * @returns {boolean} Always true.
 */
function saveText(txt){
  var dataRange=SpreadsheetApp.getActive().getSheetByName('Input').getDataRange();
  var rows=dataRange.getValues();
  rows.forEach(function(row){
    if(row[0]=='text'){
      row[1]=txt;
    }
  });
  dataRange.setValues(rows);
  return true;
}
/**
 * Stores txt in the second column of every 'Input' sheet row whose first
 * column equals 'pattern', then writes the whole data range back.
 * @param {string} txt Pattern to store.
 * @returns {boolean} Always true.
 */
function savePattern(txt){
  var dataRange=SpreadsheetApp.getActive().getSheetByName('Input').getDataRange();
  var rows=dataRange.getValues();
  rows.forEach(function(row){
    if(row[0]=='pattern'){
      row[1]=txt;
    }
  });
  dataRange.setValues(rows);
  return true;
}
/**
 * Stores the flag settings in the 'Input' sheet. For every row whose first
 * column, lower-cased, is 'g', 'i' or 'm', the second column is set to the
 * matching property of flagObj; the whole data range is then written back.
 * @param {Object} flagObj Object with g, i and m properties.
 * @returns {boolean} Always true.
 */
function saveFlags(flagObj){
  var flagNames=['g','i','m'];
  var dataRange=SpreadsheetApp.getActive().getSheetByName('Input').getDataRange();
  var rows=dataRange.getValues();
  rows.forEach(function(row){
    var key=String(row[0]).toLowerCase();
    if(flagNames.indexOf(key)!==-1){
      row[1]=flagObj[key];
    }
  });
  dataRange.setValues(rows);
  return true;
}
/**
 * Builds the 'RegexTester' page for a GET request. A style rule that hides
 * elements with class 'hostcontrol' is appended, and the X-Frame-Options
 * mode is set to ALLOWALL.
 * @returns {Object} The value returned by setXFrameOptionsMode on the HTML output.
 */
function doGet(){
  var hideHostControls='<style>.hostcontrol{display:none;}</style>';
  var page=HtmlService.createHtmlOutputFromFile('RegexTester');
  page.append(hideHostControls);
  var framePolicy=HtmlService.XFrameOptionsMode.ALLOWALL;
  return page.setXFrameOptionsMode(framePolicy);
}
Pour it into a dialog and play with it..把它倒进一个对话框里玩吧..
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.