简体   繁体   中英

XSLT to extract values from HTML xml

I have an XML document that contains HTML markup.

I need to extract values from the table it produces, specifically the values in columns 2 and 3 of each row.

HTML looks like

[table image]

And xml looks like:

<DIV><DIV><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><TR></TR><TR><TD colspan="1"><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><COLGROUP><COL width="160px"><COL width="122px"><COL width="122px"><COL width="122px"><COL width="122px"></COLGROUP><TR></TR><TR><TD align="LEFT" colspan="5" style="border: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Nutrition</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Typical Values</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">Per 100g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">One tart (125g)</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">%RI*</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">RI*</SPAN></TD></TR></TABLE></TD></TR><TR><TD colspan="1"><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><COLGROUP><COL width="160px"><COL width="122px"><COL width="122px"><COL width="122px"><COL width="122px"></COLGROUP><TR></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Energy</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">1373kJ / 329kcal</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">1717kJ / 411kcal</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">20%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">8400kJ / 2000kcal</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid 
black;padding-left: 3px;"><SPAN style="font-size: inherit;">Fat</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">20.0g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">25.0g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">36%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">70g</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Saturates</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">11.2g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">14.0g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">70%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">20g</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Carbohydrate</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">32.9g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">41.1g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">16%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">260g</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Sugars</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">16.2g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 
122px;"><SPAN style="font-size: inherit;">20.2g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">22%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">90g</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Fibre</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">1.3g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">1.6g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">&nbsp;</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">&nbsp;</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Protein</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">3.9g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">4.9g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">10%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">50g</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;border-bottom: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Salt</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-bottom: 1px solid black;"><SPAN style="font-size: inherit;">0.1g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-bottom: 1px solid black;"><SPAN style="font-size: inherit;">0.1g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-bottom: 1px solid 
black;"><SPAN style="font-size: inherit;">2%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;border-bottom: 1px solid black;"><SPAN style="font-size: inherit;">6g</SPAN></TD></TR></TABLE></TD></TR><TR><TD colspan="1"><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><COLGROUP><COL width="160px"><COL width="122px"><COL width="122px"><COL width="122px"><COL width="122px"></COLGROUP><TR></TR><TR><TD colspan="5" style="border-left: 1px solid black;border-right: 1px solid black;"><SPAN style="font-size: inherit;">Contains 2 servings</SPAN></TD></TR></TABLE></TD></TR><TR><TD colspan="1"><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><COLGROUP><COL width="160px"></COLGROUP><TR></TR><TR><TD colspan="1"><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><TR></TR><TR><TD colspan="1" style="border-left: 1px solid black;border-right: 1px solid black;border-bottom: 1px solid black;padding-left: 3px;"><P><SPAN>* Reference intake of an average adult (8400 kJ / 2000 kcal)</SPAN></P></TD></TR></TABLE></TD></TR></TABLE></TD></TR></TABLE></DIV></DIV>

What I have tried so far (I need the values to be stored in variables in XSLT):

<?xml version="1.0" encoding="UTF-8"?>
<!-- Prints the text content of every SPAN element in the input, one value per line. -->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text" encoding="UTF-8"/>

    <!-- XSLT instructions must carry the xsl: prefix and live inside a template;
         un-prefixed elements directly under xsl:stylesheet are not valid. -->
    <xsl:template match="/">
        <xsl:for-each select="//SPAN">
            <xsl:value-of select="."/>
            <!-- Line feed so consecutive values do not run together. -->
            <xsl:text>&#10;</xsl:text>
        </xsl:for-each>
    </xsl:template>
</xsl:stylesheet>

How can I get the values? Specifically, I would like to know the:

EnergyCol2

EnergyCol3

values, and I would like to have them in variables. How do I know that a particular value belongs to column 2 (or 3) and to which row type (Energy, Fat, etc.) it belongs?

This doesn't strictly answer the question, because I used a regex to parse the HTML/XML instead of XSLT, but it does the job for me. I am calling a Java function from the XSLT.

And the java code:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts nutrition values from the HTML markup of a nutrition table by means of a
 * single regular expression.
 *
 * <p>For each of the rows Energy, Fat, Saturates, Carbohydrate, Sugars, Fibre, Protein
 * and Salt (which must appear in that order) the text of the first two {@code SPAN}
 * elements following the row label is captured: the "per 100" value and the
 * "per serving" value.
 *
 * <p>Usage: call {@link #process(String)} first; if it returns {@code true}, the
 * getters return the values captured from that document.
 *
 * <p>The result of the last {@code process} call is kept in a static field, so this
 * class is not safe for concurrent use from multiple threads.
 */
public class NutrientValues {

    /** Regex fragment that follows a row label and captures the text of the next two SPANs. */
    private static final String NEXT_TWO_SPANS =
            ".*?<SPAN.*?>(.*?)</SPAN>.*?<SPAN.*?>(.*?)</SPAN>";

    /** Full expression: one label-plus-two-values section per row, in table order. */
    private static final String REGEX = String.join(".*?",
            "Energy" + NEXT_TWO_SPANS,
            "Fat" + NEXT_TWO_SPANS,
            "Saturates" + NEXT_TWO_SPANS,
            "Carbohydrate" + NEXT_TWO_SPANS,
            "Sugars" + NEXT_TWO_SPANS,
            "Fibre" + NEXT_TWO_SPANS,
            "Protein" + NEXT_TWO_SPANS,
            "Salt" + NEXT_TWO_SPANS);

    // DOTALL (not MULTILINE) is required: the markup may contain line breaks and '.'
    // must be able to cross them. MULTILINE only changes '^' and '$', which are unused.
    private static final Pattern PATTERN = Pattern.compile(REGEX, Pattern.DOTALL);

    /** Matcher positioned on the last successful match, or {@code null} if there is none. */
    private static Matcher matcher;

    /**
     * Searches the given markup for the nutrition rows and remembers the result for the getters.
     *
     * @param htmldoc the HTML/XML markup containing the nutrition table; may be {@code null}
     * @return {@code true} if all rows were found, {@code false} otherwise (including
     *         {@code null} input)
     */
    public static boolean process(String htmldoc) {
        if (htmldoc == null) {
            matcher = null;
            return false;
        }
        Matcher candidate = PATTERN.matcher(htmldoc);
        // Only keep a matcher that actually matched, so getters never see stale or failed state.
        matcher = candidate.find() ? candidate : null;
        return matcher != null;
    }

    /** @return the Energy value per 100 (first value column) */
    public static String getEnergyPer100() {
        return group(1);
    }

    /** @return the Energy value per serving (second value column) */
    public static String getEnergyPerServ() {
        return group(2);
    }

    /** @return the Fat value per 100 (first value column) */
    public static String getFatPer100() {
        return group(3);
    }

    /** @return the Fat value per serving (second value column) */
    public static String getFatPerServ() {
        return group(4);
    }

    /** @return the Saturates value per 100 (first value column) */
    public static String getSaturatesPer100() {
        return group(5);
    }

    /** @return the Saturates value per serving (second value column) */
    public static String getSaturatesPerServ() {
        return group(6);
    }

    /** @return the Carbohydrate value per 100 (first value column) */
    public static String getCarbohydratePer100() {
        return group(7);
    }

    /** @return the Carbohydrate value per serving (second value column) */
    public static String getCarbohydratePerServ() {
        return group(8);
    }

    /** @return the Sugars value per 100 (first value column) */
    public static String getSugarsPer100() {
        return group(9);
    }

    /** @return the Sugars value per serving (second value column) */
    public static String getSugarsPerServ() {
        return group(10);
    }

    /** @return the Fibre value per 100 (first value column) */
    public static String getFibrePer100() {
        return group(11);
    }

    /** @return the Fibre value per serving (second value column) */
    public static String getFibrePerServ() {
        return group(12);
    }

    /** @return the Protein value per 100 (first value column) */
    public static String getProteinPer100() {
        return group(13);
    }

    /** @return the Protein value per serving (second value column) */
    public static String getProteinPerServ() {
        return group(14);
    }

    /** @return the Salt value per 100 (first value column) */
    public static String getSaltPer100() {
        return group(15);
    }

    /** @return the Salt value per serving (second value column) */
    public static String getSaltPerServ() {
        return group(16);
    }

    /**
     * Returns one capture group of the last successful match.
     *
     * @param index 1-based capture group index
     * @return the captured text
     * @throws IllegalStateException if {@link #process(String)} has not been called or
     *         its last call did not find a match
     */
    private static String group(int index) {
        if (matcher == null) {
            throw new IllegalStateException(
                    "No match available: call process(String) first and check that it returned true");
        }
        return matcher.group(index);
    }
}

Result:

Group 1: 1373kJ / 329kcal
Group 2: 1717kJ / 411kcal
Group 3: 20.0g
Group 4: 25.0g
Group 5: 11.2g
Group 6: 14.0g
Group 7: 32.9g
Group 8: 41.1g
Group 9: 16.2g
Group 10: 20.2g
Group 11: 1.3g
Group 12: 1.6g
Group 13: 3.9g
Group 14: 4.9g
Group 15: 0.1g
Group 16: 0.1g

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM