XPath的简单学习


=Start=

缘由:

趁着用Java解析XML时学习一下XPath的语法,方便后面有需要的时候参考。

正文:

参考解答:
XPath常用语法对照表
表达式说明
//表示返回XML文档中的所有符合查找条件的元素,而忽略文档中元素的位置级别
/表示绝对路径
News/Links/name表示相对路径
*表示所有元素
or
and
其它表达式=,!=,<,>,>=,<=
text()文本(XPath区分大小写,函数名必须小写)
node()节点(XPath区分大小写,函数名必须小写)
number last()last 函数返回一个数字,该数字等于从表达式求值上下文中的上下文大小(即返回节点个数)
number position()position函数返回一个数字,该数字等于从表达式求值上下文中的上下文位置(即当前位置)
number count(node-set)count 函数返回在参数node-set中节点的个数。
boolean not(boolean)如果参数为假 not 函数返回真,否则返回假。
number number(object?)number 函数参数依下列各项转换成数字
//*获得所有节点
../*获得当前节点的父节点的所有节点
//Links获得所有的Links节点
//Links[name="网易"]获得子节点name的文本为“网易”的所有Links节点
//Links[@id="1"]获得属性id=1的所有Links节点
//*[name="新浪"]获得子节点name的文本为“新浪”的所有节点
//*[@id="1"]获得属性id=1的所有节点
//*[@id]获得存在属性id的所有节点
//*[name]获得存在子节点name的所有节点
//Links[序号]获得返回的N个Links节点中的第序号个节点
//Links[position()=1 or position()=2]获得返回的N个Links节点中的第一个和第二个节点(注意不能写成//Links[1 or 2]:谓词中的“1 or 2”会被当作布尔表达式求值,结果恒为真,等于没有过滤)
//*[name="网易" and @id="1"]获得所有子节点name的文本为“网易”且自己的属性id="1"的节点
//text()选择所有的正文节点(即叶子节点)
//Links[position()=last()]获得返回的N个Links节点中的最后一个节点等同于//Links[last()]
//*[contains(name,"新")]获得子节点name的文本中包含“新”的所有节点
//Links[1]获得返回的N个Links节点中的第一个节点
//Links[1]/name[1]获得第一个Links的第一个name节点
//Links/name获得所有Links节点下的所有name节点
//*[@id>1]获得属性ID>1的所有节点
//*[number(@id)+1>1]获得属性id的值加1>1的所有节点
//*[number(text())>1]获得文本节点的文本值大于1的所有节点
//*[(number(text()) mod 2)=1]获得文本节点的文本值为奇数的所有节点
当节点/元素名称中包含冒号(:)时该怎么处理?

可以考虑用name()函数直接按带前缀的完整节点名称进行匹配:

//*[name()='media:thumbnail']

或者考虑用local-name()函数只匹配冒号后面的本地名称(即忽略namespace前缀),先做个粗略的过滤:

/item/*[local-name()='thumbnail']
Java中使用XPath的样例
package com.example;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import java.io.File;
import java.io.StringReader;

/**
 * Small demo of DOM parsing and XPath evaluation using the JAXP APIs bundled with the JDK.
 *
 * <p>{@link #main(String[])} parses the sample document from {@link #getXMLString()},
 * prints a summary of the root element, walks the {@code p:sp} elements, and finally
 * evaluates an XPath expression against the document.
 *
 * <p>The parser factory is left at its default (namespace-unaware) setting, so a
 * prefixed name such as {@code p:sp} is handled as a plain element name.
 *
 * @author ixyzero
 * Created on 2022-05-08
 */
public class opXml2 {

    /**
     * Parser feature that rejects any document containing a DOCTYPE declaration.
     * Turning it on blocks XXE and entity-expansion attacks at the source.
     */
    private static final String DISALLOW_DOCTYPE_FEATURE =
            "http://apache.org/xml/features/disallow-doctype-decl";

    /**
     * Runs the demo against the built-in sample document and prints the results to
     * standard output. Any failure is reported via its stack trace.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            Document document = loadXMLFromString(getXMLString());

            printRootSummary(document.getDocumentElement());
            printElementsByTagName(document, "p:sp");

            // XPath notes for the sample document (evaluated with the document as context):
            //
            //   //*                      -> every element
            //   //*[@cx]                 -> every element that has a cx attribute
            //   //*[@cx="1"]             -> every element whose cx attribute equals 1
            //   //*[name()='p:sp']       -> every p:sp element
            //   //*[name()='p:sp'][1]    -> the first p:sp element
            //   //*[local-name()='sp']   -> elements whose local name is sp, whatever the prefix
            //   //element_name           -> every element_name element; enough when the name
            //                               contains no colon (:)
            //   //book/title             -> every title directly under any book
            //
            //   book/title               -> 0 nodes
            //   /book/title              -> 0 nodes
            //   //book/title             -> 4 nodes
            //   inventory/book/title     -> 3 nodes
            //   /inventory/book/title    -> 3 nodes
            printXPathMatches(document, "/inventory/book/title");

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the root element, its name, its attribute count, the numeric value of
     * {@link Node#ELEMENT_NODE}, and then the name and node type of each direct child.
     *
     * @param root the document element to describe
     */
    private static void printRootSummary(Element root) {
        System.out.println(root);
        System.out.println(root.getNodeName());
        System.out.println(root.getAttributes().getLength());
        System.out.println(Node.ELEMENT_NODE); // ELEMENT_NODE has the numeric value 1

        // Direct children of the root; this list also contains the whitespace text nodes.
        NodeList children = root.getChildNodes();
        System.out.println(children.getLength());
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            System.out.println(child.getNodeName() + "\t" + child.getNodeType());
        }
    }

    /**
     * Prints every element with the given tag name, followed by the node type and
     * node name of each of its direct children (all child node types, not only elements).
     *
     * @param document the parsed document to search
     * @param tagName  the tag name to look up, including any prefix (e.g. {@code p:sp})
     */
    private static void printElementsByTagName(Document document, String tagName) {
        // getElementsByTagName only ever returns Element nodes, so no type check is needed.
        NodeList matches = document.getElementsByTagName(tagName);
        for (int i = 0; i < matches.getLength(); i++) {
            Node match = matches.item(i);
            NodeList children = match.getChildNodes();
            System.out.println(String.format("\nCurrent Element: %s, childNode count: %d",
                    match.getNodeName(), children.getLength()));
            for (int j = 0; j < children.getLength(); j++) {
                Node child = children.item(j);
                System.out.println(child.getNodeType() + "\t" + child.getNodeName());
            }
        }
    }

    /**
     * Evaluates an XPath expression as a node set and prints the expression, the
     * number of matches, and each match's node name and text content.
     *
     * @param document   the context node for the evaluation
     * @param expression the XPath expression to evaluate
     * @throws javax.xml.xpath.XPathExpressionException if the expression cannot be compiled
     *                                                  or evaluated
     */
    private static void printXPathMatches(Document document, String expression)
            throws javax.xml.xpath.XPathExpressionException {
        XPath xPath = XPathFactory.newInstance().newXPath();
        System.out.println(expression);
        NodeList matches =
                (NodeList) xPath.compile(expression).evaluate(document, XPathConstants.NODESET);
        System.out.println(matches.getLength());
        for (int i = 0; i < matches.getLength(); i++) {
            Node match = matches.item(i);
            System.out.println(String.format("Current Element: %s, node content: %s",
                    match.getNodeName(), match.getTextContent()));
        }
    }

    /**
     * Parses the XML file at the given path into a DOM document.
     *
     * <p>Documents that contain a DOCTYPE declaration are rejected.
     *
     * @param filePath path of the XML file to read
     * @return the parsed document
     * @throws Exception if the parser cannot be configured, the file cannot be read,
     *                   or the content is not acceptable XML
     */
    public static Document loadXMLFromFile(String filePath) throws Exception {
        return newSecureDocumentBuilder().parse(new File(filePath));
    }

    /**
     * Parses XML held in a string into a DOM document.
     *
     * <p>Documents that contain a DOCTYPE declaration are rejected.
     *
     * @param xml the XML text to parse
     * @return the parsed document
     * @throws Exception if the parser cannot be configured or the text is not acceptable XML
     */
    public static Document loadXMLFromString(String xml) throws Exception {
        return newSecureDocumentBuilder().parse(new InputSource(new StringReader(xml)));
    }

    /**
     * Creates the builder shared by both load methods, with DOCTYPE declarations disallowed.
     *
     * @return a new builder from a factory that refuses DOCTYPE declarations
     * @throws javax.xml.parsers.ParserConfigurationException if the feature is not supported
     *                                                        or the builder cannot be created
     */
    private static DocumentBuilder newSecureDocumentBuilder()
            throws javax.xml.parsers.ParserConfigurationException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setFeature(DISALLOW_DOCTYPE_FEATURE, true);
        return factory.newDocumentBuilder();
    }

    /**
     * Returns the hard-coded sample document used by the demo.
     *
     * <p>The {@code inventory} root has three direct {@code book} children and a
     * {@code p:cSld} element; {@code p:cSld} holds a fourth {@code book} plus a
     * {@code p:spTree} containing two {@code p:sp} elements and one {@code p:pic}.
     *
     * @return the sample XML text
     */
    public static String getXMLString() {
        return "<inventory id=\"123\" name=\"test\" year=\"2022\">\n" +
                "    <book year=\"2020\">\n" +
                "        <title>Snow Crash</title>\n" +
                "        <author>Neal Stephenson</author>\n" +
                "        <publisher>Spectra</publisher>\n" +
                "        <isbn>0553380958</isbn>\n" +
                "        <price>14.95</price>\n" +
                "    </book>\n" +
                "    <book year=\"2015\">\n" +
                "        <title>Burning Tower</title>\n" +
                "        <author>Larry Niven</author>\n" +
                "        <publisher>Pocket</publisher>\n" +
                "        <isbn>0743416910</isbn>\n" +
                "        <price>5.99</price>\n" +
                "    </book>\n" +
                "    <book year=\"1995\">\n" +
                "        <title>Zodiac</title>\n" +
                "        <author>Neal Stephenson</author>\n" +
                "        <publisher>Spectra</publisher>\n" +
                "        <isbn>0553573862</isbn>\n" +
                "        <price>7.50</price>\n" +
                "    </book>\n" +
                "\n" +
                "    <p:cSld>\n" +
                "    <book year=\"1995\">\n" +
                "        <title>Zodiac</title>\n" +
                "        <author>Neal Stephenson</author>\n" +
                "        <publisher>Spectra</publisher>\n" +
                "        <isbn>0553573862</isbn>\n" +
                "        <price>7.50</price>\n" +
                "    </book>\n" +
                "        <p:spTree>\n" +
                "            <p:nvGrpSpPr>\n" +
                "                <p:cNvPr id=\"1\" name=\"\"/>\n" +
                "                <p:cNvGrpSpPr/>\n" +
                "                <p:nvPr/>\n" +
                "            </p:nvGrpSpPr>\n" +
                "            <p:grpSpPr>\n" +
                "                <a:xfrm>\n" +
                "                    <a:off x=\"0\" y=\"0\"/>\n" +
                "                    <a:ext cx=\"0\" cy=\"0\"/>\n" +
                "                    <a:chOff x=\"0\" y=\"0\"/>\n" +
                "                    <a:chExt cx=\"0\" cy=\"0\"/>\n" +
                "                </a:xfrm>\n" +
                "            </p:grpSpPr>\n" +
                "            <p:sp>\n" +
                "                <p:nvSpPr>\n" +
                "                    <p:cNvSpPr>\n" +
                "                        <a:spLocks noGrp=\"true\"/>\n" +
                "                    </p:cNvSpPr>\n" +
                "                    <p:nvPr>\n" +
                "                        <p:ph type=\"ctrTitle\"/>\n" +
                "                    </p:nvPr>\n" +
                "                </p:nvSpPr>\n" +
                "                <p:spPr/>\n" +
                "                <p:txBody>\n" +
                "                    <a:bodyPr/>\n" +
                "                    <a:lstStyle/>\n" +
                "                    <a:p>\n" +
                "                        <a:r>\n" +
                "                            <a:rPr kumimoji=\"true\" lang=\"en-US\" altLang=\"zh-CN\" dirty=\"false\"/>\n" +
                "                            <a:t>This is title</a:t>\n" +
                "                        </a:r>\n" +
                "                        <a:endParaRPr kumimoji=\"true\" lang=\"zh-CN\" altLang=\"en-US\" dirty=\"false\"/>\n" +
                "                    </a:p>\n" +
                "                </p:txBody>\n" +
                "            </p:sp>\n" +
                "            <p:sp>\n" +
                "                <p:txBody>\n" +
                "                    <a:bodyPr/>\n" +
                "                    <a:lstStyle/>\n" +
                "                    <a:p>\n" +
                "                        <a:r>\n" +
                "                            <a:rPr kumimoji=\"true\" lang=\"en-US\" altLang=\"zh-CN\" dirty=\"false\"/>\n" +
                "                            <a:t>Subtitle here</a:t>\n" +
                "                        </a:r>\n" +
                "                        <a:endParaRPr kumimoji=\"true\" lang=\"zh-CN\" altLang=\"en-US\" dirty=\"false\"/>\n" +
                "                    </a:p>\n" +
                "                </p:txBody>\n" +
                "            </p:sp>\n" +
                "            <p:pic>\n" +
                "                <p:nvPicPr>\n" +
                "                    <p:cNvPr id=\"4136\" name=\"Picture 1\" descr=\"1.png\"/>\n" +
                "                    <p:nvPr/>\n" +
                "                </p:nvPicPr>\n" +
                "                <p:blipFill>\n" +
                "                    <a:blip cstate=\"print\" r:link=\"rId3\"/>\n" +
                "                </p:blipFill>\n" +
                "            </p:pic>\n" +
                "        </p:spTree>\n" +
                "        <p:extLst>\n" +
                "            <p:ext uri=\"{BB962C8B-B14F-4D97-AF65-F5344CB8AC3E}\">\n" +
                "                <p14:creationId val=\"3160098309\"/>\n" +
                "            </p:ext>\n" +
                "        </p:extLst>\n" +
                "    </p:cSld>\n" +
                "</inventory>";
    }
}
参考链接:

XML 元素
https://www.runoob.com/xml/xml-elements.html

Intro to XPath with Java
https://www.baeldung.com/java-xpath

Java XPath API语法教程 # XPath常用语法
http://www.51gjie.com/java/747.html

XPath query for XML node with colon in node name
https://newbedev.com/xpath-query-for-xml-node-with-colon-in-node-name

Use XPath to parse element name containing a colon
https://stackoverflow.com/questions/4282147/use-xpath-to-parse-element-name-containing-a-colon

XML Namespaces and How They Affect XPath and XSLT
https://docs.microsoft.com/en-us/previous-versions/dotnet/articles/ms950779(v=msdn.10)?redirectedfrom=MSDN

=END=

,

发表回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注