用 docx4j 对pptx文档进行简单读写


=Start=

缘由:

简单记录一下用 docx4j 对 pptx 文件进行简单读写的方法,方便有需要的时候参考。

正文:

参考解答:

直接看代码吧,简单直接,如果对OOXML的格式规范有一定了解的话理解起来会更容易。

package com.example;

import org.docx4j.TraversalUtil;
import org.docx4j.XmlUtils;
import org.docx4j.dml.CTTextBody;
import org.docx4j.dml.CTTextParagraph;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.PresentationMLPackage;
import org.docx4j.openpackaging.parts.PresentationML.SlidePart;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.pptx4j.jaxb.Context;
import org.pptx4j.pml.Shape;

import java.io.File;
import java.util.List;

/**
 * @author ixyzero
 * Created on 2022-05-04
 */
public class opOfficePptx3 {
    public static void main(String[] args) {
        String filePath = "new-test.pptx";
        try {
            PresentationMLPackage presentationMLPackage = PresentationMLPackage.load(new File(filePath));
            // traverse pptx content
            // printPptxContent(presentationMLPackage);
            printPptxContentStr(presentationMLPackage);
            // add hidden text
            textMark(presentationMLPackage, "test 98308");
            // traverse pptx content again for comparison
            printPptxContentStr(presentationMLPackage);
            presentationMLPackage.save(new File("new-test-pptx3.pptx")); // mark then save
        } catch (Docx4JException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return;
    }

    private static void printPptxContentStr(PresentationMLPackage pptxPackage) throws Exception {
        String tag = "a:t";
        // 对于一个pptx文件来说,解压之后在 ppt/slides/ 下面的每一个文件 slideN.xml 就是一个页面,也是一个单独的XML文件
        // 所以如果想要获取pptx中所有的页面内容的话,需要遍历每一个slide然后解析并提取其中的特定节点的内容
        for (int i=0; i<pptxPackage.getMainPresentationPart().getSlideCount(); i++) {
            SlidePart slide = pptxPackage.getMainPresentationPart().getSlide(i);
            System.out.println(String.format("\nthis is slide %d:", i));
            // traverse xml content via jsoup
            String xmlStr = slide.getXML();
            //System.out.println(xmlStr); // print all xml content
            Document doc = Jsoup.parse(xmlStr, "", Parser.xmlParser());
            System.out.println(doc.getElementsByTag(tag)); // <a:t>xxx</a:t> 原来这个叫tag而不是id/css query selector
        }
        return;
    }

    // TraverseSlide.java (https://github.com/plutext/docx4j/tree/VERSION_11_4_6/docx4j-samples-pptx4j)
    private static void printPptxContent(PresentationMLPackage pptxPackage) throws Exception {
        for (int i=0; i<pptxPackage.getMainPresentationPart().getSlideCount(); i++) {
            SlidePart slide = pptxPackage.getMainPresentationPart().getSlide(i);
            System.out.println(String.format("\nthis is slide %d:", i));

            // 下面的这段代码内容其实可以简单的用打印 slide.getXML() 来替代……
            new TraversalUtil(slide.getJaxbElement().getCSld().getSpTree().getSpOrGrpSpOrGraphicFrame(),
                    new TraversalUtil.Callback() {
                        String indent = "";
                        // @Override
                        public List<Object> apply(Object o) {
                            String text = "";
                            try {
                                System.out.println(indent + o.getClass().getName() + "\n\n" + XmlUtils.marshaltoString(o, true, org.pptx4j.jaxb.Context.jcPML));
                            } catch (RuntimeException me) {
                                System.out.println(indent + o.getClass().getName());
                            }

                            if (o instanceof org.pptx4j.pml.Shape) {
                                CTTextBody txBody = ((org.pptx4j.pml.Shape) o).getTxBody();
                                if (txBody != null) {
                                    for (CTTextParagraph tp : txBody.getP()) {
                                        System.out.println(indent + tp.getClass().getName() + "\n\n" + XmlUtils.marshaltoString(tp, true, true, org.pptx4j.jaxb.Context.jcPML,
                                                "http://schemas.openxmlformats.org/presentationml/2006/main", "txBody", CTTextParagraph.class));
                                    }
                                }
                            }
                            return null;
                        }

                        // @Override
                        public boolean shouldTraverse(Object o) {
                            return true;
                        }

                        // Depth first
                        // @Override
                        public void walkJAXBElements(Object parent) {
                            indent += "    ";
                            List children = getChildren(parent);
                            if (children != null) {
                                for (Object o : children) {
                                    // if its wrapped in javax.xml.bind.JAXBElement, get its
                                    // value
                                    o = XmlUtils.unwrap(o);
                                    this.apply(o);
                                    if (this.shouldTraverse(o)) {
                                        walkJAXBElements(o);
                                    }
                                }
                            }
                            indent = indent.substring(0, indent.length() - 4);
                        }

                        // @Override
                        public List<Object> getChildren(Object o) {
                            return TraversalUtil.getChildrenImpl(o);
                        }
                    }
            );
        }
    }

    private static void textMark(PresentationMLPackage pptxPackage, String text) throws Exception {
        Shape shape = ((Shape) XmlUtils.unmarshalString(
                getHiddenShape(text), Context.jcPML) );
        for (int i = 0; i < pptxPackage.getMainPresentationPart().getSlideCount(); i++) {
            if(i % 2 == 0) {
                SlidePart slidePart = pptxPackage.getMainPresentationPart().getSlide(i);
                slidePart.getJaxbElement().getCSld().getSpTree().getSpOrGrpSpOrGraphicFrame().add(shape);
            }
        }
        return;
    }

    private static String getHiddenShape(String hiddenStr) {
        return "<p:sp xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\" xmlns:p=\"http://schemas.openxmlformats.org/presentationml/2006/main\">" +
                "<p:nvSpPr>\n" +
                "<p:cNvPr id=\"2001\" name=\"Shape 2001\"/>\n" + // p:cNvPr -> id/name
                "<p:cNvSpPr txBox=\"true\"/>\n" +
                "<p:nvPr/>\n" +
                "</p:nvSpPr>\n" +
                "<p:spPr>\n" +
                "<a:xfrm>\n" +
                "<a:off x=\"-100\" y=\"0\"/>\n" + // a:off-x/y
                "<a:ext cx=\"16999\" cy=\"16999\"/>\n" + // a:ext-cx/cy
                "</a:xfrm>\n" +
                "<a:prstGeom prst=\"rect\">\n" +
                "<a:avLst/>\n" +
                "</a:prstGeom>\n" +
                "</p:spPr>\n" +
                "<p:txBody>\n" +
                "<a:bodyPr lIns=\"16158\" tIns=\"16158\" rIns=\"16158\" bIns=\"16158\" anchor=\"t\" anchorCtr=\"false\">\n" +
                "<a:noAutofit/>\n" +
                "</a:bodyPr>\n" +
                "<a:lstStyle/>\n" +
                "<a:p>\n" +
                "<a:pPr>\n" +
                "<a:spcBef>\n" +
                "<a:spcPts val=\"0\"/>\n" +
                "</a:spcBef>\n" +
                "<a:buNone/>\n" +
                "</a:pPr>\n" +
                "<a:r>\n" +
                "<a:rPr lang=\"en\" sz=\"100\" dirty=\"false\">\n" +
                "<a:solidFill>\n" +
                "<a:schemeClr val=\"lt1\">\n" +
                "<a:alpha val=\"1000\"/>\n" +
                "</a:schemeClr>\n" +
                "</a:solidFill>\n" +
                "</a:rPr>\n" +
                "<a:t>" + hiddenStr + "</a:t>\n" +
                "</a:r>\n" +
                "</a:p>\n" +
                "</p:txBody>\n" +
                "</p:sp>";
    }
}
参考链接:

p:cNvPr -> Non-Visual Drawing Properties
http://www.datypic.com/sc/ooxml/e-p_cNvPr-1.html

p:nvSpPr -> Non-Visual Properties for a Shape
http://www.datypic.com/sc/ooxml/e-p_nvSpPr-1.html

p:nvPicPr -> Non-Visual Properties for a Picture
http://www.datypic.com/sc/ooxml/e-p_nvPicPr-1.html

a:xfrm -> 2D Transform for Individual Objects
http://www.datypic.com/sc/ooxml/e-a_xfrm-4.html

a:off -> Offset (X-Axis/Y-Axis Coordinate)
http://www.datypic.com/sc/ooxml/e-a_off-1.html

a:ext -> Extent Length/Width
http://www.datypic.com/sc/ooxml/e-a_ext-2.html

=END=

,

《“用 docx4j 对pptx文档进行简单读写”》 有 1 条评论

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注