Java使用POI读取2007+版本PPT
0
先贴上代码。感觉读取 Word、PPT、Excel 这类文件时,都可以先用 7z 把文件解压,看一下主要文件的 XML 结构,代码自然就写出来了。
不多说了,直接贴上代码:
package com.acgist.ppt;
import java.io.IOException;
import java.util.List;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame;
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
/**
* 读取PPT
*/
public class PPTUtils {
/**
* 读取PPT2007+
* @param path ppt路径
*/
public static void getTextFromPPT2007(String path) {
XSLFSlideShow xslfSlideShow;
try {
xslfSlideShow = new XSLFSlideShow(path);
int page = 0;
List<CTSlideIdListEntry> list = xslfSlideShow.getSlideReferences().getSldIdList();
for (CTSlideIdListEntry ctSlideIdListEntry : list) {
page++;
List<CTShape> spList = xslfSlideShow.getSlide(ctSlideIdListEntry).getCSld().getSpTree().getSpList();
for (CTShape ctShape : spList) {
List<CTTextParagraph> pList = ctShape.getTxBody().getPList();
for (CTTextParagraph ctTextParagraph : pList) {
System.out.println(ctTextParagraph.newCursor() .getTextValue() + "=========================" + page);
CTRegularTextRun[] runs = ctTextParagraph.getRArray();
for (CTRegularTextRun ctRegularTextRun : runs) {
System.out.println(ctRegularTextRun.getT());
}
// List<CTRegularTextRun> rList = ctTextParagraph.getRList(); // 这个应该是缺少了什么包文件不能使用
// for (CTRegularTextRun ctRegularTextRun : rList) {
// System.out.println(ctRegularTextRun.getT());
// }
}
}
List<CTGraphicalObjectFrame> graphicFrameList = xslfSlideShow.getSlide(ctSlideIdListEntry).getCSld().getSpTree().getGraphicFrameList();
for (CTGraphicalObjectFrame ctGraphicalObjectFrame : graphicFrameList) {
ctGraphicalObjectFrame.getGraphic().getGraphicData();
}
}
} catch (OpenXML4JException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (XmlException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
getTextFromPPT2007("E:/company/ppt/简历设计.pptx");
}
}
上面有一段注释掉的代码,那一段是用来读取文本框里面换行内容的,但是不知道为什么 getRList 那个方法不能使用(提示缺少 class),网上也没找到解决办法,所以用了一个已废弃的方法。
更多详细的API参考官网地址:http://poi.apache.org/slideshow/xslf-cookbook.html
下面是官方的读取代码感觉要简单很多,而且修改也非常简单,我只测试了文本框的。
// Load the presentation, then release the input file handle right away.
FileInputStream in = new FileInputStream(path);
XMLSlideShow ppt;
try {
    ppt = new XMLSlideShow(in);
} finally {
    in.close();
}

XSLFSlide[] slide = ppt.getSlides();
for (int i = 0; i < slide.length; i++) {
    XSLFShape[] sh = slide[i].getShapes();
    for (int j = 0; j < sh.length; j++) {
        // Properties available on every shape (shown for illustration only).
        String name = sh[j].getShapeName();
        java.awt.geom.Rectangle2D anchor = sh[j].getAnchor();
        // Dispatch on the concrete shape type.
        if (sh[j] instanceof XSLFConnectorShape) {
            XSLFConnectorShape line = (XSLFConnectorShape) sh[j];
        } else if (sh[j] instanceof XSLFTextShape) {
            // Print the current text, then replace it.
            XSLFTextShape shape = (XSLFTextShape) sh[j];
            System.out.println(shape.getText());
            shape.setText("acgist");
        } else if (sh[j] instanceof XSLFPictureShape) {
            XSLFPictureShape shape = (XSLFPictureShape) sh[j];
        }
    }
}

// Save once, after every slide has been processed, and always close the
// output stream so the file is flushed and its handle is released.
FileOutputStream out = new FileOutputStream("e://test/t.pptx");
try {
    ppt.write(out);
} finally {
    out.close();
}
如果包含 table,可以用 XSLFTable table = (XSLFTable) sh[j]; 把它转为 table 再读取;至于怎么获取 sh[j] 的类名,我想就不需要说了吧。
上面两种方法推荐第二种吧,毕竟是官方给出来的,第一种比较容易了解PPT的XML文件格式。
后面还有一些替换图片的方法有时间再来叙述吧。