[点晴永久免费OA]【JAVA】word和.txt文件转html 及pdf文件，使用poi jsoup itext心得

word和.txt文件转html及pdf文件，使用poi、jsoup、itext心得。本人第一次写博客，有什么不足或者需要改正的地方，希望大家指出来，一起学习交流讨论。
由于在项目中遇到了这一个问题，在网上也找了很多方法，感觉千篇一律，总有一些问题，因此总结出word转html和pdf文件使用方法。
虽然poi功能不是很强大，但毕竟不依靠本地office软件，同样还有一种方式使用jacob也可以将word转html，不过这个方式要依靠本地office，
而且只能在windows平台下，不支持unix系统。jacob使用起来还是比较简单的，如果大家需要jacob的使用方法，我会分享给大家。
关于.txt文件转html，就是使用io操作将.txt文件读取出来然后写入到html中，也不需要额外的jar包。
注意：使用poi需要注意以下几项。我在做这个功能的时候没有注意到这些问题的存在，一直找不出原因，还请各位大牛们指正一下原因。

1. 使用office的文档.doc和.docx格式的都没有问题，但使用wps生成的word文档时，只能转.doc格式的文件，对.docx的文档转出后没有图片，得不到img属性。

2. 在使用word文档转pdf格式的文件时，生成的pdf没有中文，对中文显示不是很支持。

3. 在将word转成pdf时，需要把生成的html文件转化成标准的html文件，不然解析后会出现<meta>或者<img>标签不闭合的情况。

4. 使用的jar包如下，都可以在maven中央仓库下载得到。下面就直接附上代码了，希望大家有什么问题在下面评论互相交流和学习，使用时直接调用方法即可。
如果大家觉得可以请点一个赞，谢谢大家。

package com.kqco.tools;

import org.apache.poi.hwpf.HWPFDocument;

import org.apache.poi.hwpf.converter.PicturesManager;

import org.apache.poi.hwpf.converter.WordToHtmlConverter;

import org.apache.poi.hwpf.usermodel.PictureType;

import org.apache.poi.xwpf.converter.core.BasicURIResolver;

import org.apache.poi.xwpf.converter.core.FileImageExtractor;

import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;

import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;

import org.apache.poi.xwpf.usermodel.XWPFDocument;

import org.jsoup.Jsoup;

import org.w3c.dom.Document;

import org.w3c.tidy.Tidy;

import org.xhtmlrenderer.pdf.ITextFontResolver;

import org.xhtmlrenderer.pdf.ITextRenderer;

import com.lowagie.text.pdf.BaseFont;

import javax.xml.parsers.DocumentBuilderFactory;

import javax.xml.transform.OutputKeys;

import javax.xml.transform.Transformer;

import javax.xml.transform.TransformerFactory;

import javax.xml.transform.dom.DOMSource;

import javax.xml.transform.stream.StreamResult;

import java.io.BufferedInputStream;

import java.io.BufferedOutputStream;

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.ByteArrayInputStream;

import java.io.ByteArrayOutputStream;

import java.io.DataOutputStream;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStreamReader;

import java.io.OutputStream;

import java.io.OutputStreamWriter;

import java.io.PrintWriter;

import java.nio.file.Path;

import java.nio.file.Paths;

public class FileConverter {

/*

* word文件转成html文件

* sourceFilePath:源word文件路径

* targetFilePosition:转化后生成的html文件路径

*/

public void wordToHtml(String sourceFilePath, String targetFilePosition) throws Exception {

if (".docx".equals(sourceFilePath.substring(sourceFilePath.lastIndexOf(".", sourceFilePath.length())))) {

docxToHtml(sourceFilePath, targetFilePosition);

} else if (".doc".equals(sourceFilePath.substring(sourceFilePath.lastIndexOf(".", sourceFilePath.length())))) {

docToHtml(sourceFilePath, targetFilePosition);

} else {

throw new RuntimeException("文件格式不正确");

}

/**
 * Converts a binary {@code .doc} file to an HTML file using POI's
 * {@code WordToHtmlConverter}.
 *
 * <p>Embedded pictures are written to an {@code image} directory next to the target HTML
 * file; the directory is created on first use. The HTML is serialized as UTF-8.
 *
 * @param sourceFilePath path of the source {@code .doc} file
 * @param targetFilePosition path of the HTML file to generate
 * @throws Exception if the document cannot be read, converted, or written
 */
private void docToHtml(String sourceFilePath, String targetFilePosition) throws Exception {
    // toAbsolutePath() ensures getParent() is non-null even for a bare file name.
    final Path imagePath = Paths.get(targetFilePosition).toAbsolutePath().getParent().resolve("image");

    // Close the source stream as soon as the document has been constructed from it.
    HWPFDocument wordDocument;
    try (FileInputStream sourceStream = new FileInputStream(sourceFilePath)) {
        wordDocument = new HWPFDocument(sourceStream);
    }

    Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);

    // Save each picture to disk and return the path the generated HTML should reference.
    wordToHtmlConverter.setPicturesManager(new PicturesManager() {
        @Override
        public String savePicture(byte[] content, PictureType pictureType, String name, float width, float height) {
            // Create the image directory lazily so documents without pictures leave no empty folder.
            File imageDir = imagePath.toFile();
            if (!imageDir.isDirectory()) {
                imageDir.mkdirs();
            }
            try (FileOutputStream out = new FileOutputStream(imagePath.resolve(name).toString())) {
                out.write(content);
            } catch (IOException e) {
                // Best effort: a picture that cannot be saved must not abort the whole conversion.
                e.printStackTrace();
            }
            // NOTE(review): the picture is saved under "<html dir>/image/", so this relative URL
            // only points at it when the HTML's directory is itself named "tmp" - confirm with callers.
            return "../tmp/image/" + name;
        }
    });

    wordToHtmlConverter.processDocument(wordDocument);
    Document htmlDocument = wordToHtmlConverter.getDocument();

    Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "html");

    // Write through an explicitly managed stream so the target file handle is always released.
    try (OutputStream htmlOut = new FileOutputStream(targetFilePosition)) {
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(htmlOut));
    }
}

/*

* docx转换为html

* sourceFilePath:源word文件路径

* targetFileName:生成的html文件路径

*/

private void docxToHtml(String sourceFilePath, String targetFileName) throws Exception {

String imagePathStr = Paths.get(targetFileName).getParent().resolve("../tmp/image/word/media").toString();

OutputStreamWriter outputStreamWriter = null;

try {

XWPFDocument document = new XWPFDocument(new FileInputStream(sourceFilePath));

XHTMLOptions options = XHTMLOptions.create();

// 存放图片的文件夹

options.setExtractor(new FileImageExtractor(new File(imagePathStr)));

// html中图片的路径

options.URIResolver(new BasicURIResolver("../tmp/image/word/media"));

outputStreamWriter = new OutputStreamWriter(new FileOutputStream(targetFileName), "UTF-8");

XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();

xhtmlConverter.convert(document, outputStreamWriter, options);

} finally {

if (outputStreamWriter != null) {

outputStreamWriter.close();

}

/*

* txt文档转html

filePath:txt原文件路径

htmlPosition:转化后生成的html路径

*/

public void txtToHtml(String filePath, String htmlPosition) {

try {

String encoding = "GBK";

File file = new File(filePath);

if (file.isFile() && file.exists()) { // 判断文件是否存在

InputStreamReader read = new InputStreamReader(new FileInputStream(file), encoding);

// 考虑到编码格式

BufferedReader bufferedReader = new BufferedReader(read);

// 写文件

FileOutputStream fos = new FileOutputStream(new File(htmlPosition));

OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");

BufferedWriter bw = new BufferedWriter(osw);

String lineTxt = null;

while ((lineTxt = bufferedReader.readLine()) != null) {

bw.write(lineTxt + "</br>");

}

bw.close();

osw.close();

fos.close();

read.close();

} else {

System.out.println("找不到指定的文件");

}

} catch (Exception e) {

System.out.println("读取文件内容出错");

e.printStackTrace();

}

/**
 * Copies an image file to the given target path.
 *
 * <p>Despite the method name, no URL is rewritten and the source file is not deleted:
 * the bytes of {@code sourceFilePath} are copied to {@code targetFilePosition},
 * overwriting any existing target.
 *
 * @param sourceFilePath path of the file to copy
 * @param targetFilePosition path the copy is written to
 * @throws IOException if the source cannot be read or the target cannot be written
 */
public void changeImageUrl(String sourceFilePath, String targetFilePosition) throws IOException {
    // try-with-resources closes both streams even when the copy fails midway.
    try (FileInputStream source = new FileInputStream(sourceFilePath);
         FileOutputStream target = new FileOutputStream(targetFilePosition)) {
        byte[] buffer = new byte[8192];
        int bytesRead;
        while ((bytesRead = source.read(buffer)) != -1) {
            target.write(buffer, 0, bytesRead);
        }
    }
}

/**
 * Normalizes an HTML file into well-formed XHTML using JTidy.
 *
 * <p>The input is parsed as GB2312 and the XHTML is written as UTF-8. The tidied output
 * is buffered in memory and the target file is opened only after parsing has finished.
 *
 * <p>The file's bytes are handed to JTidy unchanged. The previous implementation decoded
 * and re-encoded them through the platform default charset first, which corrupts the
 * text on any platform whose default charset is not GB2312-compatible.
 *
 * <p>NOTE(review): the Word converters in this class write UTF-8 HTML, so the GB2312
 * input encoding may be wrong for their output - confirm against the callers.
 *
 * @param f_in path of the source HTML file
 * @param outfile path of the XHTML file to generate
 * @return {@code true} if the XHTML file was written, {@code false} if any step threw
 */
private boolean parseToXhtml(String f_in, String outfile) {
    try {
        Tidy tidy = new Tidy();
        tidy.setInputEncoding("gb2312");
        tidy.setQuiet(true);
        tidy.setOutputEncoding("UTF-8");
        // NOTE(review): the original comment said "do not show warnings" but the flag is true.
        tidy.setShowWarnings(true);
        tidy.setIndentContent(true);
        tidy.setSmartIndent(true);
        tidy.setIndentAttributes(false);
        tidy.setWraplen(1024); // line-wrap width
        tidy.setXHTML(true); // emit XHTML
        tidy.setErrout(new PrintWriter(System.out));

        // Buffer the result so a parse failure never truncates the output file.
        ByteArrayOutputStream tidyOutStream = new ByteArrayOutputStream();
        try (FileInputStream fis = new FileInputStream(f_in)) {
            tidy.parse(fis, tidyOutStream);
        }

        // Write the generated XHTML to the target file.
        try (FileOutputStream to = new FileOutputStream(outfile)) {
            tidyOutStream.writeTo(to);
        }
        return true;
    } catch (Exception ex) {
        System.out.println(ex.toString());
        ex.printStackTrace();
        return false;
    }
}

/**
 * Renders an XHTML file to a PDF file, resolving relative resources against the XHTML
 * file's own URL.
 *
 * @param inputFile path of the source XHTML file
 * @param outputFile path of the PDF file to generate
 * @return {@code true} when the PDF has been written
 * @throws Exception if the document cannot be loaded, laid out, or written
 */
private boolean convertHtmlToPdf(String inputFile, String outputFile) throws Exception {
    return convertHtmlToPdf(inputFile, outputFile, null);
}

/**
 * Renders an XHTML file to a PDF file.
 *
 * @param inputFile path of the source XHTML file
 * @param outputFile path of the PDF file to generate
 * @param imagePath base URL for relative resources such as images, for example
 *     {@code file:/D:/test}; when {@code null} or empty, the XHTML file's own URL is used
 * @return {@code true} when the PDF has been written
 * @throws Exception if the document cannot be loaded, laid out, or written
 */
private boolean convertHtmlToPdf(String inputFile, String outputFile, String imagePath) throws Exception {
    ITextRenderer renderer = new ITextRenderer();
    String url = new File(inputFile).toURI().toURL().toString();
    renderer.setDocument(url);

    // Register a font with CJK glyphs so Chinese text can be rendered.
    // NOTE(review): this path exists only on Windows; addFont is expected to fail elsewhere.
    ITextFontResolver fontResolver = renderer.getFontResolver();
    fontResolver.addFont("C:/Windows/Fonts/simsun.ttc", BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);

    // Base URL for relative image paths. The previous code passed the string literal
    // "imagePath" here, which is not a usable URL.
    boolean hasImagePath = imagePath != null && !imagePath.isEmpty();
    renderer.getSharedContext().setBaseURL(hasImagePath ? imagePath : url);

    renderer.layout();

    // Open the target only once layout has succeeded, and always close it.
    try (OutputStream os = new FileOutputStream(outputFile)) {
        renderer.createPDF(os);
    }
    return true;
}

/**
 * Rewrites an HTML file in place so that its {@code meta} and {@code img} elements are
 * serialized with explicit closing tags, and applies the SimSun font to the body.
 *
 * <p>The file is read and written as UTF-8.
 *
 * @param targetHtml path of the HTML file to process
 * @throws IOException if the file cannot be read or written
 */
private static void standardHTML(String targetHtml) throws IOException {
    File f = new File(targetHtml);
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");

    doc.select("meta").removeAttr("name");
    doc.select("meta").attr("content", "text/html; charset=UTF-8");
    doc.select("meta").attr("http-equiv", "Content-Type");
    // Giving these normally-empty elements a child makes jsoup write a closing tag for them.
    doc.select("meta").html("&nbsp");
    doc.select("img").html("&nbsp");
    doc.select("style").attr("mce_bogus", "1");

    // A bare font-family="..." attribute has no CSS effect; the font has to be set through
    // the style attribute. Any existing inline style on the body is preserved.
    for (org.jsoup.nodes.Element body : doc.select("body")) {
        String style = body.attr("style").trim();
        if (!style.isEmpty() && !style.endsWith(";")) {
            style += ";";
        }
        body.attr("style", style + "font-family: SimSun;");
    }

    // NOTE(review): this declaration lacks the closing "?>"; kept as-is because how jsoup
    // serializes it depends on the jsoup version - confirm the intended output.
    doc.select("html").before("<?xml version='1.0' encoding='UTF-8'>");

    // Jsoup only changes the in-memory tree, so the result has to be written back here.
    String html = doc.html();
    try (OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(f, false), "UTF-8")) {
        osw.write(html);
    }
}

[点晴永久免费OA]【JAVA】word和.txt文件转html 及pdf文件， 使用poi jsoup itext心得

[点晴永久免费OA]【JAVA】word和.txt文件转html 及pdf文件，使用poi jsoup itext心得