欢迎来到徐庆高(Tea)的个人博客网站
磨难很爱我,一度将我连根拔起。从惊慌失措到心力交瘁,我孤身一人,但并不孤独无依。依赖那些依赖我的人,信任那些信任我的人,帮助那些给予我帮助的人。如果我愿意,可以分裂成无数面镜子,让他们看见我,就像看见自己。察言观色和模仿学习是我的领域。像每个深受创伤的人那样,最终,我学会了随遇而安。
当前位置: 日志文章 > 详细内容

Java实现Word、PDF转HTML并保留格式

2025年07月16日 Java
一、Word转HTML 依赖:<properties> <poi.version>5.2.3</poi.version> <xhtml.version>2.0.4</xhtml.version> …

一、Word转HTML

依赖:

<properties>
    <poi.version>5.2.3</poi.version>
    <xhtml.version>2.0.4</xhtml.version>
</properties>

<!-- Word to HTML -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>${poi.version}</version>
</dependency>
<!-- Word to HTML -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
    <version>${xhtml.version}</version>
</dependency>
<!-- Office document / spreadsheet handling, 2007+ formats -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>${poi.version}</version>
</dependency>
<!-- Office document / spreadsheet handling, 2003 formats -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>${poi.version}</version>
</dependency>

代码:

import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.codec.binary.Base64;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.net.URL;

/**
 * Converts Word documents ({@code .docx} and {@code .doc}) to HTML strings,
 * inlining pictures as Base64 data.
 */
public class wordutil {

    /**
     * Reads a Word document from a URL and converts it to HTML.
     *
     * @param fileurl    URL the document is read from
     * @param filesuffix file extension including the leading dot; {@code ".docx"} or
     *                   {@code ".doc"}, compared case-insensitively
     * @return the generated HTML
     * @throws IllegalArgumentException if {@code filesuffix} is neither {@code ".docx"} nor {@code ".doc"}
     * @throws RuntimeException         if the conversion itself fails
     * @throws Exception                if the URL is malformed or its stream cannot be opened
     */
    public static String wordtohtml(String fileurl, String filesuffix) throws Exception {
        URL url = new URL(fileurl);
        // Constant-first comparison keeps a null suffix on the "unsupported" path instead of an NPE.
        boolean isDocx = ".docx".equalsIgnoreCase(filesuffix);
        boolean isDoc = ".doc".equalsIgnoreCase(filesuffix);
        if (!isDocx && !isDoc) {
            // Reject unsupported suffixes before any network access happens.
            throw new IllegalArgumentException("错误的文件后缀");
        }
        try (InputStream inputStream = url.openStream()) {
            return isDocx ? word2007tohtml(inputStream) : word2003tohtml(inputStream);
        }
    }

    /**
     * Converts a Word 2007+ ({@code .docx}) document to HTML.
     *
     * @param inputstream stream positioned at the start of the {@code .docx} content;
     *                    it is not closed by this method
     * @return the generated HTML
     * @throws RuntimeException if reading or converting the document fails; the original
     *                          exception is attached as the cause
     */
    public static String word2007tohtml(InputStream inputstream) {
        try (ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
             XWPFDocument docxDocument = new XWPFDocument(inputstream)) {
            XHTMLOptions options = XHTMLOptions.create();
            // Keep style definitions even when nothing in the document uses them.
            options.setIgnoreStylesIfUnused(false);
            // Fragment mode: output is wrapped in a <div> rather than a full page.
            options.setFragment(true);
            // Embed pictures as Base64 instead of writing image files.
            options.setImageManager(new Base64EmbedImgManager());
            // Convert to HTML.
            XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
            // NOTE(review): decodes with the platform default charset; confirm this matches
            // the encoding XHTMLConverter writes to the stream.
            return htmlStream.toString();
        } catch (Exception e) {
            System.out.println("word转html过程出现异常!");
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Converts a Word 2003 ({@code .doc}) document to HTML.
     *
     * @param inputstream stream positioned at the start of the {@code .doc} content;
     *                    it is not closed by this method
     * @return the generated HTML, serialized with indentation
     * @throws RuntimeException if reading or converting the document fails; the original
     *                          exception is attached as the cause
     * @throws Exception        declared for compatibility; every failure inside the method
     *                          is rethrown as {@link RuntimeException}
     */
    public static String word2003tohtml(InputStream inputstream) throws Exception {
        try (StringWriter writer = new StringWriter();
             HWPFDocument document = new HWPFDocument(inputstream)) {
            WordToHtmlConverter converter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // Inline pictures as Base64 data URIs. The MIME type is hard-coded to image/png
            // regardless of the reported picture type.
            converter.setPicturesManager((content, pictureType, suggestedName, widthInches, heightInches) ->
                    "data:image/png;base64," + Base64.encodeBase64String(content));
            converter.processDocument(document);
            org.w3c.dom.Document htmlDocument = converter.getDocument();

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(writer));
            return writer.toString();
        } catch (Exception e) {
            System.out.println("word转html过程出现异常!");
            throw new RuntimeException(e.getMessage(), e);
        }
    }

}

二、PDF转HTML

依赖:

        <!-- No <version> is declared for these artifacts: supply them through
             dependencyManagement / a parent POM, or add explicit versions
             (the snippet further down uses 2.0.3, 1.2.22, 0.10.2 and 2.0.24). -->
        <dependency>
            <groupId>net.sf.cssbox</groupId>
            <artifactId>pdf2dom</artifactId>
        </dependency>

        <dependency>
            <groupId>net.mabboud.fontverter</groupId>
            <artifactId>fontverter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.reflections</groupId>
            <artifactId>reflections</artifactId>
        </dependency>
        <!-- PDF to text -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
        </dependency>

代码:

import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;

import java.io.*;
import java.net.URL;

/**
 * Converts PDF documents to HTML strings.
 */
public class pdfutil {

    /**
     * Reads a PDF from a URL and converts it to HTML.
     *
     * @param fileurl URL the PDF is read from
     * @return the generated HTML
     * @throws IOException if the URL is malformed, the PDF cannot be read, or the conversion
     *                     fails; unchecked failures are wrapped with the original as the cause
     */
    public static String pdftohtml(String fileurl) throws IOException {
        URL url = new URL(fileurl);
        try (InputStream inputStream = url.openStream()) {
            return pdftohtml(inputStream);
        } catch (RuntimeException e) {
            // Callers only expect IOException from this method; keep that contract but retain the cause.
            throw new IOException(e.getMessage(), e);
        }
    }

    /**
     * Converts a PDF read from a stream to HTML.
     * <p>
     * The HTML is rendered straight into memory. No intermediate file is written, so concurrent
     * calls cannot overwrite each other's output and no charset mismatch can occur between
     * writing and re-reading.
     *
     * @param inputstream stream positioned at the start of the PDF content; it is not closed
     *                    by this method
     * @return the generated HTML
     * @throws IOException if the PDF cannot be loaded or rendered
     */
    public static String pdftohtml(InputStream inputstream) throws IOException {
        try (PDDocument document = PDDocument.load(inputstream);
             StringWriter html = new StringWriter()) {
            new PDFDomTree().writeText(document, html);
            return html.toString();
        }
    }
}

三、方法补充

Java实现Word转HTML

1. 引入Maven依赖

<properties>
    <poi.version>5.2.3</poi.version>
    <xhtml.version>2.0.4</xhtml.version>
</properties>

<!-- Word to HTML -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>${poi.version}</version>
</dependency>
<!-- Word to HTML -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
    <version>${xhtml.version}</version>
</dependency>
<!-- Office document / spreadsheet handling, 2007+ formats -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>${poi.version}</version>
</dependency>
<!-- Office document / spreadsheet handling, 2003 formats -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>${poi.version}</version>
</dependency>

2. Java代码

    /**
     * Converts a Word 2007+ ({@code .docx}) file to an HTML string.
     * <p>
     * Output is produced in fragment mode (wrapped in a {@code <div>}), unused styles are kept,
     * and pictures are embedded as Base64.
     *
     * @param filepath path of the {@code .docx} file to read
     * @return the generated HTML, or {@code null} if reading or conversion failed
     *         (the failure is logged, not thrown)
     */
    public static String docxtohtml(String filepath) {
        // The file stream is declared as its own resource so it is closed even when the
        // XWPFDocument constructor throws.
        try (java.io.InputStream source = Files.newInputStream(Paths.get(filepath));
             XWPFDocument docxDocument = new XWPFDocument(source);
             ByteArrayOutputStream htmlStream = new ByteArrayOutputStream()) {
            XHTMLOptions options = XHTMLOptions.create();
            // Keep style definitions even when nothing in the document uses them.
            options.setIgnoreStylesIfUnused(false);
            // Fragment mode: output is wrapped in a <div> rather than a full page.
            options.setFragment(true);
            // Embed pictures as Base64 instead of writing image files.
            options.setImageManager(new Base64EmbedImgManager());
            // Convert to HTML.
            XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
            // NOTE(review): decodes with the platform default charset; confirm this matches
            // the encoding XHTMLConverter writes to the stream.
            return htmlStream.toString();
        } catch (Exception e) {
            log.error("word转html过程出现异常!", e);
        }
        return null;
    }


    /**
     * Converts a Word 2003 ({@code .doc}) file to an HTML string.
     * <p>
     * Pictures are inlined as Base64 data URIs and the result is serialized with indentation.
     *
     * @param filepath path of the {@code .doc} file to read
     * @return the generated HTML, or {@code null} if reading or conversion failed
     *         (the failure is logged, not thrown)
     */
    public static String doctohtml(String filepath) {
        // The file stream is declared as its own resource so it is closed even when the
        // HWPFDocument constructor throws.
        try (java.io.InputStream source = Files.newInputStream(new File(filepath).toPath());
             HWPFDocument document = new HWPFDocument(source);
             StringWriter writer = new StringWriter()) {
            WordToHtmlConverter converter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // Inline pictures as Base64 data URIs. The MIME type is hard-coded to image/png
            // regardless of the reported picture type.
            converter.setPicturesManager((content, pictureType, suggestedName, widthInches, heightInches) ->
                    "data:image/png;base64," + Base64.encodeBase64String(content));
            converter.processDocument(document);
            org.w3c.dom.Document htmlDocument = converter.getDocument();

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(writer));
            return writer.toString();
        } catch (Exception e) {
            log.error("word转html过程出现异常!", e);
        }
        return null;
    }

    /**
     * Converts a Word file to HTML, choosing the converter from the file extension.
     * <p>
     * The extension is the text after the last {@code '.'} in the path and is compared
     * case-insensitively: {@code doc} goes to {@code doctohtml}, {@code docx} to {@code docxtohtml}.
     *
     * @param filepath local path of the Word file
     * @return the generated HTML, or {@code null} if the path is {@code null}, the extension is
     *         not supported, or the selected converter returned {@code null}
     */
    public static String autoword2html(String filepath) {
        // A null path is treated as an unsupported format instead of raising a NullPointerException.
        String suffix = filepath == null ? "" : filepath.substring(filepath.lastIndexOf('.') + 1);
        if ("doc".equalsIgnoreCase(suffix)) {
            return doctohtml(filepath);
        }
        if ("docx".equalsIgnoreCase(suffix)) {
            return docxtohtml(filepath);
        }
        log.info("文件格式错误,只支持docx和doc格式的文档!");
        return null;
    }

使用Java实现PDF到HTML的转换

引入以下依赖

<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>2.0.3</version>
</dependency>

<dependency>
    <groupId>net.mabboud.fontverter</groupId>
    <artifactId>fontverter</artifactId>
    <version>1.2.22</version> <!-- use the latest version as needed -->
</dependency>
<dependency>
    <groupId>org.reflections</groupId>
    <artifactId>reflections</artifactId>
    <version>0.10.2</version> <!-- use the latest version as needed -->
</dependency>
<!-- PDF to text -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.24</version>
</dependency>

实现关键代码

        file file = new file(pdfurl);
        string localpdffilepath = 要解析的pdf文件路径(本地)+ file.getname();
        string newpdffilepath = 截取pdf后生成的pdf文件路径+ file.getname();
        string outfilepath = 生成的html文件.html";
        string pdfcontent = "";
        pddocument pdfdocument = pddocument.load(new file(localpdffilepath));
        // 检查文档中是否有页面
        if (pdfdocument.getnumberofpages() > 0) {
            // 移除第一页
            pdfdocument.removepage(0);
        }
        // 保存更改后的pdf到新文件
        pdfdocument.save(new file(newpdffilepath));
        system.out.println("第一页已被移除,新pdf保存在: " + newpdffilepath);
        pdfdocument.close();
        // 转换成html格式文件
        pddocument document = pddocument.load(new file(newpdffilepath));
        writer writer = new printwriter(outfilepath, "utf-8");
        new pdfdomtree().writetext(document, writer);
        writer.close();
        document.close();
        // 获取html内容
        try (bufferedreader reader = new bufferedreader(new filereader(outfilepath))) {
            stringbuilder htmlcontent = new stringbuilder();
            string line;
            while ((line = reader.readline()) != null) {
                htmlcontent.append(line).append("\n"); // 追加每一行内容,并添加换行符
            }
            pdfcontent = string.valueof(htmlcontent);
        } catch (ioexception e) {
            e.printstacktrace();
            system.err.println("读取 html 文件时出错。");
        }

到此这篇关于Java实现Word、PDF转HTML并保留格式的文章就介绍到这了。更多相关Java Word、PDF转HTML的内容,请搜索代码网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持代码网!