当前位置: 代码网 > it编程>编程语言>Java > Java实现轻松提取word和pdf文档内容

Java实现轻松提取word和pdf文档内容

2025年10月15日 Java 我要评论
1.添加依赖 <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi…

1.添加依赖

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>5.2.5</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>5.2.5</version> <!-- 解析 .doc(HWPF)需要 -->
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>5.2.5</version> <!-- 如果有 docx 相关 -->
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>3.0.5</version>
</dependency>
<!-- 注意:下面的代码还用到了 Hutool 的 StrUtil,需另行引入 cn.hutool:hutool-core 依赖 -->

2.java代码

import cn.hutool.core.util.StrUtil;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Locale;
/**
 * Utility for extracting plain text from PDF, DOCX and DOC documents.
 *
 * <p>The input is buffered fully in memory and then dispatched to a
 * format-specific extractor chosen from the supplied file type. Any other
 * file type is treated as UTF-8 encoded text.
 *
 * <p>Created 2025/9/29.
 */
public class DocumentTextExtractUtil {

    /** MIME subtype of {@code application/msword} (legacy binary Word). */
    private static final String MIME_SUBTYPE_DOC = "msword";

    /** MIME subtype of the OOXML word-processing format (.docx). */
    private static final String MIME_SUBTYPE_DOCX =
            "vnd.openxmlformats-officedocument.wordprocessingml.document";

    /**
     * Extracts the text content of a document.
     *
     * <p>The stream is read to the end but is not closed here; closing it
     * remains the caller's responsibility.
     *
     * @param inputStream document bytes; {@code null} yields an empty string
     * @param fileType    file extension (with or without a leading dot, any case)
     *                    or a MIME type such as {@code application/pdf}
     * @return the extracted text; for {@code pdf}, {@code docx} and {@code doc}
     *         the result is trimmed, for any other type the bytes are decoded
     *         as UTF-8 without further processing
     * @throws IllegalArgumentException if {@code fileType} is blank
     * @throws IOException              if reading the stream or parsing the document fails
     */
    public static String extractText(InputStream inputStream, String fileType) throws IOException {
        // A missing stream is treated as "no content" and is checked before the file type.
        if (inputStream == null) {
            return "";
        }
        if (StrUtil.isBlank(fileType)) {
            throw new IllegalArgumentException("文件类型不能为空");
        }

        String ft = normalizeFileType(fileType);
        byte[] data = toByteArray(inputStream);

        return switch (ft) {
            case "pdf" -> extractPdfText(data);
            case "docx" -> extractDocxText(data);
            case "doc" -> extractDocText(data);
            default -> new String(data, StandardCharsets.UTF_8);
        };
    }

    /**
     * Reduces a file extension or MIME type to a short lower-case type key.
     *
     * <p>Examples: {@code ".PDF"} becomes {@code "pdf"},
     * {@code "application/pdf; charset=binary"} becomes {@code "pdf"},
     * {@code "image/svg+xml"} becomes {@code "svg"}, and
     * {@code "application/msword"} becomes {@code "doc"}.
     *
     * @param fileType non-blank extension or MIME type
     * @return the normalised type key
     */
    private static String normalizeFileType(String fileType) {
        String ft = fileType.trim().toLowerCase(Locale.ROOT);
        if (ft.startsWith(".")) {
            ft = ft.substring(1);
        }

        int slash = ft.indexOf('/');
        if (slash >= 0) {
            // MIME type: keep only the subtype.
            ft = ft.substring(slash + 1);

            // Drop MIME parameters such as "; charset=utf-8".
            int semicolon = ft.indexOf(';');
            if (semicolon >= 0) {
                ft = ft.substring(0, semicolon).trim();
            }

            // Drop a structured-syntax suffix such as "+xml".
            int plus = ft.indexOf('+');
            if (plus >= 0) {
                ft = ft.substring(0, plus);
            }

            // The Word MIME subtypes do not equal their file extensions,
            // so map them explicitly; otherwise they would be read as raw text.
            if (MIME_SUBTYPE_DOC.equals(ft)) {
                return "doc";
            }
            if (MIME_SUBTYPE_DOCX.equals(ft)) {
                return "docx";
            }
        }
        return ft;
    }

    /**
     * Extracts text from a PDF held in memory.
     *
     * @param data raw PDF bytes
     * @return the trimmed text, or an empty string if the stripper returns {@code null}
     * @throws IOException if the PDF cannot be loaded or read
     */
    private static String extractPdfText(byte[] data) throws IOException {
        // PDFBox 3.x loads documents through Loader.loadPDF.
        try (PDDocument doc = Loader.loadPDF(data)) {
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);
            String text = stripper.getText(doc);
            return text == null ? "" : text.trim();
        }
    }

    /**
     * Extracts text from a DOCX document held in memory, one non-empty
     * paragraph per line.
     *
     * <p>NOTE(review): only {@code getParagraphs()} is consulted, so text that
     * lives in tables, headers or footers is presumably not included — confirm
     * whether that is acceptable for callers.
     *
     * @param data raw DOCX bytes
     * @return the trimmed, newline-joined paragraph text
     * @throws IOException if the document cannot be opened
     */
    private static String extractDocxText(byte[] data) throws IOException {
        try (XWPFDocument doc = new XWPFDocument(new ByteArrayInputStream(data))) {
            StringBuilder sb = new StringBuilder();
            List<XWPFParagraph> paragraphs = doc.getParagraphs();
            for (XWPFParagraph p : paragraphs) {
                String t = p.getText();
                if (t != null && !t.isEmpty()) {
                    sb.append(t).append('\n');
                }
            }
            return sb.toString().trim();
        }
    }

    /**
     * Extracts text from a legacy binary DOC document held in memory, one
     * non-empty paragraph per line, with NUL characters removed.
     *
     * @param data raw DOC bytes
     * @return the trimmed, newline-joined paragraph text
     * @throws IOException if the document cannot be opened
     */
    private static String extractDocText(byte[] data) throws IOException {
        try (HWPFDocument doc = new HWPFDocument(new ByteArrayInputStream(data))) {
            WordExtractor extractor = new WordExtractor(doc);
            String[] paragraphs = extractor.getParagraphText();
            StringBuilder sb = new StringBuilder();
            if (paragraphs != null) {
                for (String p : paragraphs) {
                    if (p == null) {
                        continue;
                    }
                    // Literal replace: strips NUL characters without compiling a regex per paragraph.
                    String clean = p.replace("\0", "").trim();
                    if (!clean.isEmpty()) {
                        sb.append(clean).append('\n');
                    }
                }
            }
            return sb.toString().trim();
        }
    }

    /**
     * Reads the remaining bytes of a stream into an array.
     *
     * @param in stream to drain; not closed by this method
     * @return all bytes read until end of stream
     * @throws IOException if reading fails
     */
    private static byte[] toByteArray(InputStream in) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream(8192);
        in.transferTo(baos);
        return baos.toByteArray();
    }
}

3.如何使用

// Example call site: `content` is the document's raw bytes, `fileType` its
// extension or MIME type, and `fileName` is used only for the log message.
try (InputStream textStream = new ByteArrayInputStream(content)) {
    // No toLowerCase() needed here: extractText normalises the case itself.
    String text = DocumentTextExtractUtil.extractText(textStream, fileType);
    // ... use `text` here ...
} catch (Exception e) {
    // Best-effort extraction: a failure is logged with its cause rather than propagated.
    log.warn("文本提取失败,文件: {},类型: {}", fileName, fileType, e);
}

到此,这篇关于 Java 实现轻松提取 Word 和 PDF 文档内容的文章就介绍到这了。更多相关 Java 提取 Word 和 PDF 内容,请搜索代码网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持代码网!

(0)

相关文章:

版权声明:本文内容由互联网用户贡献,该文观点仅代表作者本人。本站仅提供信息存储服务,不拥有所有权,不承担相关法律责任。 如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件至 2386932994@qq.com 举报,一经查实将立刻删除。

发表评论

验证码:
Copyright © 2017-2025  代码网 保留所有权利. 粤ICP备2024248653号
站长QQ:2386932994 | 联系邮箱:2386932994@qq.com