代码介绍
本文旨在系统介绍如何利用 Python 主流库,从 Word(.docx)、Excel(.xlsx)、PDF(.pdf)和 PowerPoint(.pptx)四种常见格式的文档中提取原始文本。
相关软件安装
第三方库安装
在cmd界面输入:
pip install python-docx openpyxl pdfminer.six python-pptx PyQt5
程序打包
完整代码
import itertools
import os
import sys

import openpyxl
from docx import Document
from pdfminer.high_level import extract_text
from pptx import Presentation
from PyQt5.QtCore import Qt, QThread, pyqtSignal, QObject
from PyQt5.QtGui import QFont, QTextCursor
from PyQt5.QtWidgets import (QApplication, QMainWindow, QPushButton, QLabel,
                             QFileDialog, QTextEdit, QVBoxLayout, QHBoxLayout,
                             QWidget, QProgressBar, QMessageBox, QGroupBox,
                             QRadioButton, QButtonGroup)

# Maximum number of characters of extracted text shown in the preview pane.
PREVIEW_LIMIT = 2000


class BatchExtractorThread(QThread):
    """Worker thread that extracts text from one file or a whole folder.

    The thread never touches widgets. Everything the UI needs is reported
    through the signals below, which Qt delivers to the receiver's thread.

    Signals:
        progress_updated(int): overall progress, 0-100.
        batch_finished(str, int, int): summary text, success count, failure
            count. Emitted exactly once per run, including when the run was
            stopped or failed with an unexpected error.
        log_message(str): one human-readable log line.
        preview_ready(str): preview (at most PREVIEW_LIMIT characters plus a
            truncation notice) of the file that was just saved.
    """

    progress_updated = pyqtSignal(int)
    batch_finished = pyqtSignal(str, int, int)
    log_message = pyqtSignal(str)
    preview_ready = pyqtSignal(str)

    def __init__(self, input_path, output_path, is_folder=False, parent=None):
        """Store the job description; no work is done until start().

        Args:
            input_path: source file, or source folder when is_folder is true.
            output_path: target .txt file, or target root folder when
                is_folder is true.
            is_folder: process every supported file below input_path.
            parent: optional Qt parent object.
        """
        super().__init__(parent)
        self.input_path = input_path
        self.output_path = output_path
        self.is_folder = is_folder
        self.supported_exts = ['.docx', '.xlsx', '.pdf', '.pptx']
        self.success_count = 0
        self.fail_count = 0
        self.is_running = False
        # Kept separate from is_running so that a stop() issued before run()
        # has started is not overwritten when run() sets is_running = True.
        self._stop_requested = False
        # Output paths already handed out during this run (normalised for
        # case-insensitive file systems); used to avoid overwriting results.
        self._reserved_outputs = set()

    def run(self):
        """Thread entry point: process the job and always report a summary."""
        self.is_running = True
        try:
            self.progress_updated.emit(5)
            summary = self._run_folder() if self.is_folder else self._run_single()
            self.progress_updated.emit(100)
            self.batch_finished.emit(summary, self.success_count, self.fail_count)
        except Exception as e:
            # Thread boundary: report instead of letting the thread die
            # silently, so the UI can always restore its state.
            self.log_message.emit(f"处理过程总错误: {str(e)}")
            self.batch_finished.emit(f"处理失败: {str(e)}", 0, 1)
        finally:
            self.is_running = False

    def _run_folder(self) -> str:
        """Process every supported file below input_path; return the summary."""
        self.log_message.emit(f"开始批量处理文件夹: {self.input_path}")
        file_list = self.get_supported_files(self.input_path)
        total_files = len(file_list)

        if total_files == 0:
            self.log_message.emit("文件夹中未找到支持格式的文档")
            return "未找到可处理的文档"

        self.log_message.emit(f"共发现 {total_files} 个支持格式的文档")
        for idx, file_path in enumerate(file_list, 1):
            # Stopping is cooperative: it is only checked between files.
            if self._stop_requested:
                return self._terminated_summary()
            self.process_single_file(file_path, self.output_path)
            progress = 5 + int(idx / total_files * 90)
            self.progress_updated.emit(min(100, progress))

        return (f"批量处理完成!共处理 {total_files} 个文件,"
                f"成功 {self.success_count} 个,失败 {self.fail_count} 个")

    def _run_single(self) -> str:
        """Process the single input file; return the summary."""
        self.log_message.emit(f"开始处理单个文件: {self.input_path}")
        if self._stop_requested:
            return self._terminated_summary()
        self.process_single_file(self.input_path, self.output_path, is_single=True)
        return f"单文件处理完成!成功: {self.success_count} 个,失败: {self.fail_count} 个"

    def _terminated_summary(self) -> str:
        """Log the termination and build the summary for a stopped run."""
        self.log_message.emit("提取任务已被终止")
        return f"提取任务已被终止:成功 {self.success_count} 个,失败 {self.fail_count} 个"

    def get_supported_files(self, folder_path):
        """Return all supported files below folder_path (recursive).

        Names starting with "~$" are skipped: those are the lock files Office
        creates next to an open document and are not readable documents.
        """
        supported_files = []
        for root, _, files in os.walk(folder_path):
            for file in sorted(files):
                if file.startswith("~$"):
                    continue
                file_ext = os.path.splitext(file)[1].lower()
                if file_ext in self.supported_exts:
                    supported_files.append(os.path.join(root, file))
        return supported_files

    def process_single_file(self, file_path, output_root, is_single=False):
        """Extract one file and save the text; update the counters.

        Args:
            file_path: document to read.
            output_root: the exact output file when is_single is true,
                otherwise the root folder under which the input folder
                structure is mirrored.
            is_single: whether output_root is a file path.

        Errors are logged and counted in fail_count rather than raised, so
        one bad file does not abort a batch.
        """
        file_name = os.path.basename(file_path)
        file_ext = os.path.splitext(file_path)[1].lower()

        extractors = {
            '.docx': self.doc_extract,
            '.xlsx': self.excel_extract,
            '.pdf': self.pdf_extract,
            '.pptx': self.ppt_extract,
        }

        try:
            self.log_message.emit(f"正在处理: {file_name}")

            extractor = extractors.get(file_ext)
            if extractor is None:
                self.log_message.emit(f"跳过不支持的文件: {file_name}")
                self.fail_count += 1
                return

            if is_single:
                save_path = output_root
            else:
                # Mirror the sub-folder of the input below the output root.
                relative_path = os.path.relpath(os.path.dirname(file_path), self.input_path)
                output_dir = os.path.join(output_root, relative_path)
                os.makedirs(output_dir, exist_ok=True)
                base_name = os.path.splitext(file_name)[0]
                save_path = self._unique_save_path(output_dir, base_name, file_ext)

            content = extractor(file_path)

            # Whitespace-only text counts as "no content" as well.
            if content and content.strip():
                self.save_content(content, save_path)
                self.log_message.emit(f"✅ 成功: {file_name} -> 保存至 {os.path.basename(save_path)}")
                self.success_count += 1
                self.preview_ready.emit(self._make_preview(content))
            else:
                self.log_message.emit(f"⚠️ 无内容: {file_name}(未生成输出文件)")
                self.fail_count += 1

        except Exception as e:
            self.log_message.emit(f"❌ 失败: {file_name} - {str(e)}")
            self.fail_count += 1

    def _unique_save_path(self, output_dir, base_name, file_ext) -> str:
        """Return an output path not yet used in this run.

        The first file with a given base name gets "<base>_提取结果.txt".
        A later file in the same folder with the same base name (for example
        report.docx after report.pdf) gets the extension, and if necessary a
        counter, inserted so that it does not overwrite the earlier result.
        """
        ext_tag = file_ext.lstrip('.')
        candidates = itertools.chain(
            [f"{base_name}_提取结果.txt", f"{base_name}_{ext_tag}_提取结果.txt"],
            (f"{base_name}_{ext_tag}_{n}_提取结果.txt" for n in itertools.count(2)),
        )
        for candidate in candidates:
            save_path = os.path.join(output_dir, candidate)
            key = os.path.normcase(save_path)
            if key not in self._reserved_outputs:
                self._reserved_outputs.add(key)
                return save_path

    @staticmethod
    def _make_preview(content) -> str:
        """Return the first PREVIEW_LIMIT characters, flagged if truncated."""
        preview = content[:PREVIEW_LIMIT]
        if len(content) > PREVIEW_LIMIT:
            preview += "\n\n... 内容过长,仅显示前2000字符 ..."
        return preview

    @staticmethod
    def _require_file(file_path) -> None:
        """Raise FileNotFoundError when file_path does not exist."""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件不存在: {file_path}")

    def doc_extract(self, file_path) -> str:
        """Return the paragraph texts of a .docx file, one per line."""
        self._require_file(file_path)
        doc = Document(file_path)
        return '\n'.join(para.text for para in doc.paragraphs)

    def excel_extract(self, file_path) -> str:
        """Return the cell values of every worksheet of an .xlsx file.

        Each worksheet starts with a "=== 工作表: <title> ===" header line;
        each row is one line with tab-separated cells, empty cells as "".
        """
        self._require_file(file_path)
        data = []
        wb = openpyxl.load_workbook(file_path)
        try:
            # wb.worksheets (unlike wb.sheetnames) excludes chartsheets,
            # which have no cell grid to iterate.
            for ws in wb.worksheets:
                data.append(f"=== 工作表: {ws.title} ===")
                for row in ws.iter_rows(values_only=True):
                    data.append("\t".join("" if value is None else str(value) for value in row))
        finally:
            wb.close()
        return '\n'.join(data)

    def pdf_extract(self, file_path) -> str:
        """Return the text of a PDF file as produced by pdfminer."""
        self._require_file(file_path)
        return extract_text(file_path)

    def ppt_extract(self, file_path) -> str:
        """Return the text of every shape of a .pptx file, grouped by slide.

        Each slide starts with a "=== 幻灯片 <n> ===" header line; shapes
        without a non-empty text attribute are skipped.
        """
        self._require_file(file_path)
        content = []
        ppt = Presentation(file_path)
        for slide_idx, slide in enumerate(ppt.slides, 1):
            content.append(f"=== 幻灯片 {slide_idx} ===")
            for shape in slide.shapes:
                if hasattr(shape, 'text') and shape.text:
                    content.append(shape.text)
        return '\n'.join(content)

    def save_content(self, content, save_path) -> None:
        """Write content to save_path as UTF-8 text, replacing any old file."""
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(content)

    def stop(self):
        """Ask the thread to stop; it exits before starting the next file."""
        self._stop_requested = True
        self.is_running = False


class DocumentExtractorApp(QMainWindow):
    """Main window: choose a file or folder, run the extraction, show logs."""

    def __init__(self):
        super().__init__()
        # Must exist before any handler wired up in init_ui() can run.
        self.extractor_thread = None
        self.init_ui()

    # ------------------------------------------------------------------ UI

    def init_ui(self):
        """Build all widgets and initialise the path/mode state."""
        self.setWindowTitle("一站式提取word、excel、pdf 和ppt文档内容@阿幸")
        self.setGeometry(100, 100, 1000, 750)

        base_font = QFont()
        base_font.setFamily("simhei")
        base_font.setPointSize(10)
        self.setFont(base_font)

        main_layout = QVBoxLayout()
        main_layout.setContentsMargins(15, 15, 15, 15)
        main_layout.setSpacing(12)

        main_layout.addWidget(self._build_mode_group())
        main_layout.addWidget(self._build_path_group())
        main_layout.addLayout(self._build_action_row())

        self.progress_bar = QProgressBar()
        self.progress_bar.setVisible(False)
        self.progress_bar.setStyleSheet("margin-bottom: 10px; height: 25px;")
        main_layout.addWidget(self.progress_bar)

        # Stretch factor 2 lets the preview take the spare vertical space.
        main_layout.addWidget(self._build_result_group(), 2)
        main_layout.addWidget(self._build_log_group())

        central_widget = QWidget()
        central_widget.setLayout(main_layout)
        self.setCentralWidget(central_widget)

        self.input_path = ""
        self.output_path = ""
        self.is_folder_mode = False

    def _build_mode_group(self):
        """Create the single-file / folder mode selector."""
        mode_group = QGroupBox("处理模式")
        mode_group.setFont(QFont("simhei", 11, QFont.Bold))
        mode_layout = QHBoxLayout()
        mode_layout.setContentsMargins(20, 15, 20, 15)
        mode_layout.setSpacing(30)

        self.single_mode_radio = QRadioButton("单文件处理")
        self.folder_mode_radio = QRadioButton("文件夹批量处理")
        radio_font = QFont("simhei", 10)
        self.single_mode_radio.setFont(radio_font)
        self.folder_mode_radio.setFont(radio_font)

        # Button ids: 0 = single file, 1 = folder (read in switch_mode).
        self.mode_btn_group = QButtonGroup(self)
        self.mode_btn_group.addButton(self.single_mode_radio, 0)
        self.mode_btn_group.addButton(self.folder_mode_radio, 1)
        self.single_mode_radio.setChecked(True)
        self.mode_btn_group.buttonClicked.connect(self.switch_mode)

        mode_layout.addWidget(self.single_mode_radio)
        mode_layout.addWidget(self.folder_mode_radio)
        mode_group.setLayout(mode_layout)
        return mode_group

    def _build_path_row(self, placeholder, button_text, on_click):
        """Create one "label + choose button" row; return (layout, label, button)."""
        row = QHBoxLayout()
        row.setSpacing(10)

        label = QLabel(placeholder)
        label.setWordWrap(True)
        label.setStyleSheet("border: 1px solid #ccc; padding: 8px; min-height: 35px;")
        label.setFont(QFont("simhei", 9))

        button = QPushButton(button_text)
        button.setFont(QFont("simhei", 10))
        button.setMinimumWidth(120)
        button.clicked.connect(on_click)

        row.addWidget(label, 7)
        row.addWidget(button, 3)
        return row, label, button

    def _build_path_group(self):
        """Create the input/output path selection group."""
        path_group = QGroupBox("路径设置")
        path_group.setFont(QFont("simhei", 11, QFont.Bold))
        path_layout = QVBoxLayout()
        path_layout.setContentsMargins(15, 15, 15, 15)
        path_layout.setSpacing(10)

        input_layout, self.input_label, self.select_input_btn = self._build_path_row(
            "未选择输入(文件/文件夹)", "选择输入路径", self.select_input_path)
        output_layout, self.output_label, self.select_output_btn = self._build_path_row(
            "未选择输出(文件/文件夹)", "选择输出路径", self.select_output_path)

        path_layout.addLayout(input_layout)
        path_layout.addLayout(output_layout)
        path_group.setLayout(path_layout)
        return path_group

    @staticmethod
    def _make_action_button(text, style, font, on_click):
        """Create one 150px-minimum action button wired to on_click."""
        button = QPushButton(text)
        button.setStyleSheet(style)
        button.setFont(font)
        button.setMinimumWidth(150)
        button.clicked.connect(on_click)
        return button

    def _build_action_row(self):
        """Create the start / stop / clear button row."""
        action_layout = QHBoxLayout()
        action_layout.setSpacing(15)

        self.start_btn = self._make_action_button(
            "开始提取",
            "font-size: 14px; padding: 10px; background-color: #4caf50; color: white;",
            QFont("simhei", 10, QFont.Bold),
            self.start_extraction)
        self.start_btn.setEnabled(False)

        self.stop_btn = self._make_action_button(
            "终止提取",
            "font-size: 14px; padding: 10px; background-color: #f44336; color: white;",
            QFont("simhei", 10, QFont.Bold),
            self.stop_extraction)
        self.stop_btn.setEnabled(False)

        self.clear_btn = self._make_action_button(
            "清空日志",
            "font-size: 14px; padding: 10px;",
            QFont("simhei", 10),
            self.clear_logs)

        action_layout.addWidget(self.start_btn)
        action_layout.addWidget(self.stop_btn)
        action_layout.addWidget(self.clear_btn)
        return action_layout

    def _build_result_group(self):
        """Create the read-only preview pane."""
        result_group = QGroupBox("提取结果预览(仅显示最后一个文件的前2000字符)")
        result_group.setFont(QFont("simhei", 11, QFont.Bold))
        result_layout = QVBoxLayout()
        result_layout.setContentsMargins(10, 10, 10, 10)

        self.result_text = QTextEdit()
        self.result_text.setReadOnly(True)
        self.result_text.setFont(QFont("simhei", 9))

        result_layout.addWidget(self.result_text)
        result_group.setLayout(result_layout)
        return result_group

    def _build_log_group(self):
        """Create the read-only log pane."""
        log_group = QGroupBox("操作日志")
        log_group.setFont(QFont("simhei", 11, QFont.Bold))
        log_layout = QVBoxLayout()
        log_layout.setContentsMargins(10, 10, 10, 10)

        self.log_text = QTextEdit()
        self.log_text.setReadOnly(True)
        self.log_text.setMaximumHeight(120)
        self.log_text.setFont(QFont("simhei", 9))

        log_layout.addWidget(self.log_text)
        log_group.setLayout(log_layout)
        return log_group

    # ------------------------------------------------------------- helpers

    def _is_busy(self) -> bool:
        """Return True while an extraction thread exists and is running."""
        thread = self.extractor_thread
        return thread is not None and thread.isRunning()

    def _set_busy_ui(self, busy) -> None:
        """Enable/disable the controls for the running or the idle state."""
        self.select_input_btn.setEnabled(not busy)
        self.select_output_btn.setEnabled(not busy)
        self.start_btn.setEnabled(not busy)
        self.stop_btn.setEnabled(busy)
        self.progress_bar.setVisible(busy)

    # ------------------------------------------------------------ handlers

    def switch_mode(self, _button=None):
        """Switch between single-file and folder mode and reset the paths.

        _button receives the clicked button from buttonClicked and is unused.
        """
        if self._is_busy():
            QMessageBox.warning(self, "警告", "正在进行提取操作,无法切换模式")
            # Put the radio selection back to the mode that is in effect.
            if self.is_folder_mode:
                self.folder_mode_radio.setChecked(True)
            else:
                self.single_mode_radio.setChecked(True)
            return

        self.is_folder_mode = (self.mode_btn_group.checkedId() == 1)
        self.input_path = ""
        self.output_path = ""
        self.input_label.setText("未选择输入文件夹" if self.is_folder_mode else "未选择输入文件")
        self.output_label.setText("未选择输出文件夹" if self.is_folder_mode else "未选择输出文件")
        self.start_btn.setEnabled(False)
        self.log_message(f"已切换至{'文件夹批量处理' if self.is_folder_mode else '单文件处理'}模式")

    def select_input_path(self):
        """Let the user pick the input file/folder and suggest an output."""
        if self._is_busy():
            QMessageBox.warning(self, "警告", "正在进行提取操作,无法修改路径")
            return

        if self.is_folder_mode:
            folder_path = QFileDialog.getExistingDirectory(
                self, "选择输入文件夹", "", QFileDialog.ShowDirsOnly
            )
            if folder_path:
                self.input_path = folder_path
                self.input_label.setText(f"输入文件夹: {folder_path}")
                self.log_message(f"已选择输入文件夹: {folder_path}")
                # Suggest "<folder>_提取结果" next to the input folder, but
                # never override an output the user already chose.
                if not self.output_path:
                    folder_name = os.path.basename(folder_path)
                    self.output_path = os.path.join(os.path.dirname(folder_path),
                                                    f"{folder_name}_提取结果")
                    self.output_label.setText(f"输出文件夹: {self.output_path}")
        else:
            file_path, _ = QFileDialog.getOpenFileName(
                self, "选择文档文件", "",
                "支持的文件 (*.docx *.xlsx *.pdf *.pptx);;word 文件 (*.docx);;excel 文件 (*.xlsx);;pdf 文件 (*.pdf);;ppt 文件 (*.pptx);;所有文件 (*)"
            )
            if file_path:
                self.input_path = file_path
                self.input_label.setText(f"输入文件: {os.path.basename(file_path)}")
                self.log_message(f"已选择输入文件: {file_path}")
                if not self.output_path:
                    base_name = os.path.splitext(os.path.basename(file_path))[0]
                    self.output_path = os.path.join(os.path.dirname(file_path),
                                                    f"{base_name}_提取结果.txt")
                    self.output_label.setText(f"输出文件: {os.path.basename(self.output_path)}")

        self.check_btn_state()

    def select_output_path(self):
        """Let the user pick the output file/folder (input must be set)."""
        if self._is_busy():
            QMessageBox.warning(self, "警告", "正在进行提取操作,无法修改路径")
            return

        if not self.input_path:
            QMessageBox.warning(self, "警告", f"请先选择输入{'文件夹' if self.is_folder_mode else '文件'}")
            return

        if self.is_folder_mode:
            folder_path = QFileDialog.getExistingDirectory(
                self, "选择输出文件夹", os.path.dirname(self.input_path), QFileDialog.ShowDirsOnly
            )
            if folder_path:
                self.output_path = folder_path
                self.output_label.setText(f"输出文件夹: {folder_path}")
                self.log_message(f"已选择输出文件夹: {folder_path}")
        else:
            default_name = os.path.splitext(os.path.basename(self.input_path))[0] + "_提取结果.txt"
            file_path, _ = QFileDialog.getSaveFileName(
                self, "保存提取结果",
                os.path.join(os.path.dirname(self.input_path), default_name),
                "文本文件 (*.txt);;所有文件 (*)"
            )
            if file_path:
                self.output_path = file_path
                self.output_label.setText(f"输出文件: {os.path.basename(file_path)}")
                self.log_message(f"已选择输出文件: {file_path}")

        self.check_btn_state()

    def check_btn_state(self):
        """Enable Start only when both paths are set and nothing is running."""
        can_start = bool(self.input_path) and bool(self.output_path) and not self._is_busy()
        self.start_btn.setEnabled(can_start)

    def log_message(self, message):
        """Append one line to the log pane and scroll to the end."""
        self.log_text.append(message)
        self.log_text.moveCursor(QTextCursor.End)

    def clear_logs(self):
        """Clear the log and preview panes (allowed during extraction)."""
        self.log_text.clear()
        self.result_text.clear()
        self.log_message("已清空日志和结果预览")

    def show_preview(self, preview):
        """Show the preview text sent by the worker thread."""
        self.result_text.setText(preview)

    def start_extraction(self):
        """Validate the input path and start the worker thread."""
        if self._is_busy():
            return

        if not os.path.exists(self.input_path):
            QMessageBox.warning(self, "错误", f"输入{'文件夹' if self.is_folder_mode else '文件'}不存在")
            return

        self._set_busy_ui(True)
        self.progress_bar.setValue(0)
        self.result_text.clear()

        thread = BatchExtractorThread(
            input_path=self.input_path,
            output_path=self.output_path,
            is_folder=self.is_folder_mode,
            parent=self
        )
        thread.progress_updated.connect(self.update_progress)
        thread.batch_finished.connect(self.on_extraction_finished)
        thread.log_message.connect(self.log_message)
        thread.preview_ready.connect(self.show_preview)

        self.extractor_thread = thread
        thread.start()

    def stop_extraction(self):
        """Ask the worker to stop after the file it is currently processing.

        The controls are restored by on_extraction_finished once the worker
        has actually finished, not here.
        """
        if not self._is_busy():
            return

        reply = QMessageBox.question(
            self, "确认终止",
            "确定要终止当前提取操作吗?已处理的文件会保留,未处理的将停止。",
            QMessageBox.Yes | QMessageBox.No, QMessageBox.No
        )
        if reply != QMessageBox.Yes:
            return

        # Re-read: the run may have finished while the dialog was open, in
        # which case on_extraction_finished has already cleared the reference.
        thread = self.extractor_thread
        if thread is None:
            return
        thread.stop()
        self.stop_btn.setEnabled(False)
        self.log_message("正在终止提取操作,等待当前文件处理完成...")

    def update_progress(self, value):
        """Set the progress bar to value (0-100)."""
        self.progress_bar.setValue(value)

    def on_extraction_finished(self, summary, success_count, fail_count):
        """Restore the UI and show the summary when the worker is done."""
        # Drop the reference first so handlers that run while the message
        # box below is open see a consistent "idle" state.
        thread = self.extractor_thread
        self.extractor_thread = None
        if thread is not None:
            # batch_finished is the last thing run() emits, so this returns
            # almost immediately; afterwards the object can be released.
            thread.wait()
            thread.deleteLater()

        self._set_busy_ui(False)
        QMessageBox.information(self, "处理完成", summary)
        self.log_message(f"\n{summary}")

    def closeEvent(self, event):
        """Confirm before closing while an extraction is still running."""
        if not self._is_busy():
            event.accept()
            return

        reply = QMessageBox.question(
            self, "确认关闭",
            "正在进行提取操作,强制关闭可能导致文件损坏,确定要关闭吗?",
            QMessageBox.Yes | QMessageBox.No, QMessageBox.No
        )
        if reply != QMessageBox.Yes:
            event.ignore()
            return

        # Re-read: the worker may have finished while the dialog was open.
        thread = self.extractor_thread
        if thread is not None:
            thread.stop()
            thread.wait()
        event.accept()


def main():
    """Create the application, run the event loop and exit with its code."""
    app = QApplication(sys.argv)
    app.setFont(QFont("simhei", 10))

    window = DocumentExtractorApp()
    window.show()

    exit_code = app.exec_()

    # Make sure no worker outlives the event loop.
    thread = getattr(window, 'extractor_thread', None)
    if thread is not None and thread.isRunning():
        thread.stop()
        thread.wait()

    sys.exit(exit_code)


if __name__ == '__main__':
    main()
软件使用
可以选择单文件也可以选择文件夹
点击开始提取
以上就是使用 Python 一站式提取文档内容(Word、Excel、PDF 和 PPT)的详细内容,更多关于 Python 提取文档内容的资料请关注代码网其它相关文章!
发表评论