Python数据清洗之抽取jsonl文件数据字段并合并_Python

基于python抽取目录下所有“jsonl”格式文件。遍历文件内某个字段进行抽取并合并。

实现代码

import os
import json
import time
from tqdm import tqdm  # 需要先安装：pip install tqdm
 
 
def _count_lines(path):
    """Return the number of lines in *path*, or 0 if the file cannot be opened.

    Undecodable bytes are replaced rather than raised so that the count,
    which only feeds the progress bar, never aborts the whole run.
    """
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as handle:
            return sum(1 for _ in handle)
    except OSError:
        return 0


def process_files(dir_path=r"d:\daku\关键词识别\1623-0000001\zh",
                  output_name="combined_contents.txt"):
    """Extract the "content" field of every record in a directory of .jsonl files.

    Every ``*.jsonl`` file directly inside *dir_path* is read line by line
    (largest file first). Each non-blank line is parsed as JSON, newlines
    inside its "content" value are replaced by spaces, and the result is
    appended to the output file followed by a blank line. Lines that fail to
    parse are reported on stdout and skipped. A summary is printed at the end.

    Args:
        dir_path: Directory containing the .jsonl files; the output file is
            written into the same directory.
        output_name: File name of the merged output.
    """
    # Collect the .jsonl files, largest first.
    file_list = sorted(
        (f for f in os.listdir(dir_path) if f.lower().endswith('.jsonl')),
        key=lambda x: os.path.getsize(os.path.join(dir_path, x)),
        reverse=True,
    )

    # Progress statistics. The line count is a pre-pass over every file and
    # is only used to size the progress bar.
    total_files = len(file_list)
    processed_files = 0
    total_lines = sum(_count_lines(os.path.join(dir_path, f)) for f in file_list)
    processed_lines = 0
    start_time = time.time()

    output_file = os.path.join(dir_path, output_name)

    with open(output_file, "w", encoding="utf-8") as outfile:
        with tqdm(total=total_lines, desc="合并进度", unit="line") as pbar:
            for filename in file_list:
                file_path = os.path.join(dir_path, filename)
                try:
                    with open(file_path, "r", encoding="utf-8") as infile:
                        # Iterating the file object streams it line by line,
                        # so memory use stays flat and line_num is the real
                        # line number inside this file.
                        for line_num, line in enumerate(infile, 1):
                            # Advance for every physical line (blank ones
                            # included) so the bar can reach total_lines.
                            pbar.update(1)

                            line = line.strip()
                            if not line:
                                continue

                            try:
                                data = json.loads(line)
                                # Flatten embedded newlines so one record
                                # occupies exactly one output line.
                                content = data.get("content", "").replace("\n", " ")
                                outfile.write(content + "\n\n")  # blank line separates records
                            except json.JSONDecodeError:
                                print(f"\njson解析失败: {filename} 第{line_num}行")
                                continue
                            except Exception as e:
                                # Best effort: e.g. the line is valid JSON but
                                # not an object, or "content" is not a string.
                                print(f"\n处理异常: {filename} 第{line_num}行 - {str(e)}")
                                continue

                            processed_lines += 1
                            if processed_lines % 1000 == 0:
                                elapsed = time.time() - start_time
                                speed = processed_lines / (elapsed + 1e-5)
                                remaining = (total_lines - processed_lines) / (speed + 1e-5)
                                pbar.set_postfix({
                                    '速度': f"{speed:.1f} lines/s",
                                    '剩余时间': f"{remaining // 3600:.0f}h {remaining % 3600 // 60:.0f}m"
                                })

                    processed_files += 1
                except Exception as e:
                    # A file that cannot be opened or decoded is reported and
                    # skipped; records already written from it are kept.
                    print(f"\n无法读取文件 {filename}: {str(e)}")

    # Summary report.
    end_time = time.time()
    print(f"\n合并完成！共处理 {processed_files}/{total_files} 个文件")
    print(f"总记录数: {processed_lines:,} 条")
    print(f"耗时: {end_time - start_time:.2f} 秒")
    print(f"输出文件路径: {output_file}")
    print(f"输出文件路径: {output_file}")
 
 
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    process_files()

知识延展：

1.python中json文件和jsonl文件的区别

众所周知，json 文件是使用 json（javascript object notation）格式存储数据的文件。它是一种结构化的文本格式，使用键值对的方式表示数据。json 文件通常包含一个根对象，可以包含多个嵌套的对象、数组和基本数据类型。

而 jsonl 文件（json lines），则是一种每行包含一个独立的 json 对象的文本文件格式。每行都是一个有效的 json 对象，和 json 的“list dict”不一样，jsonl 没有外层的“list”，只有一行行的“dict”，行与行之间使用换行符分隔，没有逗号或其他分隔符。相比于 json 文件，jsonl 文件更加轻量。这样的好处是方便逐行读取，而不用像 json 一样必须一次性把“list”里面的“dict”全部读出来，从而节省内存；同时每条记录独占一行，可读性也更好，而普通的 json 文件打开之后往往乱糟糟一团。读取 jsonl 可以直接用标准库 json 逐行解析（如上面的代码所示），也可以通过 pip 安装 jsonlines 包来简化读写。

json文件的内容示例：

[{"name": "john", "age": 30},
{"name": "jane", "age": 25},
{"name": "bob", "age": 40}]

jsonl文件的内容示例：

{"name": "john", "age": 30}
{"name": "jane", "age": 25}
{"name": "bob", "age": 40}

主要区别如下：

json 文件：

使用大括号 {} 表示对象，使用方括号 [] 表示数组。
整个文件是一个有效的 json 对象或数组。
适合存储结构化的数据，如配置文件、api 响应等。
一次性读取整个文件，解析成一个 json 对象，可以随机访问其中的数据。

jsonl 文件：

每行是一个独立的有效 json 对象。
每行之间没有逗号或其他分隔符。
适合存储每行为独立记录的数据，如日志、传感器数据等。
逐行读取文件，逐行解析 json 对象，一次处理一行的数据。

jsonl 文件适合用于以下情况：

当数据以行为单位独立存储，并且各行数据之间除换行符外没有其他分隔符时。
当需要逐行处理数据，以节省内存和提高处理速度时。
当数据量非常大，无法一次性加载到内存中时，jsonl 格式提供了一种流式处理数据的方式。

这么对比下来，json 文件更适合结构化的数据存储和传输，而 jsonl 文件更适合每行为独立记录的数据存储和处理。

到此这篇关于python中json文件和jsonl文件的区别小结的文章就介绍到这了,更多相关python json文件和jsonl文件区别内容请搜索代码网以前的文章或继续浏览下面的相关文章希望大家以后多多支持代码网！

2.抽取多个文本格式内容进行合并

即抽取目录下多个编码可能各不相同的文本（.txt）文件，按行进行合并并去重。

实现代码

import os
from chardet import detect
 
 
def get_safe_encoding(encoding):
    """Map a detected encoding name to a safer, compatible superset.

    Args:
        encoding: Encoding name as reported by the detector; may be ``None``
            or empty when detection failed.

    Returns:
        ``'gb18030'`` for ``gb2312``/``gbk`` (any letter case) and for a
        missing encoding; otherwise the lower-cased encoding name.
    """
    if not encoding:
        # No encoding was detected: use the same fallback the caller applies
        # to low-confidence results instead of failing on None.lower().
        return 'gb18030'
    encoding = encoding.lower()
    if encoding in ('gb2312', 'gbk'):
        return 'gb18030'  # superset of both GB2312 and GBK
    return encoding
 
 
def get_file_encoding(file_path):
    """Detect the encoding of a file and upgrade it to a safer variant.

    Only the first 10000 bytes of the file are sampled for detection. When
    the detector's confidence is below 0.8 the result is treated as
    unreliable and 'gb18030' is returned instead.
    """
    with open(file_path, 'rb') as handle:
        sample = handle.read(10000)

    detection = detect(sample)
    is_unreliable = detection['confidence'] < 0.8
    return 'gb18030' if is_unreliable else get_safe_encoding(detection['encoding'])
 
 
def merge_files(directory, output_filename='merged.txt'):
    """Merge all .txt files in *directory* into one file, dropping duplicate lines.

    Each input file is decoded with the encoding reported by
    ``get_file_encoding``; bytes that cannot be decoded are kept as
    backslash escapes. A line is written only the first time it is seen
    (compared without its trailing newline), across all input files. The
    output is written as UTF-8 to ``directory/output_filename``.

    Args:
        directory: Directory that holds the input files and receives the output.
        output_filename: Name of the merged output file.
    """
    seen_lines = set()
    output_path = os.path.join(directory, output_filename)

    # The output file itself ends in .txt; skip it so that a leftover from a
    # previous run is not read back while it is being rewritten. Sorting
    # makes the merge order deterministic.
    output_key = os.path.normcase(output_filename)
    txt_files = [
        os.path.join(directory, f)
        for f in sorted(os.listdir(directory))
        if f.lower().endswith('.txt') and os.path.normcase(f) != output_key
    ]

    with open(output_path, 'w', encoding='utf-8', errors='ignore') as outfile:
        for file_path in txt_files:
            try:
                file_enc = get_file_encoding(file_path)
                with open(file_path, 'r',
                          encoding=file_enc,
                          errors='backslashreplace') as infile:  # keep undecodable bytes as escapes
                    for line_idx, line in enumerate(infile, 1):
                        try:
                            stripped_line = line.rstrip('\n')
                            if stripped_line not in seen_lines:
                                # Always terminate with a newline: the last
                                # line of a file may lack one and would
                                # otherwise be glued to the next file's
                                # first line.
                                outfile.write(stripped_line + '\n')
                                seen_lines.add(stripped_line)
                        except Exception as line_err:
                            print(f"文件 {os.path.basename(file_path)} 第 {line_idx} 行处理异常: {str(line_err)}")
                            continue
            except Exception as file_err:
                # Best effort: report the unreadable file and move on.
                print(f"文件 {os.path.basename(file_path)} 读取失败: {str(file_err)}")
                continue
 
 
# Script entry point: merge the .txt files of the target directory and
# print the path of the merged output file.
if __name__ == '__main__':
    target_directory = r'd:\daku\关键词识别\stop6931'
    merge_files(target_directory)
    print(f'合并完成，输出文件：{os.path.join(target_directory, "merged.txt")}')

到此这篇关于python数据清洗之抽取jsonl文件数据字段并合并的文章就介绍到这了,更多相关python抽取jsonl数据字段内容请搜索代码网以前的文章或继续浏览下面的相关文章希望大家以后多多支持代码网！

Python数据清洗之抽取jsonl文件数据字段并合并

2025年03月26日 • Python •我要评论

相关文章:

发表评论


验证码：