Java高效处理超大文本文件的技巧分享_Java

Java 读取超大文本文件的方式详解

1. BufferedReader 逐行读取（推荐）

这是处理大文件最常用且高效的方式，内存占用小。

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

public class LargeFileReader {
    /**
     * Reads a large text file line by line through a {@link BufferedReader}.
     *
     * <p>Only the current line is held in memory, so the file size does not
     * affect heap usage. The file is decoded as UTF-8 explicitly instead of
     * relying on the platform default charset, which differs between machines
     * on JDKs older than 18.
     *
     * <p>I/O failures (including a missing file) are reported by printing the
     * stack trace; they are not propagated to the caller.
     *
     * @param filePath path of the file to read
     */
    public static void readWithBufferedReader(String filePath) {
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8))) {
            String line;
            long lineNumber = 0;

            // readLine() returns null at end of file; the file is never loaded as a whole.
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                processLine(line, lineNumber);

                // Optional progress report every million lines.
                if (lineNumber % 1000000 == 0) {
                    System.out.println("已处理 " + lineNumber + " 行");
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Hook for the per-line business logic (parsing, filtering, transforming, ...).
     *
     * @param line       content of the line without its line terminator
     * @param lineNumber 1-based number of the line within the file
     */
    private static void processLine(String line, long lineNumber) {
        // Intentionally empty: put the real processing here.
    }
}

2. Files.lines() 流式处理（Java 8+）

利用 Java 8 的 Stream API，代码更简洁现代。

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.stream.Stream;

public class StreamFileReader {
    /**
     * Processes a large file lazily, line by line, with {@link Files#lines}.
     *
     * <p>The stream is consumed sequentially. The previous
     * {@code parallel().forEachOrdered(...)} combination still ran the action
     * one line at a time in file order, so it only added coordination and
     * buffering overhead without any speed-up.
     *
     * <p>{@code Files.lines} throws {@link IOException} when the file cannot
     * be opened, but reports failures that happen while the stream is being
     * consumed (read errors, bytes that are not valid UTF-8) as
     * {@link UncheckedIOException}. Both are handled the same way here: the
     * stack trace is printed and the method returns.
     *
     * @param filePath path of the file to read (decoded as UTF-8)
     */
    public static void readWithStreams(String filePath) {
        // try-with-resources closes the underlying file handle held by the stream.
        try (Stream<String> lines = Files.lines(Paths.get(filePath))) {
            lines.forEachOrdered(StreamFileReader::processLine);
        } catch (IOException | UncheckedIOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Hook for the per-line business logic.
     *
     * @param line content of the line without its line terminator
     */
    private static void processLine(String line) {
        // Intentionally empty: put the real processing here.
    }
}

3. 内存映射文件（MappedByteBuffer）

适用于需要随机访问的大文件处理。

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.function.Consumer;

public class MappedFileReader {
    /** Default size of one mapped region: 100 MB. */
    private static final int DEFAULT_MAP_SIZE = 1024 * 1024 * 100;

    /**
     * Reads a large file through memory-mapped regions of 100 MB each and
     * passes every line to the internal {@code processLine} hook.
     *
     * @param filePath path of the file to read (decoded as UTF-8)
     */
    public static void readWithMemoryMapping(String filePath) {
        readWithMemoryMapping(filePath, DEFAULT_MAP_SIZE, MappedFileReader::processLine);
    }

    /**
     * Reads a large file through memory-mapped regions and hands each line to
     * {@code lineHandler}.
     *
     * <p>Region boundaries fall at arbitrary byte offsets, so a line, or even
     * a single multi-byte UTF-8 character, can straddle two regions. The bytes
     * of the unfinished line are therefore carried over to the next region and
     * a line is only decoded once its terminating {@code '\n'} has been seen.
     * A trailing {@code '\r'} is dropped so CRLF files yield clean lines. A
     * final line without terminator is delivered if it is not empty.
     *
     * <p>I/O failures are reported by printing the stack trace; they are not
     * propagated to the caller.
     *
     * @param filePath    path of the file to read (decoded as UTF-8)
     * @param mapSize     size in bytes of each mapped region; must be positive
     * @param lineHandler callback invoked once per line, in file order
     * @throws IllegalArgumentException if {@code mapSize} is not positive
     * @throws NullPointerException     if {@code lineHandler} is {@code null}
     */
    public static void readWithMemoryMapping(String filePath, int mapSize,
                                             Consumer<String> lineHandler) {
        if (mapSize <= 0) {
            throw new IllegalArgumentException("mapSize must be positive: " + mapSize);
        }
        Objects.requireNonNull(lineHandler, "lineHandler");

        try (RandomAccessFile file = new RandomAccessFile(filePath, "r");
             FileChannel channel = file.getChannel()) {

            long fileSize = channel.size();
            long position = 0;
            // Bytes of the line that has not been terminated yet; survives region changes.
            ByteArrayOutputStream pendingLine = new ByteArrayOutputStream();

            while (position < fileSize) {
                long size = Math.min(mapSize, fileSize - position);

                MappedByteBuffer buffer = channel.map(
                    FileChannel.MapMode.READ_ONLY,
                    position,
                    size
                );

                processBuffer(buffer, pendingLine, lineHandler);

                position += size;
            }

            // Last line of a file that does not end with a line terminator.
            if (pendingLine.size() > 0) {
                lineHandler.accept(decodeLine(pendingLine));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Scans one mapped region byte by byte and emits every line completed in it.
     *
     * <p>Splitting on the raw {@code '\n'} byte is safe for UTF-8 because that
     * value never occurs inside a multi-byte sequence.
     *
     * @param buffer      mapped region positioned at its first unread byte
     * @param pendingLine bytes of the current unfinished line, updated in place
     * @param lineHandler callback invoked for each completed line
     */
    private static void processBuffer(MappedByteBuffer buffer,
                                      ByteArrayOutputStream pendingLine,
                                      Consumer<String> lineHandler) {
        while (buffer.hasRemaining()) {
            byte current = buffer.get();
            if (current == '\n') {
                lineHandler.accept(decodeLine(pendingLine));
                pendingLine.reset();
            } else {
                pendingLine.write(current);
            }
        }
    }

    /**
     * Decodes the collected bytes of one line as UTF-8, without a trailing {@code '\r'}.
     *
     * @param pendingLine raw bytes of the line, excluding the {@code '\n'}
     * @return the decoded line
     */
    private static String decodeLine(ByteArrayOutputStream pendingLine) {
        byte[] raw = pendingLine.toByteArray();
        int length = raw.length;
        if (length > 0 && raw[length - 1] == '\r') {
            length--;
        }
        return new String(raw, 0, length, StandardCharsets.UTF_8);
    }

    /**
     * Hook for the per-line business logic used by the single-argument entry point.
     *
     * @param line content of the line without its line terminator
     */
    private static void processLine(String line) {
        // Intentionally empty: put the real processing here.
    }
}

4. 分块读取（自定义缓冲区）

手动控制缓冲区大小，精确管理内存。

import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.function.Consumer;

public class ChunkFileReader {
    /** Default read buffer size: 8 KB. */
    private static final int DEFAULT_BUFFER_SIZE = 8192;

    /**
     * Reads a large file in 8 KB chunks and passes every line to the internal
     * {@code processLine} hook.
     *
     * @param filePath path of the file to read (decoded as UTF-8)
     */
    public static void readInChunks(String filePath) {
        readInChunks(filePath, DEFAULT_BUFFER_SIZE, ChunkFileReader::processLine);
    }

    /**
     * Reads a large file in fixed-size chunks and hands each line to
     * {@code lineHandler}.
     *
     * <p>Chunks end at arbitrary byte offsets, so a multi-byte UTF-8 character
     * can be split between two reads. Decoding each chunk separately would
     * corrupt such characters, so the raw bytes of the current line are
     * collected and only decoded once the terminating {@code '\n'} has been
     * read. A trailing {@code '\r'} is dropped so CRLF files yield clean
     * lines. A final line without terminator is delivered if it is not empty.
     *
     * <p>I/O failures are reported by printing the stack trace; they are not
     * propagated to the caller.
     *
     * @param filePath    path of the file to read (decoded as UTF-8)
     * @param bufferSize  size in bytes of the read buffer; must be positive
     * @param lineHandler callback invoked once per line, in file order
     * @throws IllegalArgumentException if {@code bufferSize} is not positive
     * @throws NullPointerException     if {@code lineHandler} is {@code null}
     */
    public static void readInChunks(String filePath, int bufferSize,
                                    Consumer<String> lineHandler) {
        if (bufferSize <= 0) {
            throw new IllegalArgumentException("bufferSize must be positive: " + bufferSize);
        }
        Objects.requireNonNull(lineHandler, "lineHandler");

        try (FileInputStream fis = new FileInputStream(filePath);
             FileChannel channel = fis.getChannel()) {

            ByteBuffer buffer = ByteBuffer.allocate(bufferSize);
            // Bytes of the line that has not been terminated yet; survives chunk changes.
            ByteArrayOutputStream pendingLine = new ByteArrayOutputStream();

            while (channel.read(buffer) != -1) {
                buffer.flip(); // switch the buffer from filling to draining

                processCompleteLines(buffer, pendingLine, lineHandler);

                buffer.clear(); // make the whole buffer available for the next read
            }

            // Last line of a file that does not end with a line terminator.
            if (pendingLine.size() > 0) {
                lineHandler.accept(decodeLine(pendingLine));
            }

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Drains one chunk and emits every line completed in it; the bytes after
     * the last {@code '\n'} stay in {@code pendingLine} for the next chunk.
     *
     * <p>Splitting on the raw {@code '\n'} byte is safe for UTF-8 because that
     * value never occurs inside a multi-byte sequence.
     *
     * @param buffer      chunk in drain mode (already flipped)
     * @param pendingLine bytes of the current unfinished line, updated in place
     * @param lineHandler callback invoked for each completed line
     */
    private static void processCompleteLines(ByteBuffer buffer,
                                             ByteArrayOutputStream pendingLine,
                                             Consumer<String> lineHandler) {
        while (buffer.hasRemaining()) {
            byte current = buffer.get();
            if (current == '\n') {
                lineHandler.accept(decodeLine(pendingLine));
                pendingLine.reset();
            } else {
                pendingLine.write(current);
            }
        }
    }

    /**
     * Decodes the collected bytes of one line as UTF-8, without a trailing {@code '\r'}.
     *
     * @param pendingLine raw bytes of the line, excluding the {@code '\n'}
     * @return the decoded line
     */
    private static String decodeLine(ByteArrayOutputStream pendingLine) {
        byte[] raw = pendingLine.toByteArray();
        int length = raw.length;
        if (length > 0 && raw[length - 1] == '\r') {
            length--;
        }
        return new String(raw, 0, length, StandardCharsets.UTF_8);
    }

    /**
     * Hook for the per-line business logic used by the single-argument entry point.
     *
     * @param line content of the line without its line terminator
     */
    private static void processLine(String line) {
        // Intentionally empty: put the real processing here.
    }
}

5. 完整示例：处理 1TB 文本文件

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Locale;

public class TerabyteFileProcessor {

    /** Buffer size, in chars, of the reader used for the processing pass. */
    private static final int PROCESSING_BUFFER_SIZE = 8192 * 2;

    /** Buffer size, in chars, of the reader used for the counting pass. */
    private static final int COUNTING_BUFFER_SIZE = 8192;

    public static void main(String[] args) {
        String largeFilePath = "/path/to/your/1tb_file.txt";

        processLargeFile(largeFilePath);
    }

    /**
     * Processes a very large text file line by line and reports progress.
     *
     * <p>The file is read twice: once to count its lines, so that progress can
     * be shown as a percentage, and once to process them. The counting pass is
     * optional; on a file of this size it costs a complete extra read.
     *
     * <p>I/O failures are reported on standard error and are not propagated.
     *
     * @param filePath path of the file to process (decoded as UTF-8)
     */
    public static void processLargeFile(String filePath) {
        long processedLines = 0;

        try {
            // Optional first pass: total line count for the percentage display.
            long totalLines = countLines(filePath);
            System.out.println("文件总行数: " + totalLines);

            try (BufferedReader reader = openReader(filePath, PROCESSING_BUFFER_SIZE)) {
                String line;
                while ((line = reader.readLine()) != null) {
                    processedLines++;

                    handleDataLine(line, processedLines);

                    // Progress report every million lines.
                    if (processedLines % 1000000 == 0) {
                        double progress = (double) processedLines / totalLines * 100;
                        System.out.printf("处理进度: %.2f%% (%d/%d行)\n",
                                progress, processedLines, totalLines);
                    }
                }
            }

            System.out.println("文件处理完成，共处理 " + processedLines + " 行");

        } catch (IOException e) {
            System.err.println("处理文件时发生错误: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Opens the file as a buffered UTF-8 reader.
     *
     * <p>The charset is given explicitly because {@code FileReader} falls back
     * to the platform default charset on JDKs older than 18.
     *
     * @param filePath   path of the file to open
     * @param bufferSize size of the reader's buffer in chars
     * @return a reader the caller must close
     * @throws IOException if the file cannot be opened
     */
    private static BufferedReader openReader(String filePath, int bufferSize) throws IOException {
        return new BufferedReader(
                new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8),
                bufferSize);
    }

    /**
     * Counts the lines of a file by reading it completely.
     *
     * @param filePath path of the file
     * @return number of lines
     * @throws IOException if the file cannot be opened or read
     */
    private static long countLines(String filePath) throws IOException {
        long lines = 0;
        try (BufferedReader reader = openReader(filePath, COUNTING_BUFFER_SIZE)) {
            while (reader.readLine() != null) {
                lines++;
            }
        }
        return lines;
    }

    /**
     * Processes a single line; replace the body with the real logic
     * (cleaning, transforming, storing to a database, ...).
     *
     * @param line       content of the line
     * @param lineNumber 1-based number of the line within the file
     */
    private static void handleDataLine(String line, long lineNumber) {
        if (line == null) {
            return;
        }
        String trimmed = line.trim();
        if (trimmed.isEmpty()) {
            return; // blank lines carry no data
        }

        // Example transformation. Locale.ROOT keeps the result independent of
        // the JVM's default locale (e.g. Turkish dotless-i rules).
        String processedData = trimmed.toUpperCase(Locale.ROOT);
        // processedData would be written to its destination here.

        // Placeholder pause simulating processing time. One millisecond every
        // ten million lines has no practical throttling effect on CPU usage.
        if (lineNumber % 10000000 == 0) {
            try {
                Thread.sleep(1);
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers can still observe it.
                Thread.currentThread().interrupt();
            }
        }
    }
}

不同方法的优缺点对比

方法	优点	缺点	适用场景
`BufferedReader`	简单易用，内存效率高，兼容性好	需要手动管理行处理逻辑	大多数文本文件处理场景
`Files.lines()`	代码简洁，支持并行处理，函数式编程风格	需要 Java 8+，异常处理复杂	现代 Java 应用，需要并行处理
`MappedByteBuffer`	直接内存访问，随机读取效率高	内存映射有限制，实现复杂	需要随机访问或高性能读取
分块读取	精确控制内存使用，灵活性高	实现复杂，需要处理边界情况	特殊需求，对内存控制要求严格