从基础到进阶详解Python下载文件的方法完整指南_Python

在 Python 中下载文件是一项常见任务，无论是从网页下载图片、文档，还是通过 API 获取数据，掌握文件下载技术都是开发者的必备技能。本文将系统介绍 Python 下载文件的多种方法，涵盖基础实现、高级技巧和常见问题解决方案。

一、基础方法：使用标准库下载文件

1. 使用urllib.request（Python内置库）

import urllib.request

url = "https://example.com/file.zip"
filename = "downloaded_file.zip"

# urlretrieve fetches `url` and writes the response body to `filename`.
try:
    urllib.request.urlretrieve(url, filename)
except Exception as e:  # top-level script boundary: report any failure
    print(f"下载失败: {e}")
else:
    # Only reached when the download raised nothing.
    print(f"文件已下载到: {filename}")

特点：

无需安装第三方库
适合简单下载场景
缺乏进度显示和错误处理细节

2. 使用requests库（推荐）

import requests

url = "https://example.com/file.zip"
filename = "downloaded_file.zip"

try:
    # stream=True defers the body so it can be written chunk by chunk;
    # the `with` block releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions

        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)

    print(f"文件已下载到: {filename}")
except requests.exceptions.RequestException as e:
    print(f"下载失败: {e}")

优势：

更简洁的API
支持流式下载（适合大文件）
完善的错误处理机制
可添加请求头、代理等高级功能

二、进阶技巧：增强下载功能

1. 显示下载进度

import requests
from tqdm import tqdm  # 需要安装: pip install tqdm

url = "https://example.com/large_file.zip"
filename = "large_file.zip"

try:
    with requests.get(url, stream=True, timeout=30) as response:
        # Fail fast on 4xx/5xx instead of saving an error page as the file.
        response.raise_for_status()
        # The server may omit content-length; fall back to 0 in that case.
        total_size = int(response.headers.get('content-length', 0))

        with open(filename, 'wb') as f, tqdm(
            desc=filename,
            total=total_size or None,  # None tells tqdm the total is unknown
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
                bar.update(len(chunk))

    print("\n下载完成!")
except (requests.exceptions.RequestException, OSError) as e:
    # Network errors come from requests, file errors from open()/write().
    print(f"下载失败: {e}")

2. 断点续传功能

import os
import requests

url = "https://example.com/large_file.zip"
filename = "large_file.zip"

# Size of any partial download already on disk; 0 means start from scratch.
downloaded_size = os.path.getsize(filename) if os.path.exists(filename) else 0

# Only ask for a byte range when there is something to resume.
headers = {'range': f'bytes={downloaded_size}-'} if downloaded_size else {}

try:
    with requests.get(url, headers=headers, stream=True, timeout=30) as response:
        # 416 means the requested range starts at or past the end of the
        # resource -- presumably the local file is already complete
        # (NOTE(review): it could also be larger than the remote file).
        if response.status_code != 416:
            response.raise_for_status()

            # 206: the server honoured the range, so append to the partial file.
            # Anything else (200): the body is the whole file, so start over
            # rather than appending a full copy after the partial data.
            mode = 'ab' if response.status_code == 206 else 'wb'
            with open(filename, mode) as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

    print("下载完成!")
except (requests.exceptions.RequestException, OSError) as e:
    print(f"下载失败: {e}")

3. 多线程/异步下载（加速下载）

import os
import shutil
from concurrent.futures import ThreadPoolExecutor

import requests

def download_chunk(url, start, end, filename, chunk_num):
    """Download one byte range of ``url`` into ``{filename}.part{chunk_num}``.

    :param url: address of the file to download
    :param start: first byte offset of the range (inclusive)
    :param end: last byte offset of the range (inclusive)
    :param filename: final file name; the part file name is derived from it
    :param chunk_num: index of this part, used as the part file suffix
    :return: True if the part was written, False on any failure (the error
        is printed, never raised)
    """
    headers = {'range': f'bytes={start}-{end}'}
    try:
        with requests.get(url, headers=headers, stream=True, timeout=30) as response:
            response.raise_for_status()
            # Anything other than 206 means the range was not honoured; the
            # body would then be the whole file and corrupt the merged result.
            if response.status_code != 206:
                print(f"分块{chunk_num}下载失败: 服务器不支持范围请求 (HTTP {response.status_code})")
                return False

            with open(f"{filename}.part{chunk_num}", 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True
    except Exception as e:  # runs in a worker thread: report, do not raise
        print(f"分块{chunk_num}下载失败: {e}")
        return False

def merge_files(filename, num_chunks):
    """Concatenate ``{filename}.part0`` .. ``.part{num_chunks - 1}`` into ``filename``.

    Parts are appended in index order and each part file is deleted once it
    has been copied. A part that does not exist is skipped.

    :param filename: path of the merged output file (truncated if present)
    :param num_chunks: number of part indices to look for
    """
    with open(filename, 'wb') as outfile:
        for i in range(num_chunks):
            part_filename = f"{filename}.part{i}"
            if os.path.exists(part_filename):
                with open(part_filename, 'rb') as infile:
                    # Copy in buffered pieces instead of loading the whole
                    # part into memory at once.
                    shutil.copyfileobj(infile, outfile)
                os.remove(part_filename)

url = "https://example.com/very_large_file.zip"
filename = "very_large_file.zip"
file_size = 1024 * 1024 * 100  # assumed total file size: 100 MB
chunk_size = 1024 * 1024 * 10  # size of each part: 10 MB
# Ceiling division: a trailing partial chunk gets its own part, and a file
# smaller than one chunk still yields one part instead of zero.
num_chunks = (file_size + chunk_size - 1) // chunk_size

# Download the parts concurrently on a thread pool.
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = []
    for i in range(num_chunks):
        start = i * chunk_size
        # Byte ranges are inclusive; clamp the last part to the end of the file.
        end = min(start + chunk_size, file_size) - 1
        futures.append(executor.submit(
            download_chunk, url, start, end, filename, i
        ))

    # result() blocks until the part finishes and yields its success flag.
    results = [future.result() for future in futures]

# Merge only when every part arrived; otherwise the output would be corrupt.
if all(results):
    merge_files(filename, num_chunks)
    print("下载并合并完成!")
else:
    print("部分分块下载失败，未执行合并")

三、常见场景解决方案

1. 下载网页上的所有资源

import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def download_resources(url, output_folder="downloads"):
    """Download every image referenced by an ``<img>`` tag on a page.

    Failures are printed rather than raised: a failed page request ends the
    function early, a failed image is skipped.

    :param url: address of the HTML page to scan
    :param output_folder: directory the images are saved into (created if
        missing)
    """
    os.makedirs(output_folder, exist_ok=True)

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"网页下载失败: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Download the images.
    for img in soup.find_all('img'):
        img_url = img.get('src')
        # Skip tags without a src and inline data: URIs.
        if not img_url or img_url.startswith('data:'):
            continue

        # urljoin resolves relative, root-relative and protocol-relative
        # references against the final (post-redirect) page URL.
        img_url = urljoin(response.url, img_url)

        # Take the name from the URL path only, so the query string is not
        # part of the file name; skip URLs whose path has no last segment.
        img_name = os.path.basename(urlparse(img_url).path)
        if not img_name:
            continue

        try:
            img_response = requests.get(img_url, timeout=30)
            img_response.raise_for_status()
            with open(os.path.join(output_folder, img_name), 'wb') as f:
                f.write(img_response.content)
        except (requests.exceptions.RequestException, OSError) as e:
            print(f"图片下载失败: {e}")

    # CSS/JS and other resources can be downloaded in the same way.
    print("资源下载完成!")


download_resources("https://example.com")

2. 使用代理下载

import requests

# Proxy URL to use for each scheme of the target URL.
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}

url = "https://example.com"
try:
    response = requests.get(url, proxies=proxies, timeout=30)
    # Do not save an error page as if it were the requested document.
    response.raise_for_status()
    with open("page.html", 'w', encoding='utf-8') as f:
        f.write(response.text)
    print("通过代理下载成功!")
except (requests.exceptions.RequestException, OSError) as e:
    print(f"代理下载失败: {e}")

3. 处理下载重定向

import requests

url = "http://example.com/redirecting_link"
try:
    # Redirects are followed by default; the flag is spelled out for clarity.
    response = requests.get(url, allow_redirects=True, timeout=30)
    response.raise_for_status()
    final_url = response.url  # URL of the response after following redirects
    print(f"最终url: {final_url}")

    # Save the body of the final response.
    with open("final_file.txt", 'wb') as f:
        f.write(response.content)
except (requests.exceptions.RequestException, OSError) as e:
    print(f"下载失败: {e}")

四、最佳实践与注意事项

错误处理：始终添加异常处理，特别是网络请求可能因各种原因失败
资源清理：使用with语句确保文件正确关闭
大文件处理：使用流式下载(stream=True)和分块写入
安全性：
- 验证SSL证书（默认行为）
- 对用户提供的URL进行验证
- 限制文件类型和保存路径
性能优化：
- 合理设置分块大小（通常8KB-1MB）
- 多线程下载适合高延迟网络
- 考虑使用异步IO（如aiohttp）提高并发性能

五、完整示例：带进度条的下载函数

import requests
from tqdm import tqdm
import os

def download_file(url, filename=None, chunk_size=8192, timeout=30):
    """
    Download a file and show a progress bar while doing so.

    :param url: URL of the file
    :param filename: name to save the file under (optional; defaults to the
        last path segment of ``url``)
    :param chunk_size: size of each chunk in bytes
    :param timeout: timeout in seconds passed to ``requests.get``
    :return: the file name the data was saved under, or None if the request
        failed
    """
    # Derive the file name from the URL when none was given.
    if filename is None:
        # Drop the query string and fragment before taking the last path
        # segment; fall back to a fixed name when the URL ends with "/".
        filename = os.path.basename(url.split('?')[0].split('#')[0]) or "downloaded_file"

    try:
        # The context managers close the response, the file and the progress
        # bar even when the transfer fails part-way.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # Total size, when the server provides it.
            total_size = int(response.headers.get('content-length', 0))

            with open(filename, 'wb') as f, tqdm(
                desc=filename,
                total=total_size or None,  # None tells tqdm the total is unknown
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as progress_bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    progress_bar.update(len(chunk))
    except requests.exceptions.RequestException as e:
        print(f"下载失败: {e}")
        return None

    print(f"\n文件已保存到: {os.path.abspath(filename)}")
    return filename

# Example usage
download_file(
    url="https://example.com/sample.pdf",
    filename="my_document.pdf",
)