一文学会在Python中从URL下载文件

自由坦荡的智能 2025-02-15 03:22:19

完整指南:从 URL 下载文件

当需要使用 Python 从 Internet 下载文件时,可以使用几种可靠的方法。本指南介绍了从基本下载到处理大文件和管理常见边缘情况的所有内容。让我们探索完成此操作的实用方法。

使用 urllib 下载基本文件

'urllib' 库内置于 Python 中,可以很好地处理简单的下载:

from urllib.request import urlretrieve

def download_file_simple(url, filename):
    """Download *url* to the local path *filename* using stdlib urllib.

    Suitable for simple one-shot downloads; offers no progress tracking
    or fine-grained error handling. Errors are reported to stdout.
    """
    try:
        urlretrieve(url, filename)
        # Was garbled to "(unknown)" in the published article; the
        # placeholder is clearly meant to echo the target filename.
        print(f"Successfully downloaded {filename}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage (guarded so importing this snippet has no side effects)
if __name__ == "__main__":
    url = "https://example.com/sample.pdf"
    download_file_simple(url, "sample.pdf")

此方法适用于基本下载,但缺乏进度跟踪和高级功能。让我们看看更好的选择。

使用请求:推荐的方法

'requests' 库提供了更多功能和更好的错误处理:

import requests

def download_file(url, filename):
    """Download *url* to *filename*, streaming the body in 8 KiB chunks.

    Returns True on success, False if any network/HTTP error occurred
    (the error is printed, not raised).
    """
    try:
        # Send a GET request to the URL. stream=True avoids loading the
        # whole body into memory; timeout guards against hung connections.
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()  # Raises an HTTPError for bad responses

        # Open the local file to write the downloaded content
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return False

# Example usage (guarded so importing this snippet has no side effects)
if __name__ == "__main__":
    url = "https://example.com/large-file.zip"
    success = download_file(url, "large-file.zip")
    if success:
        print("Download completed successfully")

'chunk_size=8192' 参数通过读取较小的内容来帮助在下载大型文件时管理内存使用情况。

添加进度跟踪

让我们添加一个进度条来查看下载的进展情况:

import requests
from tqdm import tqdm

def download_with_progress(url, filename):
    """Download *url* to *filename* with a tqdm progress bar.

    Shows download speed and an ETA when the server reports a
    Content-Length. Returns True on success, False on request errors.
    """
    try:
        # Send GET request (streamed; timeout guards slow/hung servers)
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()

        # Get the file size from headers (0 when the server omits it,
        # which makes tqdm show an open-ended bar)
        total_size = int(response.headers.get('content-length', 0))

        # Open file and create progress bar
        with open(filename, 'wb') as file, \
             tqdm(desc=filename,
                  total=total_size,
                  unit='iB',
                  unit_scale=True) as progress_bar:
            for data in response.iter_content(chunk_size=8192):
                # file.write returns the byte count, which feeds the bar
                size = file.write(data)
                progress_bar.update(size)
        return True
    except requests.exceptions.RequestException as e:
        print(f"Download error: {e}")
        return False

# Example usage (guarded so importing this snippet has no side effects)
if __name__ == "__main__":
    url = "https://example.com/large-file.zip"
    download_with_progress(url, "large-file.zip")

此版本显示一个进度条,其中包含下载速度和估计剩余时间。

处理不同类型的文件

下面是一个更强大的函数,用于处理各种文件类型并包含基本验证:

import requests
import os
import re
from urllib.parse import urlparse
import mimetypes
from tqdm import tqdm  # was missing in the original snippet -> NameError

def smart_download(url, output_dir="."):
    """Download *url* into *output_dir*, inferring a sensible filename.

    Filename resolution order: Content-Disposition header, then the last
    URL path segment, then "download" plus an extension guessed from the
    Content-Type. Returns the local file path on success, None on failure.
    """
    try:
        # Send HEAD request first to read headers without the body.
        # NOTE(review): some servers reject HEAD; a GET fallback may be
        # needed for those — confirm against target hosts.
        head_response = requests.head(url, timeout=30)
        head_response.raise_for_status()

        # Get filename from Content-Disposition, else from the URL path
        filename = ""
        content_disposition = head_response.headers.get('content-disposition')
        if content_disposition:
            fname = re.findall("filename=(.+)", content_disposition)
            if fname:
                filename = fname[0].strip('"')
        if not filename:
            filename = os.path.basename(urlparse(url).path)

        # If no extension in filename, try to guess from content-type
        if '.' not in filename:
            content_type = head_response.headers.get('content-type')
            if content_type:
                ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
                if ext:
                    # Keep the URL-derived base name when there is one
                    filename = f"{filename or 'download'}{ext}"
        if not filename:
            filename = "download"  # last resort, e.g. a bare domain URL

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, filename)

        # Download the file with progress tracking
        print(f"Downloading {url} to {filepath}")
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()

        total_size = int(response.headers.get('content-length', 0))
        with open(filepath, 'wb') as file, \
             tqdm(desc=filename,
                  total=total_size,
                  unit='iB',
                  unit_scale=True) as progress_bar:
            for data in response.iter_content(chunk_size=8192):
                size = file.write(data)
                progress_bar.update(size)
        return filepath
    except requests.exceptions.RequestException as e:
        print(f"Download failed: {e}")
        return None

# Example usage (guarded so importing this snippet has no side effects)
if __name__ == "__main__":
    urls = [
        "https://example.com/document.pdf",
        "https://example.com/image.jpg",
        "https://example.com/data.csv"
    ]
    for url in urls:
        downloaded_file = smart_download(url, "downloads")
        if downloaded_file:
            print(f"Successfully downloaded to {downloaded_file}")

处理身份验证和标头

从 API 或受保护的资源下载时,您可能需要处理身份验证:

import requests
from tqdm import tqdm  # was missing in the original snippet -> NameError

def download_with_auth(url, filename, headers=None, auth=None):
    """Download *url* to *filename* with optional headers/authentication.

    *headers* defaults to a browser-like User-Agent; *auth* is passed
    straight to requests (e.g. a (user, password) tuple for basic auth).
    Returns True on success, False on any request error.
    """
    try:
        # Set default headers if none provided
        if headers is None:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }

        # Make request with authentication and headers
        # (timeout guards against hung connections)
        response = requests.get(url, headers=headers, auth=auth,
                                stream=True, timeout=30)
        response.raise_for_status()

        # Download with progress tracking
        total_size = int(response.headers.get('content-length', 0))
        with open(filename, 'wb') as file, \
             tqdm(desc=filename,
                  total=total_size,
                  unit='iB',
                  unit_scale=True) as progress_bar:
            for chunk in response.iter_content(chunk_size=8192):
                size = file.write(chunk)
                progress_bar.update(size)
        return True
    except requests.exceptions.RequestException as e:
        print(f"Download failed: {e}")
        return False

# Example usage with basic auth (guarded: no side effects on import)
if __name__ == "__main__":
    url = "https://api.example.com/files/document.pdf"
    headers = {
        'Authorization': 'Bearer your-access-token',
        'Accept': 'application/pdf'
    }
    auth = ('username', 'password')  # Basic authentication
    success = download_with_auth(url, "document.pdf", headers=headers, auth=auth)

实际示例:下载多个文件

下面是一个同时下载多个文件的实际示例:

import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import os
from urllib.parse import urlparse  # was missing -> NameError in workers

def download_file_threaded(args):
    """Worker: download one (url, output_dir) pair.

    Returns the local file path on success, None on any failure
    (best-effort: the caller tallies Nones as failed downloads).
    """
    url, output_dir = args
    try:
        filename = os.path.basename(urlparse(url).path)
        filepath = os.path.join(output_dir, filename)

        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()

        with open(filepath, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        return filepath
    except Exception:
        return None

def download_multiple_files(urls, output_dir="downloads", max_workers=5):
    """Download *urls* concurrently into *output_dir*.

    Uses a thread pool (downloads are I/O-bound, so threads overlap the
    network waits). Prints a summary and returns the list of paths that
    downloaded successfully.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Prepare arguments for thread pool
    args = [(url, output_dir) for url in urls]

    # Download files using thread pool; tqdm tracks completed futures
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(
            executor.map(download_file_threaded, args),
            total=len(urls),
            desc="Downloading files"
        ))

    # Process results
    successful = [r for r in results if r is not None]
    failed = len(results) - len(successful)

    print(f"\nDownload complete:")
    print(f"- Successfully downloaded: {len(successful)} files")
    print(f"- Failed downloads: {failed} files")

    return successful

# Example usage (guarded so importing this snippet has no side effects)
if __name__ == "__main__":
    urls = [
        "https://example.com/file1.pdf",
        "https://example.com/file2.jpg",
        "https://example.com/file3.zip",
    ]
    downloaded_files = download_multiple_files(urls, "downloads", max_workers=3)

重要提示

1. 始终使用 'raise_for_status()' 来捕获 HTTP 错误
2. 使用 'stream=True' 和 'iter_content()' 流式传输大文件
3. 为网络问题添加适当的错误处理
4. 使用进度条获得更好的用户体验
5. 在内容完整性很重要时验证下载的文件
6. 下载多个文件时考虑速率限制
7. 处理慢速连接的超时

下面是文件验证的快速示例:

import hashlib

def validate_download(filepath, expected_hash):
    """Return True iff the SHA-256 digest of *filepath* equals *expected_hash*.

    Reads the file in 4 KiB chunks so arbitrarily large downloads can be
    verified without loading them fully into memory. *expected_hash* is a
    lowercase hex SHA-256 string. Raises OSError if the file is unreadable.
    """
    sha256_hash = hashlib.sha256()
    with open(filepath, "rb") as f:
        # iter(callable, sentinel): read until read() returns b""
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest() == expected_hash

# Example usage — guarded: the original ran this at import time and
# crashed with FileNotFoundError on the placeholder path.
if __name__ == "__main__":
    filepath = "downloaded_file.zip"
    expected_hash = "a1b2c3..."  # Expected SHA-256 hash
    if validate_download(filepath, expected_hash):
        print("File integrity verified")
    else:
        print("File may be corrupted")

通过使用这些方法和模式,您可以在处理常见问题和边缘情况时可靠地从 Internet 下载文件。请记住,要始终考虑安全隐患,并针对您的特定用例实施适当的错误处理。

0 阅读:13
自由坦荡的智能

自由坦荡的智能

感谢大家的关注