import argparse import internetarchive as ia import io import os import struct from concurrent.futures import ThreadPoolExecutor, as_completed from warcio.archiveiterator import ArchiveIterator import zstandard as zstd import re def list_warcs(collection_id): warcs = [] for result in ia.search_items(f"collection:{collection_id}"): item = ia.get_item(result['identifier']) for file in item.files: file_name = file['name'] if file_name.endswith('.warc.gz') or file_name.endswith('.warc') or file_name.endswith('.warc.zst'): warcs.append((result['identifier'], file_name)) return warcs def download_warc(collection_id, identifier, file_name, output_dir): output_path = os.path.join(output_dir, os.path.basename(file_name)) ia.download(identifier, files=[file_name], destdir=output_dir, no_directory=True, retries=3) return output_path def read_skippable_frame(stream): frame_header = stream.read(4) if len(frame_header) != 4: return None frame_size = struct.unpack('