extern crate flate2;
extern crate rayon;
extern crate regex;
extern crate warc;
extern crate zstd;

use std::env;
use std::fs;
use std::io::{BufRead, BufReader, Read, Write};
use std::path::Path;
use std::time::Duration;

use flate2::read::GzDecoder;
use rayon::prelude::*;
use regex::Regex;
use warc::Record;
use zstd::stream::read::Decoder;

Fn read_skippable_frame(file: &mut fs::file) -> option<vec<u8>> {
Let mut magic_number = [0; 4];
If file.read_exact(&mut magic_number).is_err() || magic_number != [40, 181, 47, 253] {
Return none;
}
Let mut frame_size = [0; 4];
File.read_exact(&mut frame_size).unwrap();
Let frame_size = u32::from_le_bytes(frame_size) as usize;
Let mut frame = vec![0; frame_size];
File.read_exact(&mut frame).unwrap();
Some(frame)
}

Fn process_warc(file_path: &str, output_file_path: &str, pattern: &regex) {
Println!("processing file: {}", file_path);
Let mut counter = 0;
Let mut matches_buffer = vec::new();

Let warc_stream = match path::new(file_path).extension().and_then(|s| s.to_str()) {
Some("gz") => box::new(bufreader::new(gzdecoder::new(fs::file::open(file_path).unwrap()))) as box<dyn bufread>,
Some("zst") => {
Let mut raw_file = fs::file::open(file_path).unwrap();
Let dict_data = read_skippable_frame(&mut raw_file).unwrap();
Let dctx = zstd::decoder::with_dictionary(raw_file, &dict_data).unwrap();
Box::new(bufreader::new(dctx)) as box<dyn bufread>
},
_ => box::new(fs::file::open(file_path).unwrap()) as box<dyn bufread>,
};

Let mut warc_reader = warc::reader::new(warc_stream);
While let some(result) = warc_reader.next() {
Let record = result.unwrap();
If record.header().record_type() == some(record::response) {
Let content = string::from_utf8_lossy(record.content());
Let matches: vec<string> = pattern.find_iter(&content).map(|m| m.as_str().to_string()).collect();
Matches_buffer.extend(matches);
Counter += matches.len();

If counter % 100 == 0 {
Let mut output_file = fs::openoptions::new()
.append(true)
.create(true)
.open(output_file_path)
.unwrap();
Writeln!(output_file, "{}", matches_buffer.join("\n")).unwrap();
Matches_buffer.clear();
}
}
}
If !matches_buffer.is_empty() {
Let mut output_file = fs::openoptions::new()
.append(true)
.create(true)
.open(output_file_path)
.unwrap();
Writeln!(output_file, "{}", matches_buffer.join("\n")).unwrap();
}

Println!("removing file: {}", file_path);
Fs::remove_file(file_path).unwrap();
}

Fn process_warcs_in_directory(warc_directory: &str, output_file_path: &str, pattern: &regex) {
Loop {
Let file_names: vec<string> = fs::read_dir(warc_directory)
.unwrap()
.filter_map(|entry| {
Let entry = entry.unwrap();
Let file_name = entry.file_name().into_string().unwrap();
If file_name.ends_with(".warc.gz") || file_name.ends_with(".warc.zst") {
Some(file_name)
} else {
None
}
})
.collect();

File_names.par_iter().for_each(|file_name| {
Let file_path = format!("{}/{}", warc_directory, file_name);
Process_warc(&file_path, output_file_path, pattern);
});

Println!("sleeping 60s");
Std::thread::sleep(duration::from_secs(60));
}
}
Fn main() {
Let args: vec<string> = env::args().collect();
If args.len() < 3 {
Println!("usage: {} <warc_directory> <output_file_path>", args[0]);
Return;
}

Let warc_directory = &args[1];
Let output_file_path = &args[2];
Let regex_pattern = r"\s*imgur\s*";
Let pattern = regex::new(regex_pattern).unwrap();

Process_warcs_in_directory(warc_directory, output_file_path, &pattern);
}