import json
import sys
import os
import time

# Root directory (relative to this script) that is walked for input files.
Directory_below_python_file = "JSON_DATA"


def find_filenames():
    """Collect every file path under the data directory, using forward slashes."""
    filenames = []
    for subdir, dirs, files in os.walk(Directory_below_python_file):
        for filename in files:
            filepath = (subdir + os.sep + filename).replace("\\", "/")
            filenames.append(filepath)
    return filenames


def scan_files_for_gdrive(filenames):
    # Map every non-BMP code point to U+FFFD, for if anyone decided to use
    # bad chars that would break downstream handling.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xFFFD)
    search_string = "drive.google.com"
    max_expected_length = 90  # generous upper bound for a Drive URL

    for filename in filenames:
        counter = 0
        hit_counter = 0
        start_time = time.time()
        print(f"scanning {filename}")
        url_links = []

        with open(filename, "r", encoding="utf-8") as readfile:
            for line in readfile:
                counter += 1
                # Each line is one JSON object; tolerate records without a body field.
                body_data = json.loads(line.translate(non_bmp_map)).get("body", "")
                if search_string in body_data:
                    word_index_value = body_data.index(search_string)
                    # Slicing past the end of a string is safe in Python, so one
                    # slice covers both "URL near the end" and "URL mid-text".
                    possible_valid_url = body_data[word_index_value:word_index_value + max_expected_length]
                    # Cut the candidate at the first space or newline.
                    segmented_url = possible_valid_url.split(" ")[0].split("\n")[0]
                    url_links.append(segmented_url)
                    hit_counter += 1
                    if hit_counter % 10 == 0:
                        print(f"{hit_counter} items found, one of them is {segmented_url}")

        if len(url_links) > 0:
            print("writing results to text")
            # One .txt per input file, written next to this script.
            with open(f"{filename.split('/')[-1]}.txt", "w", encoding="utf-8") as outfile:
                for item in url_links:
                    outfile.write(item + "\n")

        duration = time.time() - start_time
        # Guard against a zero duration on very small files.
        ips = round(counter / duration, 3) if duration else 0.0
        print(f"{ips} items per second for {round(duration, 4)} seconds")


if __name__ == "__main__":
    filenames = find_filenames()
    scan_files_for_gdrive(filenames)
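

# Assumption, for illustration only: the scanner expects newline-delimited
# JSON where each line is an object carrying a "body" string field (the
# layout of e.g. Reddit comment dumps). The hypothetical helper below writes
# a tiny sample input for testing; nothing in the script calls it, and the
# Drive link inside it is made up.
def _write_sample_input(path=Directory_below_python_file + "/sample.jsonl"):
    os.makedirs(Directory_below_python_file, exist_ok=True)
    sample = [
        {"body": "see https://drive.google.com/file/d/abc123/view for the file"},
        {"body": "a line with no link in it"},
    ]
    with open(path, "w", encoding="utf-8") as f:
        for obj in sample:
            f.write(json.dumps(obj) + "\n")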