import json
import os
import sys
import time
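
# Scan newline-delimited JSON files for Google Drive links. Each input line is
# assumed to hold a single JSON object with a "body" text field; every link
# found is written to a per-input-file .txt of URLs.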

JSON_DATA_DIR = "JSON_DATA"  # input directory, relative to this script


def find_filenames():
    # Walk JSON_DATA_DIR and collect every file path, normalised to forward slashes.
    filenames = []
    for subdir, dirs, files in os.walk(JSON_DATA_DIR):
        for filename in files:
            filepath = os.path.join(subdir, filename).replace("\\", "/")
            filenames.append(filepath)
    return filenames


def scan_files_for_gdrive(filenames):
    # Map characters outside the Basic Multilingual Plane (e.g. emoji) to
    # U+FFFD so they cannot break printing or downstream handling.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xFFFD)
    search_string = "drive.google.com"
    max_expected_length = 90  # generous upper bound for the length of a Drive URL
    for filename in filenames:
        counter = 0
        hit_counter = 0
        start_time = time.time()
        print(f"scanning {filename}")
        url_links = []
        with open(filename, "r", encoding="utf-8") as readfile:
            for line in readfile:
                counter += 1
                body_data = json.loads(line.translate(non_bmp_map))["body"]
                # str.find replaces the original re.search call, whose
                # unescaped dots in "drive.google.com" matched any character.
                word_index_value = body_data.find(search_string)
                if word_index_value == -1:
                    continue
                # Slicing past the end of a string is safe in Python, so one
                # slice covers both the normal case and a match near the end
                # of the body (the old [index:-1] branch dropped the last char).
                possible_valid_url = body_data[word_index_value:word_index_value + max_expected_length]
                # The URL ends at the first space or newline, whichever comes first.
                segmented_url = possible_valid_url.split(" ")[0].split("\n")[0]
                url_links.append(segmented_url)
                hit_counter += 1
                if hit_counter % 10 == 0:
                    print(f"{hit_counter} items found, one of them is {segmented_url}")
        if url_links:
            print("writing results to text")
            with open(f"{os.path.basename(filename)}.txt", "w", encoding="utf-8") as outfile:
                for item in url_links:
                    outfile.write(item + "\n")
        duration = time.time() - start_time
        # Guard against a zero duration on very small files.
        items_per_second = round(counter / duration, 3) if duration > 0 else 0.0
        print(f"{items_per_second} items per second for {round(duration, 4)} seconds")


if __name__ == "__main__":
    filenames = find_filenames()
    scan_files_for_gdrive(filenames)
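
# Example with a hypothetical input file JSON_DATA/comments.ndjson containing:
#   {"body": "mirror here https://drive.google.com/file/d/abc123/view enjoy"}
# The run would write this line to comments.ndjson.txt (the match starts at
# the domain, so any scheme prefix like "https://" is dropped):
#   drive.google.com/file/d/abc123/view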