import json
import os
import sys
import time
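
# Scan newline-delimited JSON files for Google Drive links. Each input line is
# assumed to hold a single JSON object with a "body" text field; every link
# found is written to a per-input-file .txt of URLs.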

JSON_DATA_DIR = "JSON_DATA"  # input directory, relative to this script


def find_filenames():
    # Walk JSON_DATA_DIR and collect every file path, normalised to forward slashes.
    filenames = []
    for subdir, dirs, files in os.walk(JSON_DATA_DIR):
        for filename in files:
            filepath = os.path.join(subdir, filename).replace("\\", "/")
            filenames.append(filepath)
    return filenames


def scan_files_for_gdrive(filenames):
    # Map characters outside the Basic Multilingual Plane (e.g. emoji) to
    # U+FFFD so they cannot break printing or downstream handling.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xFFFD)
    search_string = "drive.google.com"
    max_expected_length = 90  # generous upper bound for the length of a Drive URL
    for filename in filenames:
        counter = 0
        hit_counter = 0
        start_time = time.time()
        print(f"scanning {filename}")
        url_links = []
        with open(filename, "r", encoding="utf-8") as readfile:
            for line in readfile:
                counter += 1
                body_data = json.loads(line.translate(non_bmp_map))["body"]
                # str.find replaces the original re.search call, whose
                # unescaped dots in "drive.google.com" matched any character.
                word_index_value = body_data.find(search_string)
                if word_index_value == -1:
                    continue
                # Slicing past the end of a string is safe in Python, so one
                # slice covers both the normal case and a match near the end
                # of the body (the old [index:-1] branch dropped the last char).
                possible_valid_url = body_data[word_index_value:word_index_value + max_expected_length]
                # The URL ends at the first space or newline, whichever comes first.
                segmented_url = possible_valid_url.split(" ")[0].split("\n")[0]
                url_links.append(segmented_url)
                hit_counter += 1
                if hit_counter % 10 == 0:
                    print(f"{hit_counter} items found, one of them is {segmented_url}")
        if url_links:
            print("writing results to text")
            with open(f"{os.path.basename(filename)}.txt", "w", encoding="utf-8") as outfile:
                for item in url_links:
                    outfile.write(item + "\n")
        duration = time.time() - start_time
        # Guard against a zero duration on very small files.
        items_per_second = round(counter / duration, 3) if duration > 0 else 0.0
        print(f"{items_per_second} items per second for {round(duration, 4)} seconds")


if __name__ == "__main__":
    filenames = find_filenames()
    scan_files_for_gdrive(filenames)
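
# Example with a hypothetical input file JSON_DATA/comments.ndjson containing:
#   {"body": "mirror here https://drive.google.com/file/d/abc123/view enjoy"}
# The run would write this line to comments.ndjson.txt (the match starts at
# the domain, so any scheme prefix like "https://" is dropped):
#   drive.google.com/file/d/abc123/view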