import json
import sys
import os
import re
import time

# Directory (relative to this script) that holds the JSON files to scan.
directory_below_python_file = "json_data"
def find_filenames():
    """Walk the data directory and return every file path found, using forward slashes."""
    filenames = []
    for subdir, dirs, files in os.walk(directory_below_python_file):
        for filename in files:
            filepath = (subdir + os.sep + filename).replace("\\", "/")
            filenames.append(filepath)
    return filenames

def scan_files_for_gdrive(filenames):
    # Map characters outside the Basic Multilingual Plane to U+FFFD,
    # in case anyone decided to use bad chars.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
    for filename in filenames:
        counter = 0
        hit_counter = 0
        start_time = time.time()
        print(f"scanning {filename}")
        url_links = []
        search_string = "drive.google.com"
        max_expected_length = 90
        with open(filename, "r", encoding="utf-8") as readfile:
            for line in readfile:
                counter += 1
                body_data = json.loads(line.translate(non_bmp_map))['body']
                if search_string in body_data:
                    # re.escape keeps the dots in the search string literal.
                    search = re.search(re.escape(search_string), body_data)
                    word_index_value = search.start()
                    # Take at most max_expected_length characters from the match onwards.
                    if len(body_data) <= word_index_value + max_expected_length + 1:
                        possible_valid_url = body_data[word_index_value:-1]
                    else:
                        possible_valid_url = body_data[word_index_value:word_index_value + max_expected_length]
                    # Trim the candidate URL at the first space or newline.
                    segmented_url = possible_valid_url.split(" ")[0].split("\n")[0]
                    url_links.append(segmented_url)
                    hit_counter += 1
                    if hit_counter % 10 == 0:
                        print(f"{hit_counter} items found, one of them is {segmented_url}")

        if len(url_links) > 0:
            print("writing results to text")
            with open(f"{filename.split('/')[-1]}.txt", "w", encoding="utf-8") as outfile:
                for item in url_links:
                    outfile.write(item + "\n")
        end_time = time.time()
        duration = end_time - start_time
        ips = round(counter / duration, 3)
        print(f"{ips} items per second for {round(duration, 4)} seconds")

if __name__ == "__main__":
    filenames = find_filenames()
    scan_files_for_gdrive(filenames)