35 lines
1.1 KiB
Python
35 lines
1.1 KiB
Python
import json
|
|
import pymmh3
|
|
import binascii
|
|
|
|
def question_hash(question):
    """Return the MurmurHash3 128-bit digest of *question* as a hex string.

    The 128-bit integer produced by ``pymmh3.hash128`` (x64 variant) is cut
    into two 64-bit halves. Each half is serialised big-endian, and the LOW
    half is emitted before the HIGH half. The resulting 16 bytes are returned
    as a 32-character lowercase hexadecimal string.
    """
    mask64 = 0xFFFFFFFFFFFFFFFF

    digest = pymmh3.hash128(question, x64arch=True)

    upper_half = (digest >> 64) & mask64
    lower_half = digest & mask64

    # Order matters: low 64 bits first, then high 64 bits, each big-endian.
    packed = b"".join(
        half.to_bytes(8, byteorder='big') for half in (lower_half, upper_half)
    )

    return packed.hex()
|
|
|
|
# Read the original question -> answer mapping from testdata.json.original.
# The encoding is given explicitly: without it open() falls back to the
# platform locale encoding, which can mis-decode non-ASCII questions and
# therefore change their hashes from one machine to another.
# NOTE(review): assumes the top-level JSON value is an object (dict) - confirm.
with open('testdata.json.original', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Two mappings, both keyed by the hashed question:
#   questions   : hash -> original question text
#   hashed_data : hash -> answer
questions = {}
hashed_data = {}

for question, answer in data.items():
    hashed_question = question_hash(question)
    questions[hashed_question] = question
    hashed_data[hashed_question] = answer

# Write the hash -> question lookup table to questions.json
with open('questions.json', 'w', encoding='utf-8') as f:
    json.dump(questions, f, indent=4)

# Write the hash -> answer data to testdata.json
with open('testdata.json', 'w', encoding='utf-8') as f:
    json.dump(hashed_data, f, indent=4)
|