-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess_json.py
More file actions
36 lines (28 loc) · 867 Bytes
/
preprocess_json.py
File metadata and controls
36 lines (28 loc) · 867 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os
import joblib
import json
import pandas as pd
import requests
def create_embedding(text):
    """Request embeddings for ``text`` from the local embedding service.

    Posts ``{"model": "bge-m3", "input": text}`` to
    ``http://localhost:11434/api/embed`` (presumably a local Ollama
    server -- confirm) and returns the ``"embeddings"`` field of the JSON
    reply.

    Args:
        text: Value sent as the request's ``input`` field. The caller in
            this file passes a list of chunk strings.

    Returns:
        The ``"embeddings"`` value from the response body, unmodified
        (expected to hold one vector per input item -- verify against the
        service's API documentation).

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        KeyError: If a successful reply carries no ``"embeddings"`` field.
    """
    response = requests.post(
        "http://localhost:11434/api/embed",
        json={
            "model": "bge-m3",
            "input": text,
        },
    )
    # Surface HTTP failures directly; otherwise an error payload would only
    # show up as a confusing KeyError on the "embeddings" lookup below.
    response.raise_for_status()
    return response.json()["embeddings"]
# Build one table of text chunks and their embeddings from every file in the
# "json" directory, then persist it to 'embeddings.joblib'.

# os.listdir returns entries in arbitrary order; sorting keeps the chunk_id
# assignment reproducible from run to run.
jsons = sorted(os.listdir("json"))  # list all the jsons
my_dict = []  # accumulated chunk records (a list of dicts, despite the name)
chunk_id = 0  # running id, unique across all files

for json_file in jsons:
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which can mis-decode non-ASCII text.
    with open(os.path.join("json", json_file), encoding="utf-8") as f:
        content = json.load(f)

    chunks = content['chunks']
    if not chunks:
        # Nothing to embed: do not send a request with an empty input list.
        print(f"Skipping {json_file}: no chunks")
        continue

    print(f"Creating embeddings for {json_file}")
    # One request per file: all chunk texts are embedded in a single batch.
    embeddings = create_embedding([c['text'] for c in chunks])
    if len(embeddings) != len(chunks):
        # A mismatch would otherwise raise an opaque IndexError or silently
        # drop surplus vectors.
        raise ValueError(
            f"{json_file}: got {len(embeddings)} embeddings "
            f"for {len(chunks)} chunks"
        )

    for chunk, embedding in zip(chunks, embeddings):
        # Each chunk dict is extended in place and reused as the record.
        chunk['chunk_id'] = chunk_id
        chunk['embedding'] = embedding
        chunk_id += 1
        my_dict.append(chunk)

df = pd.DataFrame.from_records(my_dict)
# save dataframe
joblib.dump(df, 'embeddings.joblib')