-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_references_statement.py
More file actions
69 lines (48 loc) · 2.29 KB
/
Copy pathextract_references_statement.py
File metadata and controls
69 lines (48 loc) · 2.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import json, re
import pprint
import pandas as pd
import re
def extract_references(text, generation_id, model):
    """
    Extract references, their IDs, and grounded statements from text.

    The text is cut into sentences at whitespace that follows '.', '!' or
    '?'. Every ``<ref name="...">...</ref>`` tag found in a sentence yields
    one record whose statement is that sentence with all ref tags removed.
    Sentence breaks that fall inside a complete ref tag are ignored, so a
    citation such as "Smith, J. 2020. Title." stays attached to its sentence
    instead of being torn apart (which would make the tag unmatchable).

    Args:
        text (str): Input text containing references in Wikipedia-style format
        generation_id: Identifier copied unchanged into every record
        model: Model label copied unchanged into every record

    Returns:
        list: List of dictionaries containing reference information, with
            keys 'generation_id', 'model', 'statement', 'statement_size',
            'reference_id', 'citation' and 'citation_size'. Sizes are
            whitespace-separated word counts. A ref tag that is alone in its
            sentence gets an empty statement.
    """
    # Pattern to match reference tags and their content
    ref_pattern = r'<ref\s+name="([^"]+)">([^<]+)<\/ref>'
    ref_re = re.compile(ref_pattern)
    # Whitespace run preceded by sentence-ending punctuation
    sentence_break = r'(?<=[.!?])\s+'

    # Character spans of every complete ref tag; finditer returns them in
    # order and non-overlapping, which the single forward cursor below
    # relies on.
    ref_spans = [match.span() for match in ref_re.finditer(text)]

    # Split into sentences, skipping any break located inside a ref tag.
    sentences = []
    sentence_start = 0
    span_idx = 0
    for brk in re.finditer(sentence_break, text):
        # Discard ref spans that end at or before this break.
        while span_idx < len(ref_spans) and ref_spans[span_idx][1] <= brk.start():
            span_idx += 1
        if span_idx < len(ref_spans) and ref_spans[span_idx][0] < brk.start():
            # The break sits inside a citation / ref name: not a sentence end.
            continue
        sentences.append(text[sentence_start:brk.start()])
        sentence_start = brk.end()
    sentences.append(text[sentence_start:])

    references = []
    for sentence in sentences:
        matches = list(ref_re.finditer(sentence))
        if not matches:
            continue
        # The cleaned sentence is the same for every ref in it, so compute
        # it once per sentence rather than once per reference.
        clean_sentence = ref_re.sub('', sentence).strip()
        statement_size = len(clean_sentence.split())
        for ref in matches:
            citation = ref.group(2)  # The actual citation text
            references.append({
                'generation_id': generation_id,
                'model': model,
                'statement': clean_sentence,
                'statement_size': statement_size,
                'reference_id': ref.group(1),  # The reference ID
                'citation': citation,
                'citation_size': len(citation.split()),
            })
    return references
# Load the evaluation records. The context manager guarantees the file
# handle is closed even if JSON parsing fails.
with open("llm_evaluations.json", 'r', encoding="utf-8") as evaluations_file:
    reference_set = json.load(evaluations_file)

# Extraction in a structured way: one flat list of reference records
# gathered across every evaluation entry.
structured_reference_set = []
for instruction in reference_set:
    references = extract_references(
        instruction["generated_response"],
        instruction["generation_id"],
        instruction["model"],
    )
    structured_reference_set.extend(references)

# Persist the flattened records as a Parquet table.
structured_reference_set = pd.DataFrame(structured_reference_set)
structured_reference_set.to_parquet("eval_statement_set.parquet")