Skip to content
Navigation Menu
{{ message }}
-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathevaluation.py
More file actions
284 lines (239 loc) · 9.81 KB
/
Copy pathevaluation.py
File metadata and controls
284 lines (239 loc) · 9.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
import json
import logging
import os
from typing import Any, Dict, List, Tuple
import xml.etree.ElementTree as ET
from openpyxl.utils import column_index_from_string, get_column_letter
import re
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# Bounding box as four integers. The axis order depends on the producer:
# range_to_bbox yields (row_start, col_start, row_end, col_end), while
# load_dong2019_dataset yields (xmin, ymin, xmax, ymax).
BBox = Tuple[int, int, int, int]
def _resolve_manifest_path(manifest_path: str, maybe_relative: str) -> str:
    """Return the path a manifest entry refers to.

    An absolute ``maybe_relative`` is returned unchanged. Anything else is
    joined onto the directory that contains ``manifest_path`` and returned
    as a normalised absolute path.
    """
    if not os.path.isabs(maybe_relative):
        manifest_dir = os.path.dirname(manifest_path)
        joined = os.path.join(manifest_dir, maybe_relative)
        return os.path.abspath(joined)
    return maybe_relative
def range_to_bbox(range_str: str) -> BBox:
    """Convert an Excel-style range (e.g., 'A1:F9') to a BBox tuple.

    Accepted forms are a single cell (``"B3"``) or a two-corner range
    (``"A1:F9"``). Each cell may carry absolute markers (``"$A$1"``), use
    lower-case column letters, and be surrounded by whitespace. An optional
    sheet qualifier (``"Sheet1!A1:F9"``) is ignored.

    Args:
        range_str: The range reference to parse.

    Returns:
        ``(row_start, col_start, row_end, col_end)`` with 1-based indices, in
        the order the two corners appear in ``range_str``. A single cell
        yields identical start and end coordinates.

    Raises:
        ValueError: If either corner is not a ``<letters><digits>`` cell
            reference, or if the column letters are rejected by
            ``column_index_from_string``.
    """
    # Validating the whole cell (instead of filtering characters) keeps
    # malformed input such as "A1B2" from being read as column "AB", row 12.
    cell_pattern = r"\s*\$?([A-Za-z]+)\$?([0-9]+)\s*"

    # rpartition returns the whole string in the last slot when no "!" exists.
    cells = range_str.rpartition("!")[2]
    start_cell, colon, end_cell = cells.partition(":")
    if not colon:
        end_cell = start_cell

    corners = []
    for cell in (start_cell, end_cell):
        match = re.fullmatch(cell_pattern, cell)
        if match is None:
            raise ValueError(
                f"invalid cell reference {cell!r} in range {range_str!r}"
            )
        col_letters, row_digits = match.groups()
        corners.append((int(row_digits), column_index_from_string(col_letters)))

    (r1, c1), (r2, c2) = corners
    return (r1, c1, r2, c2)
def load_spreadsheet_dataset(path: str) -> List[Dict[str, object]]:
    """Load a spreadsheet table-detection dataset from a directory.

    Every ``<name>.xlsx`` file directly inside ``path`` is paired with
    ``<name>.json``. The JSON document is read as UTF-8 and its ``"tables"``
    list supplies one ``"range"`` string per table.

    Workbooks without an annotation file are skipped with a warning;
    workbooks whose annotation lists no tables are skipped silently.

    Args:
        path: Directory containing the ``.xlsx`` / ``.json`` pairs.

    Returns:
        One dict per annotated workbook, ordered by file name, with keys
        ``"spreadsheet_path"``, ``"bboxes"`` (one ``range_to_bbox`` result per
        table) and ``"ann_path"``.
    """
    suffix = ".xlsx"
    dataset: List[Dict[str, object]] = []
    # os.listdir order is arbitrary; sorting keeps the dataset reproducible.
    for fname in sorted(os.listdir(path)):
        if not fname.endswith(suffix):
            continue
        spreadsheet_path = os.path.join(path, fname)
        # Swap only the trailing extension; str.replace would also rewrite an
        # ".xlsx" that appears earlier in the file name.
        ann_path = os.path.join(path, fname[: -len(suffix)] + ".json")
        if not os.path.exists(ann_path):
            logger.warning("Annotation file not found for %s, skipping.", fname)
            continue
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(ann_path, encoding="utf-8") as f:
            annotations = json.load(f)
        tables = annotations.get("tables", [])
        if not tables:
            continue
        dataset.append({
            "spreadsheet_path": spreadsheet_path,
            "bboxes": [range_to_bbox(t["range"]) for t in tables],
            "ann_path": ann_path,
        })
    return dataset
def load_table_detection_manifest(manifest_path: str) -> List[Dict[str, object]]:
    """Load a manifest-defined spreadsheet table-detection dataset.

    Expected shape::

        {
          "dataset_name": "synthetic_v1",
          "dataset_version": "1",
          "split_name": "test",
          "items": [
            {
              "spreadsheet_path": "book.xlsx",
              "tables": [{"range": "A1:B2"}]
            }
          ]
        }

    Relative spreadsheet paths are resolved from the manifest directory.
    Missing top-level metadata falls back to ``"manifest"`` for the dataset
    name, ``"unspecified"`` for version and split, and ``"reconstructed"``
    for ``claim_level``. A missing ``tables`` key is treated as an empty
    list, and table objects without a ``"range"`` key are ignored.

    Args:
        manifest_path: Path to the UTF-8 encoded JSON manifest.

    Returns:
        One dict per manifest item with keys ``"spreadsheet_path"``,
        ``"bboxes"``, ``"manifest_path"``, ``"dataset_name"``,
        ``"dataset_version"``, ``"split_name"`` and ``"claim_level"``.

    Raises:
        ValueError: If the manifest root is not an object, ``items`` is not
            an array, an item is not an object or lacks ``spreadsheet_path``,
            ``tables`` is not an array, or a table entry is not an object.
    """
    with open(manifest_path, encoding="utf-8") as fh:
        manifest: Any = json.load(fh)
    # Reject a non-object root up front so schema problems are always
    # reported as ValueError rather than AttributeError on .get().
    if not isinstance(manifest, dict):
        raise ValueError("table-detection manifest must be a JSON object")

    dataset_name = manifest.get("dataset_name", "manifest")
    dataset_version = manifest.get("dataset_version", "unspecified")
    split_name = manifest.get("split_name", "unspecified")
    claim_level = manifest.get("claim_level", "reconstructed")
    items = manifest.get("items")
    if not isinstance(items, list):
        raise ValueError("table-detection manifest must contain an items array")

    absolute_manifest = os.path.abspath(manifest_path)
    dataset: List[Dict[str, object]] = []
    for idx, item in enumerate(items):
        if not isinstance(item, dict):
            raise ValueError(f"manifest item {idx} must be an object")
        spreadsheet_path = item.get("spreadsheet_path")
        tables = item.get("tables", [])
        if not spreadsheet_path:
            raise ValueError(f"manifest item {idx} missing spreadsheet_path")
        if not isinstance(tables, list):
            raise ValueError(f"manifest item {idx} tables must be an array")

        bboxes = []
        for table_idx, table in enumerate(tables):
            # A bare string or number here is a schema error; without this
            # check `"range" in table` would do a substring test or raise
            # TypeError.
            if not isinstance(table, dict):
                raise ValueError(
                    f"manifest item {idx} table {table_idx} must be an object"
                )
            if "range" in table:
                bboxes.append(range_to_bbox(table["range"]))

        dataset.append({
            "spreadsheet_path": _resolve_manifest_path(manifest_path, spreadsheet_path),
            "bboxes": bboxes,
            "manifest_path": absolute_manifest,
            "dataset_name": dataset_name,
            "dataset_version": dataset_version,
            "split_name": split_name,
            "claim_level": claim_level,
        })
    return dataset
def load_dong2019_dataset(path: str) -> List[Dict[str, object]]:
    """Load the Dong et al. (2019) table detection dataset.

    The function expects two subdirectories under ``path``: ``images``
    containing the page images and ``annotations`` with Pascal VOC XML files
    describing table bounding boxes. Only ``*.xml`` files in ``annotations``
    are read; image files are not opened or checked for existence.

    Args:
        path: Dataset root directory.

    Returns:
        One dict per annotation file, ordered by annotation file name, with
        keys ``"image_path"`` (``images/<filename element>``), ``"bboxes"``
        (a list of ``(xmin, ymin, xmax, ymax)`` integer tuples, one per
        ``<object>``) and ``"ann_path"``.

    Raises:
        ValueError: If an annotation has no ``<filename>`` element, an
            ``<object>`` lacks one of the four ``bndbox`` coordinates, or a
            coordinate is not an integer literal.
    """
    ann_dir = os.path.join(path, "annotations")
    img_dir = os.path.join(path, "images")
    dataset: List[Dict[str, object]] = []
    # os.listdir order is arbitrary; sorting keeps the dataset reproducible.
    for fname in sorted(os.listdir(ann_dir)):
        if not fname.endswith(".xml"):
            continue
        ann_path = os.path.join(ann_dir, fname)
        root = ET.parse(ann_path).getroot()

        image_filename = root.findtext("filename")
        if image_filename is None:
            raise ValueError(f"{ann_path}: missing <filename> element")

        bboxes: List[BBox] = []
        for obj in root.findall(".//object"):
            coords = []
            for tag in ("xmin", "ymin", "xmax", "ymax"):
                # findtext returns None when bndbox or the coordinate element
                # is absent; report which file is malformed.
                text = obj.findtext(f"bndbox/{tag}")
                if text is None:
                    raise ValueError(
                        f"{ann_path}: <object> is missing bndbox/{tag}"
                    )
                coords.append(int(text))
            xmin, ymin, xmax, ymax = coords
            bboxes.append((xmin, ymin, xmax, ymax))

        dataset.append({
            "image_path": os.path.join(img_dir, image_filename),
            "bboxes": bboxes,
            "ann_path": ann_path,
        })
    return dataset
def eob(pred: BBox, gt: BBox) -> float:
    """Compute the Error-of-Boundary metric for a bounding box pair.

    The result is the mean of the four absolute boundary offsets between
    ``pred`` and ``gt``, each divided by the ground-truth extent along the
    same axis (first/third components share one axis, second/fourth the
    other).

    Args:
        pred: Predicted box.
        gt: Ground-truth box, in the same component order as ``pred``.

    Returns:
        ``0.0`` when ``pred`` equals ``gt``; otherwise the normalised mean
        boundary error, or ``inf`` when the ground-truth box has a
        non-positive extent on either axis (a warning is logged).
    """
    px0, py0, px1, py1 = pred
    gx0, gy0, gx1, gy1 = gt

    # An exact match has no boundary error, whatever the box size. Without
    # this, a ground truth with zero extent on one axis (e.g. the
    # single-column range "A1:A10" in inclusive cell coordinates) could never
    # be matched.
    if (px0, py0, px1, py1) == (gx0, gy0, gx1, gy1):
        return 0.0

    width = gx1 - gx0
    height = gy1 - gy0
    if width <= 0 or height <= 0:
        # The offsets cannot be normalised by a non-positive extent.
        logger.warning("Invalid ground truth box with non-positive size: %s", gt)
        return float("inf")

    return 0.25 * (
        abs(px0 - gx0) / width +
        abs(px1 - gx1) / width +
        abs(py0 - gy0) / height +
        abs(py1 - gy1) / height
    )
def evaluate_detections(
    pred_boxes: List[BBox],
    gt_boxes: List[BBox],
    threshold: float = 0.0,
) -> Tuple[float, float, float]:
    """Score predictions against ground truth with an EoB cut-off.

    Predictions are visited in order. Each one claims the first ground-truth
    box, not already claimed, whose ``eob`` value is at most ``threshold``.

    Returns:
        ``(precision, recall, f1)``. Precision is 0.0 when there are no
        predictions, recall is 0.0 when there is no ground truth, and F1 is
        0.0 when precision and recall sum to zero.
    """
    claimed = set()
    hits = 0
    for candidate in pred_boxes:
        # Lazily scan the unclaimed ground-truth boxes in their given order.
        eligible = (
            gt_idx
            for gt_idx, truth in enumerate(gt_boxes)
            if gt_idx not in claimed and eob(candidate, truth) <= threshold
        )
        first_hit = next(eligible, None)
        if first_hit is not None:
            claimed.add(first_hit)
            hits += 1

    precision = 0.0
    if pred_boxes:
        precision = hits / len(pred_boxes)

    recall = 0.0
    if gt_boxes:
        recall = hits / len(gt_boxes)

    denominator = precision + recall
    if not denominator:
        return precision, recall, 0.0
    return precision, recall, 2 * precision * recall / denominator
def normalize_qa_answer(answer: object, answer_type: str = "literal") -> str:
    """Return the canonical comparison form of a QA answer.

    ``None`` becomes the empty string; any other value is converted with
    ``str`` and stripped of surrounding whitespace. The answer type then
    selects further normalisation:

    * ``"cell_address"``: upper-cased.
    * ``"formula"``: all whitespace removed, then upper-cased.
    * ``"free_text"``: whitespace runs collapsed to one space, then
      case-folded.
    * anything else (including the default ``"literal"``): returned as is.
    """
    cleaned = str(answer).strip() if answer is not None else ""

    if answer_type == "free_text":
        collapsed = re.sub(r"\s+", " ", cleaned)
        return collapsed.casefold()
    if answer_type == "formula":
        compact = re.sub(r"\s+", "", cleaned)
        return compact.upper()
    if answer_type == "cell_address":
        return cleaned.upper()
    return cleaned
def load_qa_dataset(path: str) -> List[Dict[str, object]]:
    """Load a spreadsheet QA dataset from a directory.

    Every ``<name>.xlsx`` file directly inside ``path`` is paired with
    ``<name>.json``. The JSON document is read as UTF-8 and its
    ``"qa_pairs"`` value is passed through unchanged.

    Workbooks without an annotation file are skipped with a warning;
    workbooks whose annotation has no QA pairs are skipped silently.

    Args:
        path: Directory containing the ``.xlsx`` / ``.json`` pairs.

    Returns:
        One dict per annotated workbook, ordered by file name, with keys
        ``"spreadsheet_path"`` and ``"qa_pairs"``.
    """
    suffix = ".xlsx"
    dataset: List[Dict[str, object]] = []
    # os.listdir order is arbitrary; sorting keeps the dataset reproducible.
    for fname in sorted(os.listdir(path)):
        if not fname.endswith(suffix):
            continue
        spreadsheet_path = os.path.join(path, fname)
        # Swap only the trailing extension; str.replace would also rewrite an
        # ".xlsx" that appears earlier in the file name.
        ann_path = os.path.join(path, fname[: -len(suffix)] + ".json")
        if not os.path.exists(ann_path):
            logger.warning("Annotation file not found for %s, skipping.", fname)
            continue
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(ann_path, encoding="utf-8") as f:
            annotations = json.load(f)
        qa_pairs = annotations.get("qa_pairs", [])
        if qa_pairs:
            dataset.append({
                "spreadsheet_path": spreadsheet_path,
                "qa_pairs": qa_pairs,
            })
    return dataset
def load_qa_manifest(manifest_path: str) -> List[Dict[str, object]]:
    """Load a manifest-defined spreadsheet QA dataset.

    Each item must include ``spreadsheet_path``; ``qa_pairs`` defaults to an
    empty list when omitted. QA pairs may include optional fields such as
    ``sheet_name``, ``table_range``, and ``answer_type``; the list is passed
    through unchanged for downstream evaluation.

    Relative spreadsheet paths are resolved from the manifest directory.
    Missing top-level metadata falls back to ``"manifest"`` for the dataset
    name, ``"unspecified"`` for version and split, and ``"reconstructed"``
    for ``claim_level``.

    Args:
        manifest_path: Path to the UTF-8 encoded JSON manifest.

    Returns:
        One dict per manifest item with keys ``"spreadsheet_path"``,
        ``"qa_pairs"``, ``"manifest_path"``, ``"dataset_name"``,
        ``"dataset_version"``, ``"split_name"`` and ``"claim_level"``.

    Raises:
        ValueError: If the manifest root is not an object, ``items`` is not
            an array, an item is not an object or lacks ``spreadsheet_path``,
            or ``qa_pairs`` is not an array.
    """
    with open(manifest_path, encoding="utf-8") as fh:
        manifest: Any = json.load(fh)
    # Reject a non-object root up front so schema problems are always
    # reported as ValueError rather than AttributeError on .get().
    if not isinstance(manifest, dict):
        raise ValueError("QA manifest must be a JSON object")

    dataset_name = manifest.get("dataset_name", "manifest")
    dataset_version = manifest.get("dataset_version", "unspecified")
    split_name = manifest.get("split_name", "unspecified")
    claim_level = manifest.get("claim_level", "reconstructed")
    items = manifest.get("items")
    if not isinstance(items, list):
        raise ValueError("QA manifest must contain an items array")

    absolute_manifest = os.path.abspath(manifest_path)
    dataset: List[Dict[str, object]] = []
    for idx, item in enumerate(items):
        if not isinstance(item, dict):
            raise ValueError(f"manifest item {idx} must be an object")
        spreadsheet_path = item.get("spreadsheet_path")
        qa_pairs = item.get("qa_pairs", [])
        if not spreadsheet_path:
            raise ValueError(f"manifest item {idx} missing spreadsheet_path")
        if not isinstance(qa_pairs, list):
            raise ValueError(f"manifest item {idx} qa_pairs must be an array")
        dataset.append({
            "spreadsheet_path": _resolve_manifest_path(manifest_path, spreadsheet_path),
            "qa_pairs": qa_pairs,
            "manifest_path": absolute_manifest,
            "dataset_name": dataset_name,
            "dataset_version": dataset_version,
            "split_name": split_name,
            "claim_level": claim_level,
        })
    return dataset
# Names exported by `from <module> import *`. The underscore-prefixed helper
# _resolve_manifest_path is not listed here.
__all__ = [
"load_dong2019_dataset",
"load_spreadsheet_dataset",
"load_table_detection_manifest",
"load_qa_dataset",
"load_qa_manifest",
"range_to_bbox",
"normalize_qa_answer",
"eob",
"evaluate_detections",
]
You can’t perform that action at this time.
