Skip to content
Navigation Menu
{{ message }}
-
Notifications
You must be signed in to change notification settings - Fork 301
Expand file tree
/
Copy pathASSETS.yaml
More file actions
895 lines (893 loc) · 26.2 KB
/
ASSETS.yaml
File metadata and controls
895 lines (893 loc) · 26.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
# ASSETS.yaml — External asset manifest
# Auto-generated from per-eval eval.yaml external_assets fields.
# Regenerate: python tools/generate_asset_manifest.py
#
# This is a read-only manifest (like uv.lock but for external assets).
# To update, edit the external_assets section in the relevant eval.yaml,
# then regenerate this file.
# Per-eval external assets, keyed by eval name. An eval that lists no
# external assets maps to an empty list ([]).
# Fields seen on each entry in this file:
#   type:         direct_url, huggingface, git_clone, or git_dependency
#   source:       where the asset comes from (URL, repository, or dataset id);
#                 tokens such as {SHA} or {file} look like placeholders for
#                 concrete values -- confirm against the generator
#   fetch_method: how the asset is retrieved (e.g. download_and_verify)
#   state:        pinned, floating, or controlled
#   comment:      optional free-text note
evals:
  abstention_bench:
    - type: direct_url
      source: https://huggingface.co/datasets/rajpurkar/squad_v2/resolve/{SHA}/
      fetch_method: download_and_verify
      state: pinned
    - type: direct_url
      source: https://raw.githubusercontent.com/stellalisy/mediQ/{SHA}/
      fetch_method: download_and_verify
      state: pinned
    - type: direct_url
      source: https://raw.githubusercontent.com/nyu-mll/BBQ/{SHA}/
      fetch_method: download_and_verify
      state: pinned
    - type: direct_url
      source: https://raw.githubusercontent.com/yinzhangyue/SelfAware/{SHA}/
      fetch_method: download_and_verify
      state: pinned
    - type: direct_url
      source: https://raw.githubusercontent.com/thunlp/FalseQA/{SHA}/
      fetch_method: download_and_verify
      state: pinned
    - type: direct_url
      source: https://raw.githubusercontent.com/Yuki-Asuuna/UMWP/{SHA}/
      fetch_method: download_and_verify
      state: pinned
    - type: direct_url
      source: https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/{SHA}/
      fetch_method: download_and_verify
      state: pinned
    - type: direct_url
      source: https://github.com/facebookresearch/worldsense/raw/{SHA}/
      fetch_method: download_and_verify
      state: pinned
    - type: direct_url
      source: https://drive.google.com/uc?id=12aLKsSKe85G0u5bBTq0X0aKICsdxpaFL
      fetch_method: gdown_and_verify
      state: pinned
    - type: direct_url
      source: https://drive.google.com/uc?id=1q-6FIEGufKVBE3s6OdFoLWL2iHQPJh8h
      fetch_method: gdown_and_verify
      state: pinned
    - type: direct_url
      source: https://drive.google.com/uc?id={multiple}
      fetch_method: gdown_and_verify
      state: pinned
    - type: huggingface
      source: tasksource/bigbench
      fetch_method: load_dataset
      state: pinned
    - type: huggingface
      source: allenai/coconot
      fetch_method: load_dataset
      state: pinned
    - type: huggingface
      source: Idavidrein/gpqa
      fetch_method: load_dataset
      state: pinned
    - type: huggingface
      source: openai/gsm8k
      fetch_method: load_dataset
      state: pinned
    - type: huggingface
      source: amayuelas/KUQ
      fetch_method: load_dataset
      state: pinned
    - type: huggingface
      source: cais/mmlu
      fetch_method: load_dataset
      state: pinned
    - type: huggingface
      source: ninoscherrer/moralchoice
      fetch_method: load_dataset
      state: pinned
    - type: direct_url
      source: https://raw.githubusercontent.com/mikejqzhang/SituatedQA/{SHA}/data/qa_data/
      fetch_method: other
      state: pinned
      comment: Custom HF builder downloads via dl_manager
    - type: direct_url
      source: https://qasper-dataset.s3.us-west-2.amazonaws.com/qasper-{train-dev,test}-v0.3.tgz
      fetch_method: other
      state: pinned
      comment: Custom HF builder downloads via dl_manager
  agent_bench:
    - type: git_clone
      source: https://github.com/THUDM/AgentBench
      fetch_method: git_clone
      state: pinned
  agentdojo:
    - type: git_dependency
      source: https://github.com/ethz-spylab/agentdojo
      fetch_method: pyproject_toml
      state: pinned
  agentharm:
    - type: huggingface
      source: ai-safety-institute/AgentHarm
      fetch_method: snapshot_download
      state: pinned
  agentic_misalignment: []
  agieval:
    - type: direct_url
      source: https://raw.githubusercontent.com/ruixiangcui/AGIEval/{SHA}/data/v1_1/
      fetch_method: other
      state: pinned
      comment: Multiple task files fetched via fsspec (load_json_dataset)
  ahb:
    - type: huggingface
      source: sentientfutures/ahb
      fetch_method: hf_dataset
      state: pinned
  aime2024:
    - type: huggingface
      source: Maxwell-Jia/AIME_2024
      fetch_method: hf_dataset
      state: pinned
  aime2025:
    - type: huggingface
      source: math-ai/aime25
      fetch_method: hf_dataset
      state: pinned
  aime2026:
    - type: huggingface
      source: math-ai/aime26
      fetch_method: hf_dataset
      state: pinned
  air_bench:
    - type: huggingface
      source: stanford-crfm/air-bench-2024
      fetch_method: hf_dataset
      state: pinned
  ape:
    - type: direct_url
      source: https://raw.githubusercontent.com/AlignmentResearch/AttemptPersuadeEval/{SHA}/src/topics/
      fetch_method: other
      state: pinned
      comment: Fetched via fsspec (load_json_dataset)
  apps:
    - type: huggingface
      source: codeparrot/apps
      fetch_method: snapshot_download
      state: pinned
  arc:
    - type: huggingface
      source: allenai/ai2_arc
      fetch_method: hf_dataset
      state: pinned
  assistant_bench:
    - type: huggingface
      source: AssistantBench/AssistantBench
      fetch_method: hf_dataset
      state: pinned
  b3:
    - type: huggingface
      source: Lakera/b3-agent-security-benchmark-weak
      fetch_method: hf_dataset
      state: pinned
  bbeh:
    - type: huggingface
      source: BBEH/bbeh
      fetch_method: hf_dataset
      state: pinned
  bbh:
    - type: huggingface
      source: Joschka/big_bench_hard
      fetch_method: hf_dataset
      state: pinned
    - type: huggingface
      source: Joschka/big_bench_hard
      fetch_method: load_dataset
      state: pinned
  bbq:
    - type: huggingface
      source: heegyu/bbq
      fetch_method: snapshot_download
      state: pinned
  bfcl:
    - type: git_clone
      source: https://github.com/ShishirPatil/gorilla.git
      fetch_method: git_clone
      state: pinned
  bigcodebench:
    - type: huggingface
      source: bigcode/bigcodebench
      fetch_method: load_dataset
      state: pinned
  bold:
    - type: huggingface
      source: AmazonScience/bold
      fetch_method: load_dataset
      state: pinned
  boolq:
    - type: huggingface
      source: google/boolq
      fetch_method: hf_dataset
      state: pinned
  browse_comp:
    - type: direct_url
      source: https://openaipublic.blob.core.windows.net/simple-evals/browse_comp_test_set.csv
      fetch_method: download_and_verify
      state: pinned
  chembench:
    - type: huggingface
      source: jablonkagroup/ChemBench
      fetch_method: hf_dataset
      state: pinned
  class_eval:
    - type: huggingface
      source: FudanSELab/ClassEval
      fetch_method: hf_dataset
      state: pinned
  coconot:
    - type: huggingface
      source: allenai/coconot
      fetch_method: hf_dataset
      state: pinned
  commonsense_qa:
    - type: huggingface
      source: tau/commonsense_qa
      fetch_method: hf_dataset
      state: pinned
  compute_eval:
    - type: huggingface
      source: nvidia/compute-eval
      fetch_method: hf_dataset
      state: pinned
  core_bench:
    - type: direct_url
      source: https://corebench.cs.princeton.edu/capsules/{id}.tar.gz
      fetch_method: download_and_verify
      state: pinned
    - type: huggingface
      source: siegelz/core-bench
      fetch_method: hf_hub_download
      state: pinned
  cti_realm:
    - type: huggingface
      source: arjun180-new/cti_realm
      fetch_method: hf_hub_download
      state: pinned
  cve_bench:
    - type: git_dependency
      source: https://github.com/Scott-Simmons/cve-bench.git
      fetch_method: pyproject_toml
      state: pinned
  cybench:
    - type: direct_url
      source: https://www.haproxy.org/download/2.8/src/haproxy-2.8.1.tar.gz
      fetch_method: wget
      state: pinned
  cybergym:
    - type: huggingface
      source: sunblaze-ucb/cybergym
      fetch_method: snapshot_download
      state: pinned
  cybermetric:
    - type: direct_url
      source: https://raw.githubusercontent.com/cybermetric/CyberMetric/{SHA}/
      fetch_method: download_and_verify
      state: pinned
  cyberseceval_2:
    - type: direct_url
      source: https://nodejs.org/dist/v20.18.3/node-v20.18.3-linux-x64.tar.xz
      fetch_method: curl
      state: pinned
    - type: direct_url
      source: https://raw.githubusercontent.com/meta-llama/PurpleLlama/{SHA}/CybersecurityBenchmarks/datasets/
      fetch_method: other
      state: pinned
      comment: interpreter.json and prompt_injection.json; fetched via fsspec
  cyberseceval_3:
    - type: huggingface
      source: facebook/cyberseceval3-visual-prompt-injection
      fetch_method: snapshot_download
      state: pinned
    - type: huggingface
      source: facebook/cyberseceval3-visual-prompt-injection
      fetch_method: hf_dataset
      state: pinned
  cyberseceval_4:
    - type: direct_url
      source: https://raw.githubusercontent.com/meta-llama/PurpleLlama/fe05293b610dabc3967443f2dd4dc35c4e8971b6/CybersecurityBenchmarks/datasets/
      fetch_method: other
      state: pinned
      comment: instruct.json, autocomplete.json, mitre_benchmark_100_per_category_with_augmentation.json, mitre_frr.json, and multiturn_phishing_challenges.json; fetched via fsspec (load_json_dataset)
    - type: direct_url
      source: https://raw.githubusercontent.com/meta-llama/PurpleLlama/fe05293b610dabc3967443f2dd4dc35c4e8971b6/CybersecurityBenchmarks/datasets/prompt_injection/prompt_injection_multilingual_machine_translated.json
      fetch_method: other
      state: pinned
      comment: fetched via fsspec (load_json_dataset)
    - type: direct_url
      source: https://github.com/CrowdStrike/CyberSOCEval_data/archive/ce7daa5bc7da51559ca97476d2277be02631783e.zip
      fetch_method: other
      state: pinned
      comment: malware-analysis and threat-intelligence report data; downloaded via download_command_with_progress (wget/curl)
    - type: direct_url
      source: https://media.defense.gov/2024/Feb/26/2003399756/-1/-1/0/CSA-SVR-ADAPT-TACTICS-FOR-INITIAL-CLOUD-ACCESS.PDF
      fetch_method: requests
      state: pinned
      comment: threat-intelligence fallback PDF when the pinned CrowdStrike archive lacks this report
    - type: direct_url
      source: https://www.ncsc.gov.uk/sites/default/files/pdfs/news/svr-cyber-actors-adapt-tactics-for-initial-cloud-access.pdf
      fetch_method: requests
      state: pinned
      comment: alternate mirror for the threat-intelligence fallback PDF
  docvqa:
    - type: huggingface
      source: lmms-lab/DocVQA
      fetch_method: hf_dataset
      state: pinned
  drop:
    - type: huggingface
      source: EleutherAI/drop
      fetch_method: hf_dataset
      state: pinned
  ds1000:
    - type: huggingface
      source: xlangai/DS-1000
      fetch_method: hf_dataset
      state: pinned
  fortress:
    - type: huggingface
      source: ScaleAI/fortress_public
      fetch_method: hf_dataset
      state: pinned
  frontier_cs:
    - type: direct_url
      source: https://github.com/FrontierCS/Frontier-CS/archive/{SHA}.tar.gz
      fetch_method: download_and_verify
      state: pinned
    - type: direct_url
      source: https://raw.githubusercontent.com/MikeMirzayanov/testlib/{SHA}/testlib.h
      fetch_method: wget
      state: pinned
    - type: direct_url
      source: https://julialang-s3.julialang.org/bin/linux/x64/1.11/julia-1.11.3-linux-x86_64.tar.gz
      fetch_method: wget
      state: pinned
    - type: huggingface
      source: FrontierCS/Frontier-CS
      fetch_method: hf_dataset
      state: pinned
  frontierscience:
    - type: huggingface
      source: openai/frontierscience
      fetch_method: hf_dataset
      state: pinned
  gaia:
    - type: huggingface
      source: gaia-benchmark/GAIA
      fetch_method: hf_dataset
      state: pinned
  gdm_in_house_ctf:
    - type: direct_url
      source: https://hub.docker.com/u/marshw
      fetch_method: other
      state: floating
      comment: Per-challenge Docker images (marshw/cmd_injection, marshw/db_3, marshw/grafana, etc.) pulled at eval runtime
  gdm_intercode_ctf:
    - type: direct_url
      source: https://github.com/princeton-nlp/intercode/archive/{SHA}.zip
      fetch_method: download_and_verify
      state: pinned
  gdm_self_proliferation:
    - type: git_clone
      source: https://github.com/mistralai/mistral-src.git
      fetch_method: git_clone
      state: pinned
      comment: Pinned to commit 2557e12d0e1878a1562ece30434820f80aa3e12a in mistral.Dockerfile
    - type: direct_url
      source: https://files.mistral-7b-v0-1.mistral.ai/mistral-7B-v0.1.tar
      fetch_method: wget
      state: pinned
    - type: direct_url
      source: pip install git+https://github.com/huggingface/{transformers,peft,accelerate}.git
      fetch_method: pip_install_git
      state: pinned
      comment: Pinned to commit SHAs in mistral.Dockerfile
    - type: direct_url
      source: https://raw.githubusercontent.com/UKGovernmentBEIS/inspect_evals/{version}/...secrets.zip
      fetch_method: requests
      state: controlled
  gdm_self_reasoning: []
  gdm_stealth: []
  gdpval:
    - type: huggingface
      source: openai/gdpval
      fetch_method: hf_dataset
      state: pinned
    - type: huggingface
      source: openai/gdpval
      fetch_method: load_dataset
      state: pinned
  gpqa:
    - type: direct_url
      source: https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv
      fetch_method: download_and_verify
      state: pinned
  gsm8k:
    - type: huggingface
      source: openai/gsm8k
      fetch_method: hf_dataset
      state: pinned
  healthbench:
    - type: direct_url
      source: https://openaipublic.blob.core.windows.net/simple-evals/healthbench/{file}.jsonl
      fetch_method: other
      state: pinned
      comment: 4 files with timestamp-encoded filenames; fetched via fsspec (load_json_dataset)
  hellaswag:
    - type: huggingface
      source: Rowan/hellaswag
      fetch_method: hf_dataset
      state: pinned
  hle:
    - type: huggingface
      source: cais/hle
      fetch_method: hf_dataset
      state: pinned
  humaneval:
    - type: huggingface
      source: openai/openai_humaneval
      fetch_method: hf_dataset
      state: pinned
  ifeval:
    - type: git_dependency
      source: https://github.com/josejg/instruction_following_eval
      fetch_method: pyproject_toml
      state: pinned
    - type: huggingface
      source: google/IFEval
      fetch_method: hf_dataset
      state: pinned
  ifevalcode:
    - type: huggingface
      source: Multilingual-Multimodal-NLP/IfEvalCode-testset
      fetch_method: hf_dataset
      state: pinned
  infinite_bench:
    - type: huggingface
      source: xinrongzhang2022/InfiniteBench
      fetch_method: hf_dataset
      state: pinned
  instrumentaleval:
    - type: direct_url
      source: https://raw.githubusercontent.com/yf-he/InstrumentalEval/{SHA}/benchmark/
      fetch_method: download_and_verify
      state: pinned
  ipi_coding_agent: []
  kernelbench:
    - type: direct_url
      source: https://astral.sh/uv/0.9.9/install.sh
      fetch_method: curl
      state: pinned
    - type: git_dependency
      source: https://github.com/ScalingIntelligence/KernelBench
      fetch_method: pyproject_toml
      state: pinned
    - type: huggingface
      source: ScalingIntelligence/KernelBench
      fetch_method: hf_dataset
      state: pinned
  lab_bench:
    - type: huggingface
      source: futurehouse/lab-bench
      fetch_method: hf_dataset
      state: pinned
  lingoly:
    - type: huggingface
      source: ambean/lingOly
      fetch_method: load_dataset
      state: pinned
    - type: huggingface
      source: jkhouja/LingOly-TOO
      fetch_method: hf_dataset
      state: pinned
  livebench:
    - type: git_dependency
      source: https://github.com/LiveBench/LiveBench.git
      fetch_method: pyproject_toml
      state: pinned
    - type: huggingface
      source: livebench/{math,reasoning,coding,language,data_analysis,instruction_following}
      fetch_method: hf_dataset
      state: pinned
  livecodebench_pro:
    - type: huggingface
      source: QAQAQAQAQ/LiveCodeBench-Pro
      fetch_method: hf_dataset
      state: pinned
  make_me_pay: []
  makemesay:
    - type: direct_url
      source: https://github.com/openai/evals/raw/{SHA}/evals/registry/data/make_me_say/
      fetch_method: download_and_verify
      state: pinned
  mask:
    - type: huggingface
      source: cais/MASK
      fetch_method: hf_dataset
      state: pinned
  math:
    - type: huggingface
      source: DigitalLearningGmbH/MATH-lighteval
      fetch_method: hf_dataset
      state: pinned
  mathvista:
    - type: huggingface
      source: AI4Math/MathVista
      fetch_method: hf_dataset
      state: pinned
  mbpp:
    - type: huggingface
      source: google-research-datasets/mbpp
      fetch_method: load_dataset
      state: pinned
    - type: huggingface
      source: google-research-datasets/mbpp
      fetch_method: hf_dataset
      state: pinned
  medqa:
    - type: huggingface
      source: bigbio/med_qa
      fetch_method: snapshot_download
      state: pinned
  mgsm:
    - type: direct_url
      source: https://openaipublic.blob.core.windows.net/simple-evals/mgsm_{lang}.tsv
      fetch_method: download_and_verify
      state: pinned
      comment: 11 language files
  mind2web:
    - type: direct_url
      source: https://huggingface.co/datasets/osunlp/Mind2Web/resolve/{SHA}/scores_all_data.pkl
      fetch_method: download_and_verify
      state: pinned
    - type: huggingface
      source: osunlp/Multimodal-Mind2Web
      fetch_method: hf_dataset
      state: pinned
  mind2web_sc: []
  mle_bench:
    - type: git_clone
      source: https://github.com/openai/mle-bench.git
      fetch_method: git_clone
      state: pinned
      comment: Pinned to commit 2451bcb in Dockerfile
    - type: direct_url
      source: https://github.com/conda-forge/miniforge/releases/download/24.11.3-0/Miniforge3-Linux-x86_64.sh
      fetch_method: wget
      state: pinned
    - type: direct_url
      source: https://github.com/git-lfs/git-lfs/releases/download/v3.6.1/git-lfs-linux-amd64-v3.6.1.tar.gz
      fetch_method: curl
      state: pinned
    - type: git_dependency
      source: https://github.com/openai/mle-bench.git
      fetch_method: pyproject_toml
      state: pinned
  mlrc_bench:
    - type: direct_url
      source: https://github.com/yunx-z/MLRC-Bench/archive/{SHA}.zip
      fetch_method: download_and_verify
      state: pinned
  mmiu:
    - type: direct_url
      source: https://huggingface.co/datasets/FanqingM/MMIU-Benchmark/resolve/{SHA}/{file}.zip
      fetch_method: download_and_verify
      state: pinned
    - type: huggingface
      source: FanqingM/MMIU-Benchmark
      fetch_method: hf_dataset
      state: pinned
  mmlu:
    - type: huggingface
      source: openai/MMMLU
      fetch_method: hf_dataset
      state: pinned
    - type: huggingface
      source: cais/mmlu
      fetch_method: hf_dataset
      state: pinned
  mmlu_pro:
    - type: huggingface
      source: TIGER-Lab/MMLU-Pro
      fetch_method: hf_dataset
      state: pinned
  mmmu:
    - type: huggingface
      source: MMMU/MMMU
      fetch_method: load_dataset
      state: pinned
  moru:
    - type: huggingface
      source: sentientfutures/moru-benchmark
      fetch_method: hf_dataset
      state: pinned
    - type: huggingface
      source: sentientfutures/moru-benchmark-dimensions
      fetch_method: load_dataset
      state: pinned
  musr:
    - type: huggingface
      source: TAUR-Lab/MuSR
      fetch_method: hf_dataset
      state: pinned
  niah:
    - type: huggingface
      source: opencompass/NeedleBench
      fetch_method: load_dataset
      state: pinned
  novelty_bench:
    - type: huggingface
      source: yimingzhang/novelty-bench
      fetch_method: hf_dataset
      state: pinned
  onet:
    - type: huggingface
      source: openthaigpt/thai-onet-m6-exam
      fetch_method: hf_dataset
      state: pinned
  osworld:
    - type: git_clone
      source: https://github.com/xlang-ai/OSWorld.git
      fetch_method: git_clone
      state: pinned
      comment: Dockerfile sparse clone at a revision pinned to the commit matching OSWORLD_PINNED_COMMIT
    - type: git_clone
      source: https://github.com/xlang-ai/OSWorld.git
      fetch_method: git_clone
      state: pinned
      comment: Runtime sparse-checkout pinned to OSWORLD_PINNED_COMMIT
    - type: direct_url
      source: https://raw.githubusercontent.com/epatey/fonts/main/fonts.tar.gz
      fetch_method: curl
      state: floating
  paperbench:
    - type: huggingface
      source: josancamon/paperbench
      fetch_method: load_dataset
      state: pinned
  paws:
    - type: huggingface
      source: google-research-datasets/paws
      fetch_method: hf_dataset
      state: pinned
  persistbench: []
  personality:
    - type: direct_url
      source: https://raw.githubusercontent.com/guiem/personality-tests/{SHA}/
      fetch_method: download_and_verify
      state: pinned
    - type: huggingface
      source: mirlab/TRAIT
      fetch_method: hf_dataset
      state: pinned
  piqa:
    - type: huggingface
      source: ybisk/piqa
      fetch_method: snapshot_download
      state: pinned
    - type: direct_url
      source: https://storage.googleapis.com/ai2-mosaic/public/physicaliqa/physicaliqa-train-dev.zip
      fetch_method: other
      state: floating
      comment: Fetched by bundled HF builder via download_and_prepare
    - type: direct_url
      source: https://yonatanbisk.com/piqa/data/tests.jsonl
      fetch_method: other
      state: floating
      comment: Fetched by bundled HF builder (test split)
  pre_flight:
    - type: huggingface
      source: AirsideLabs/pre-flight-06
      fetch_method: hf_dataset
      state: pinned
  pubmedqa:
    - type: huggingface
      source: qiaojin/PubMedQA
      fetch_method: hf_dataset
      state: pinned
  race_h:
    - type: huggingface
      source: ehovy/race
      fetch_method: hf_dataset
      state: pinned
  sad:
    - type: direct_url
      source: https://api.github.com/repos/LRudL/sad/contents/{path}?ref={SHA}
      fetch_method: download_and_verify
      state: pinned
  scbench: []
  scicode:
    - type: direct_url
      source: https://raw.githubusercontent.com/scicode-bench/SciCode/{SHA}/eval/data/
      fetch_method: download_and_verify
      state: pinned
    - type: direct_url
      source: https://drive.google.com/uc?id=17G_k65N_6yFFZ2O-jQH00Lh6iaw3z-AW
      fetch_method: gdown_and_verify
      state: pinned
  sciknoweval:
    - type: direct_url
      source: https://drive.google.com/uc?id={file_id}
      fetch_method: gdown_and_verify
      state: pinned
    - type: huggingface
      source: hicai-zju/SciKnowEval
      fetch_method: hf_dataset
      state: pinned
  sec_qa:
    - type: huggingface
      source: zefang-liu/secqa
      fetch_method: hf_dataset
      state: pinned
  sevenllm:
    - type: direct_url
      source: https://huggingface.co/datasets/Multilingual-Multimodal-NLP/SEVENLLM-Dataset/raw/{SHA}/
      fetch_method: other
      state: pinned
      comment: Raw HF URL fetched via fsspec (load_json_dataset), not via HF datasets API
  simpleqa:
    - type: huggingface
      source: codelion/SimpleQA-Verified
      fetch_method: hf_dataset
      state: pinned
    - type: direct_url
      source: https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv
      fetch_method: download_and_verify
      state: pinned
      comment: Used by simpleqa task
  sosbench:
    - type: huggingface
      source: SOSBench/SOSBench
      fetch_method: hf_dataset
      state: pinned
  squad:
    - type: huggingface
      source: rajpurkar/squad_v2
      fetch_method: hf_dataset
      state: pinned
  stereoset:
    - type: huggingface
      source: McGill-NLP/stereoset
      fetch_method: hf_dataset
      state: pinned
  strong_reject:
    - type: direct_url
      source: https://raw.githubusercontent.com/alexandrasouly/strongreject/{SHA}/strongreject_dataset/
      fetch_method: other
      state: pinned
      comment: Fetched via fsspec (load_csv_dataset)
  swe_bench:
    - type: git_clone
      source: https://github.com/swe-bench/experiments
      fetch_method: git_clone
      state: pinned
      comment: Pinned to commit 559cf877c1095c2e244af73b2de42bd53bd0c9d5 in download_baselines.sh
    - type: huggingface
      source: princeton-nlp/SWE-bench_Verified
      fetch_method: load_dataset
      state: pinned
  swe_lancer:
    - type: direct_url
      source: https://hub.docker.com/u/swelancer
      fetch_method: other
      state: floating
      comment: Per-task images (swelancer/swelancer_x86_{issue_id}:releasev1) and monolith image pulled at eval runtime
  sycophancy:
    - type: direct_url
      source: https://raw.githubusercontent.com/meg-tong/sycophancy-eval/{SHA}/datasets/
      fetch_method: other
      state: pinned
      comment: Fetched via fsspec (load_json_dataset)
  tac:
    - type: huggingface
      source: sentientfutures/tac
      fetch_method: hf_hub_download
      state: pinned
  tau2: []
  theagentcompany: []
  threecb: []
  truthfulqa:
    - type: huggingface
      source: truthfulqa/truthful_qa
      fetch_method: hf_dataset
      state: pinned
  uccb:
    - type: huggingface
      source: CraneAILabs/UCCB
      fetch_method: hf_dataset
      state: pinned
  usaco:
    - type: direct_url
      source: https://drive.usercontent.google.com/download?id=1z5ODOJMqyer1QxzYtEUZ2hbAx-7nU8Vi
      fetch_method: gdown_and_verify
      state: pinned
  vimgolf_challenges:
    - type: huggingface
      source: cybergod-kevin/vimgolf-public-challenges-inspect-eval
      fetch_method: hf_dataset
      state: pinned
  vqa_rad:
    - type: huggingface
      source: flaviagiammarino/vqa-rad
      fetch_method: hf_dataset
      state: pinned
  vstar_bench:
    - type: huggingface
      source: craigwu/vstar_bench
      fetch_method: hf_dataset
      state: pinned
  winogrande:
    - type: huggingface
      source: allenai/winogrande
      fetch_method: hf_dataset
      state: pinned
  wmdp:
    - type: huggingface
      source: cais/wmdp
      fetch_method: hf_dataset
      state: pinned
  worldsense:
    - type: direct_url
      source: https://github.com/facebookresearch/worldsense/raw/{SHA}/data/
      fetch_method: download_and_verify
      state: pinned
  writingbench: []
  xstest:
    - type: huggingface
      source: walledai/XSTest
      fetch_method: hf_dataset
      state: pinned
  zerobench:
    - type: huggingface
      source: jonathan-roberts1/zerobench
      fetch_method: hf_dataset
      state: pinned
# Presumably the size cutoff that decides which local files are listed under
# large_local_files -- confirm against tools/generate_asset_manifest.py.
large_local_files_threshold: '>1 MB'
# Local files grouped by eval name; each entry gives a file path and its size.
# NOTE(review): paths look relative to each eval's own directory -- confirm.
large_local_files:
  gdm_self_proliferation:
    - path: data/sp08/secrets.zip
      size: 52.5M
  writingbench:
    - path: benchmark_all.jsonl
      size: 14.0M
  swe_lancer:
    - path: data/all_swelancer_tasks.csv
      size: 8.0M
  cybench:
    - path: challenges/network_tools/resources/nettools
      size: 5.3M
    - path: challenges/network_tools/images/victim/share/nettools
      size: 5.3M
    - path: challenges/delulu/images/victim/challenge/glibc/libc.so.6
      size: 2.1M
    - path: challenges/rpgo/resources/rpgo
      size: 1.3M
    - path: challenges/robust_cbc/images/victim/app/server
      size: 1.3M
    - path: challenges/flecks_of_gold/solution/flecks_patched
      size: 1.0M
    - path: challenges/flecks_of_gold/resources/flecks
      size: 1.0M
  uccb:
    - path: uccb_evaluation_results/x-ai_grok-4/detailed_results.json
      size: 2.7M
    - path: uccb_evaluation_results/cohere_command-a/detailed_results.json
      size: 2.6M
    - path: uccb_evaluation_results/google_gemini-2.0-flash-001/detailed_results.json
      size: 2.5M
    - path: uccb_evaluation_results/anthropic_claude-sonnet-4.5/detailed_results.json
      size: 2.5M
    - path: uccb_evaluation_results/openai_gpt-5/detailed_results.json
      size: 1.1M
    - path: uccb_evaluation_results/google_gemini-2.5-pro/detailed_results.json
      size: 1.1M
  agentdojo:
    - path: data/suites/workspace_plus/terminal/emma-computer/home-dir/Downloads/1706.03762v7.pdf
      size: 2.1M
  gdm_stealth:
    - path: data/ai_secretary/register.json
      size: 2.0M
  mind2web_sc:
    - path: data/seeact/sample_labeled_all.json
      size: 1.2M
You can’t perform that action at this time.
