Updates
Modern generative search engines enhance the reliability of large language model (LLM) responses by providing cited evidence. However, evaluating the answer's attribution, i.e., whether every claim within the generated responses is fully supported by its cited evidence, remains an open problem. This verification, traditionally dependent on costly human evaluation, underscores the urgent need for automatic attribution evaluation methods. To bridge the gap in the absence of standardized benchmarks for these methods, we present AttributionBench, a comprehensive benchmark compiled from various existing attribution datasets. Our extensive experiments on AttributionBench reveal the challenges of automatic attribution evaluation, even for state-of-the-art LLMs. Specifically, our findings show that even a fine-tuned GPT-3.5 only achieves around 80% macro-F1 under a binary classification formulation. A detailed analysis of more than 300 error cases indicates that a majority of failures stem from the model's inability to process nuanced information, and the discrepancy between the information the model has access to and that human annotators do.
Figure 1: The illustration of the attribution evaluation task and two typical error
examples from AttributionBench generated by GPT-3.5 (w/ CoT). The references are
usually manually extracted from webpages by human annotators based on what they think is
useful.
Left: fine-grained information insensitivity (i.e., the model disregarded
or overlooked nuanced details in either the claim or the references, as well as failing to
do necessary summarization or inference from the given references, tasks that humans
naturally perform).
Right: human-model accessible information mismatch (i.e.,
human annotators can see the whole webpage while the model is only given the extracted
evidence, leading to different judgments.)
We also report the average macro-F1 score on 7 test sets of GPT-3.5 with different input fields. Q, C, E, R stands for question, claim, evidence, and response, respectively. Results show that merely appending the questions and responses to the input, which human annotators might have access to, is not the key to solving this task.
We also show the performance of GPT-3.5 (w/ CoT) with several different prompts. Prompt engineering only brings limited gain over 7 test sets. Rather than bringing little gain in overall performance (top), adjusting prompts is actually changing the ratio of FP and FN cases (bottom). 'comp_und' stands for 'comprehensive understanding'.
We also show the error case distribution in 7 test sets:
@misc{li2024attributionbench,
title={AttributionBench: How Hard is Automatic Attribution Evaluation?},
author={Yifei Li and Xiang Yue and Zeyi Liao and Huan Sun},
year={2024},
eprint={2402.15089},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
If used, please also cite the original datasets accordingly:
@misc{malaviya2023expertqa,
title={ExpertQA: Expert-Curated Questions and Attributed Answers},
author={Chaitanya Malaviya and Subin Lee and Sihao Chen and Elizabeth Sieber and Mark Yatskar and Dan Roth},
year={2023},
eprint={2309.07852},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inproceedings{liu-etal-2023-evaluating,
title = "Evaluating Verifiability in Generative Search Engines",
author = "Liu, Nelson and
Zhang, Tianyi and
Liang, Percy",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.467",
doi = "10.18653/v1/2023.findings-emnlp.467",
pages = "7001--7025",
abstract = "Generative search engines directly generate responses to user queries, along with in-line citations. A prerequisite trait of a trustworthy generative search engine is verifiability, i.e., systems should cite comprehensively (high citation recall; all statements are fully supported by citations) and accurately (high citation precision; every cite supports its associated statement). We conduct human evaluation to audit four popular generative search engines{---}Bing Chat, NeevaAI, perplexity.ai, and YouChat{---}across a diverse set of queries from a variety of sources (e.g., historical Google user queries, dynamically-collected open-ended questions on Reddit, etc.). We find that responses from existing generative search engines are fluent and appear informative, but frequently contain unsupported statements and inaccurate citations: on average, a mere 51.5{\%} of generated sentences are fully supported by citations and only 74.5{\%} of citations support their associated sentence. We believe that these results are concerningly low for systems that may serve as a primary tool for information-seeking users, especially given their facade of trustworthiness. We hope that our results further motivate the development of trustworthy generative search engines and help researchers and users better understand the shortcomings of existing commercial systems.",
}
@misc{bohnet2023attributed,
title={Attributed Question Answering: Evaluation and Modeling for Attributed Large Language Models},
author={Bernd Bohnet and Vinh Q. Tran and Pat Verga and Roee Aharoni and Daniel Andor and Livio Baldini Soares and Massimiliano Ciaramita and Jacob Eisenstein and Kuzman Ganchev and Jonathan Herzig and Kai Hui and Tom Kwiatkowski and Ji Ma and Jianmo Ni and Lierni Sestorain Saralegui and Tal Schuster and William W. Cohen and Michael Collins and Dipanjan Das and Donald Metzler and Slav Petrov and Kellie Webster},
year={2023},
eprint={2212.08037},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{chen2023understanding,
title={Understanding Retrieval Augmentation for Long-Form Question Answering},
author={Hung-Ting Chen and Fangyuan Xu and Shane Arora and Eunsol Choi},
year={2023},
eprint={2310.12150},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{dziri-etal-2022-evaluating,
title = "Evaluating Attribution in Dialogue Systems: The {BEGIN} Benchmark",
author = "Dziri, Nouha and
Rashkin, Hannah and
Linzen, Tal and
Reitter, David",
editor = "Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "10",
year = "2022",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2022.tacl-1.62",
doi = "10.1162/tacl_a_00506",
pages = "1066--1083",
abstract = "Knowledge-grounded dialogue systems powered by large language models often generate responses that, while fluent, are not attributable to a relevant source of information. Progress towards models that do not exhibit this issue requires evaluation metrics that can quantify its prevalence. To this end, we introduce the Benchmark for Evaluation of Grounded INteraction (Begin), comprising 12k dialogue turns generated by neural dialogue systems trained on three knowledge-grounded dialogue corpora. We collect human annotations assessing the extent to which the models{'} responses can be attributed to the given background information. We then use Begin to analyze eight evaluation metrics. We find that these metrics rely on spurious correlations, do not reliably distinguish attributable abstractive responses from unattributable ones, and perform substantially worse when the knowledge source is longer. Our findings underscore the need for more sophisticated and robust evaluation metrics for knowledge-grounded dialogue. We make Begin publicly available at \url{https://github.com/google/BEGIN-dataset}.",
}
@inproceedings{yue-etal-2023-automatic,
title = "Automatic Evaluation of Attribution by Large Language Models",
author = "Yue, Xiang and
Wang, Boshi and
Chen, Ziru and
Zhang, Kai and
Su, Yu and
Sun, Huan",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.307",
doi = "10.18653/v1/2023.findings-emnlp.307",
pages = "4615--4635",
abstract = "A recent focus of large language model (LLM) development, as exemplified by generative search engines, is to incorporate external references to generate and support its claims. However, evaluating the attribution, i.e., verifying whether the generated statement is fully supported by the cited reference, remains an open problem. Although human evaluation is common practice, it is costly and time-consuming. In this paper, we investigate automatic evaluation of attribution given by LLMs. We begin by defining different types of attribution errors, and then explore two approaches for automatic evaluation: prompting LLMs and fine-tuning smaller LMs. The fine-tuning data is repurposed from related tasks such as question answering, fact-checking, natural language inference, and summarization. We manually curate a set of test examples covering 12 domains from a generative search engine, New Bing. Our results on this curated test set and simulated examples from existing benchmarks highlight both promising signals and challenges. We hope our problem formulation, testbeds, and findings will help lay the foundation for future studies on this important problem.",
}
@misc{kamalloo2023hagrid,
title={HAGRID: A Human-LLM Collaborative Dataset for Generative Information-Seeking with Attribution},
author={Ehsan Kamalloo and Aref Jafari and Xinyu Zhang and Nandan Thakur and Jimmy Lin},
year={2023},
eprint={2307.16883},
archivePrefix={arXiv},
primaryClass={cs.CL}
}