@conference{15308,
  title = {Deconstructing nuggets: the stability and reliability of complex question answering evaluation},
  booktitle = {Proceedings of the 30th annual international ACM SIGIR conference on Research and development in information retrieval},
  series = {SIGIR '07},
  year = {2007},
  pages = {327--334},
  publisher = {ACM},
  address = {New York, NY, USA},
  abstract = {A methodology based on "information nuggets" has recently emerged as the de facto standard by which answers to complex questions are evaluated. After several implementations in the TREC question answering tracks, the community has gained a better understanding of its many characteristics. This paper focuses on one particular aspect of the evaluation: the human assignment of nuggets to answer strings, which serves as the basis of the F-score computation. As a byproduct of the TREC 2006 ciQA task, identical answer strings were independently evaluated twice, which allowed us to assess the consistency of human judgments. Based on these results, we explored simulations of assessor behavior that provide a method to quantify scoring variations. Understanding these variations in turn lets researchers be more confident in their comparisons of systems.},
  keywords = {complex information needs, human judgments, TREC},
  isbn = {978-1-59593-597-7},
  doi = {10.1145/1277741.1277799},
  url = {http://doi.acm.org/10.1145/1277741.1277799},
  author = {Lin, Jimmy and Zhang, Pengyi}
}