@article{15294,
  title    = {Methods for automatically evaluating answers to complex questions},
  journal  = {Information Retrieval},
  volume   = {9},
  year     = {2006},
  pages    = {565--587},
  abstract = {Evaluation is a major driving force in advancing the state of the art in language technologies. In particular, automatically assessing the quality of machine output is the preferred method for measuring progress, provided that the metrics have been validated against human judgments. Following recent developments in the automatic evaluation of machine translation and document summarization, we present a similar approach, implemented in a measure called POURPRE, an automatic technique for evaluating answers to complex questions based on n-gram co-occurrences between machine output and a human-generated answer key. Until now, the only way to assess the correctness of answers to such questions has been to manually determine whether an information ``nugget'' appears in a system's response. The lack of automatic methods for scoring system output is an impediment to progress in the field, which we address with this work. Experiments with the TREC 2003, TREC 2004, and TREC 2005 QA tracks indicate that rankings produced by our metric correlate highly with official rankings, and that POURPRE outperforms direct application of existing metrics.},
  doi      = {10.1007/s10791-006-9003-7},
  author   = {Lin, Jimmy and Demner-Fushman, Dina}
}