@article {13351, title = {A unified approach to ranking in probabilistic databases}, journal = {The VLDB Journal}, volume = {20}, year = {2011}, month = {2011/04//}, pages = {249 - 275}, abstract = {Ranking is a fundamental operation in data analysis and decision support and plays an even more crucial role if the dataset being explored exhibits uncertainty. This has led to much work in understanding how to rank the tuples in a probabilistic dataset in recent years. In this article, we present a unified approach to ranking and top-k query processing in probabilistic databases by viewing it as a multi-criterion optimization problem and by deriving a set of features that capture the key properties of a probabilistic dataset that dictate the ranked result. We contend that a single, specific ranking function may not suffice for probabilistic databases, and we instead propose two parameterized ranking functions, called PRF {\textquestiondown} and PRF e, that generalize or can approximate many of the previously proposed ranking functions. We present novel generating functions-based algorithms for efficiently ranking large datasets according to these ranking functions, even if the datasets exhibit complex correlations modeled using probabilistic and/xor trees or Markov networks. We further propose that the parameters of the ranking function be learned from user preferences, and we develop an approach to learn those parameters. Finally, we present a comprehensive experimental study that illustrates the effectiveness of our parameterized ranking functions, especially PRF e, at approximating other ranking functions and the scalability of our proposed algorithms for exact or approximate ranking.}, keywords = {Approximation techniques, Graphical models, Learning to rank, Probabilistic databases, Ranking}, isbn = {1066-8888}, doi = {10.1007/s00778-011-0220-3}, url = {http://dx.doi.org/10.1007/s00778-011-0220-3}, author = {Li,Jian and Saha,Barna and Deshpande, Amol} } @article {16835, title = {Metric space similarity joins}, journal = {ACM Trans. Database Syst.}, volume = {33}, year = {2008}, month = {2008/06//}, pages = {7:1{\textendash}7:38 - 7:1{\textendash}7:38}, abstract = {Similarity join algorithms find pairs of objects that lie within a certain distance ε of each other. Algorithms that are adapted from spatial join techniques are designed primarily for data in a vector space and often employ some form of a multidimensional index. For these algorithms, when the data lies in a metric space, the usual solution is to embed the data in vector space and then make use of a multidimensional index. Such an approach has a number of drawbacks when the data is high dimensional as we must eventually find the most discriminating dimensions, which is not trivial. In addition, although the maximum distance between objects increases with dimension, the ability to discriminate between objects in each dimension does not. These drawbacks are overcome via the introduction of a new method called Quickjoin that does not require a multidimensional index and instead adapts techniques used in distance-based indexing for use in a method that is conceptually similar to the Quicksort algorithm. A formal analysis is provided of the Quickjoin method. Experiments show that the Quickjoin method significantly outperforms two existing techniques.}, keywords = {distance-based indexing, external memory algorithms, nearest neighbor queries, range queries, Ranking, Similarity join}, isbn = {0362-5915}, doi = {10.1145/1366102.1366104}, url = {http://doi.acm.org/10.1145/1366102.1366104}, author = {Jacox,Edwin H. and Samet, Hanan} } @article {16891, title = {Index-driven similarity search in metric spaces (Survey Article)}, journal = {ACM Trans. Database Syst.}, volume = {28}, year = {2003}, month = {2003/12//}, pages = {517 - 580}, abstract = {Similarity search is a very important operation in multimedia databases and other database applications involving complex objects, and involves finding objects in a data set S similar to a query object q, based on some similarity measure. In this article, we focus on methods for similarity search that make the general assumption that similarity is represented with a distance metric d. Existing methods for handling similarity search in this setting typically fall into one of two classes. The first directly indexes the objects based on distances (distance-based indexing), while the second is based on mapping to a vector space (mapping-based approach). The main part of this article is dedicated to a survey of distance-based indexing methods, but we also briefly outline how search occurs in mapping-based methods. We also present a general framework for performing search based on distances, and present algorithms for common types of queries that operate on an arbitrary "search hierarchy." These algorithms can be applied on each of the methods presented, provided a suitable search hierarchy is defined.}, keywords = {distance-based indexing, Hiearchical metric data structures, nearest neighbor queries, range queries, Ranking, similarity searching}, isbn = {0362-5915}, doi = {10.1145/958942.958948}, url = {http://doi.acm.org/10.1145/958942.958948}, author = {Hjaltason,Gisli R. and Samet, Hanan} }