@article {13351, title = {A unified approach to ranking in probabilistic databases}, journal = {The VLDB Journal}, volume = {20}, year = {2011}, month = {2011/04//}, pages = {249 - 275}, abstract = {Ranking is a fundamental operation in data analysis and decision support and plays an even more crucial role if the dataset being explored exhibits uncertainty. This has led to much work in understanding how to rank the tuples in a probabilistic dataset in recent years. In this article, we present a unified approach to ranking and top-k query processing in probabilistic databases by viewing it as a multi-criterion optimization problem and by deriving a set of features that capture the key properties of a probabilistic dataset that dictate the ranked result. We contend that a single, specific ranking function may not suffice for probabilistic databases, and we instead propose two parameterized ranking functions, called PRF {\textquestiondown} and PRF e, that generalize or can approximate many of the previously proposed ranking functions. We present novel generating functions-based algorithms for efficiently ranking large datasets according to these ranking functions, even if the datasets exhibit complex correlations modeled using probabilistic and/xor trees or Markov networks. We further propose that the parameters of the ranking function be learned from user preferences, and we develop an approach to learn those parameters. Finally, we present a comprehensive experimental study that illustrates the effectiveness of our parameterized ranking functions, especially PRF e, at approximating other ranking functions and the scalability of our proposed algorithms for exact or approximate ranking.}, keywords = {Approximation techniques, Graphical models, Learning to rank, Probabilistic databases, Ranking}, isbn = {1066-8888}, doi = {10.1007/s00778-011-0220-3}, url = {http://dx.doi.org/10.1007/s00778-011-0220-3}, author = {Li,Jian and Saha,Barna and Deshpande, Amol} } @conference {13381, title = {Lineage processing over correlated probabilistic databases}, booktitle = {Proceedings of the 2010 international conference on Management of data}, series = {SIGMOD {\textquoteright}10}, year = {2010}, month = {2010///}, pages = {675 - 686}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {In this paper, we address the problem of scalably evaluating conjunctive queries over correlated probabilistic databases containing tuple or attribute uncertainties. Like previous work, we adopt a two-phase approach where we first compute lineages of the output tuples, and then compute the probabilities of the lineage formulas. However unlike previous work, we allow for arbitrary and complex correlations to be present in the data, captured via a forest of junction trees. We observe that evaluating even read-once (tree structured) lineages (e.g., those generated by hierarchical conjunctive queries), polynomially computable over tuple independent probabilistic databases, is $\#$P-complete for lightly correlated probabilistic databases like Markov sequences. We characterize the complexity of exact computation of the probability of the lineage formula on a correlated database using a parameter called lwidth (analogous to the notion of treewidth). For lineages that result in low lwidth, we compute exact probabilities using a novel message passing algorithm, and for lineages that induce large lwidths, we develop approximate Monte Carlo algorithms to estimate the result probabilities. We scale our algorithms to very large correlated probabilistic databases using the previously proposed INDSEP data structure. To mitigate the complexity of lineage evaluation, we develop optimization techniques to process a batch of lineages by sharing computation across formulas, and to exploit any independence relationships that may exist in the data. Our experimental study illustrates the benefits of using our algorithms for processing lineage formulas over correlated probabilistic databases.}, keywords = {conjunctive queries, Indexing, junction trees, lineage, Probabilistic databases}, isbn = {978-1-4503-0032-2}, doi = {10.1145/1807167.1807241}, url = {http://doi.acm.org/10.1145/1807167.1807241}, author = {Kanagal,Bhargav and Deshpande, Amol} } @conference {13360, title = {Consensus answers for queries over probabilistic databases}, booktitle = {Proceedings of the twenty-eighth ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems}, series = {PODS {\textquoteright}09}, year = {2009}, month = {2009///}, pages = {259 - 268}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {We address the problem of finding a "best" deterministic query answer to a query over a probabilistic database. For this purpose, we propose the notion of a consensus world (or a consensus answer) which is a deterministic world (answer) that minimizes the expected distance to the possible worlds (answers). This problem can be seen as a generalization of the well-studied inconsistent information aggregation problems (e.g. rank aggregation) to probabilistic databases. We consider this problem for various types of queries including SPJ queries, Top-k ranking queries, group-by aggregate queries, and clustering. For different distance metrics, we obtain polynomial time optimal or approximation algorithms for computing the consensus answers (or prove NP-hardness). Most of our results are for a general probabilistic database model, called and/xor tree model, which significantly generalizes previous probabilistic database models like x-tuples and block-independent disjoint models, and is of independent interest.}, keywords = {consensus answers, probabilistic and/xor tree, Probabilistic databases, Query processing, rank aggregation}, isbn = {978-1-60558-553-6}, doi = {10.1145/1559795.1559835}, url = {http://doi.acm.org/10.1145/1559795.1559835}, author = {Li,Jian and Deshpande, Amol} } @conference {13375, title = {Indexing correlated probabilistic databases}, booktitle = {Proceedings of the 35th SIGMOD international conference on Management of data}, series = {SIGMOD {\textquoteright}09}, year = {2009}, month = {2009///}, pages = {455 - 468}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {With large amounts of correlated probabilistic data being generated in a wide range of application domains including sensor networks, information extraction, event detection etc., effectively managing and querying them has become an important research direction. While there is an exhaustive body of literature on querying independent probabilistic data, supporting efficient queries over large-scale, correlated databases remains a challenge. In this paper, we develop efficient data structures and indexes for supporting inference and decision support queries over such databases. Our proposed hierarchical data structure is suitable both for in-memory and disk-resident databases. We represent the correlations in the probabilistic database using a junction tree over the tuple-existence or attribute-value random variables, and use tree partitioning techniques to build an index structure over it. We show how to efficiently answer inference and aggregation queries using such an index, resulting in orders of magnitude performance benefits in most cases. In addition, we develop novel algorithms for efficiently keeping the index structure up-to-date as changes (inserts, updates) are made to the probabilistic database. We present a comprehensive experimental study illustrating the benefits of our approach to query processing in probabilistic databases.}, keywords = {caching, Indexing, inference queries, junction trees, Probabilistic databases}, isbn = {978-1-60558-551-2}, doi = {10.1145/1559845.1559894}, url = {http://doi.acm.org/10.1145/1559845.1559894}, author = {Kanagal,Bhargav and Deshpande, Amol} }