@article {13355, title = {Algorithms for distributional and adversarial pipelined filter ordering problems}, journal = {ACM Trans. Algorithms}, volume = {5}, year = {2009}, month = {2009/03//}, pages = {24:1{\textendash}24:34 - 24:1{\textendash}24:34}, abstract = {Pipelined filter ordering is a central problem in database query optimization. The problem is to determine the optimal order in which to apply a given set of commutative filters (predicates) to a set of elements (the tuples of a relation), so as to find, as efficiently as possible, the tuples that satisfy all of the filters. Optimization of pipelined filter ordering has recently received renewed attention in the context of environments such as the Web, continuous high-speed data streams, and sensor networks. Pipelined filter ordering problems are also studied in areas such as fault detection and machine learning under names such as learning with attribute costs, minimum-sum set cover, and satisficing search. We present algorithms for two natural extensions of the classical pipelined filter ordering problem: (1) a distributional-type problem where the filters run in parallel and the goal is to maximize throughput, and (2) an adversarial-type problem where the goal is to minimize the expected value of multiplicative regret. We present two related algorithms for solving (1), both running in time O(n2), which improve on the O(n3 log n) algorithm of Kodialam. We use techniques from our algorithms for (1) to obtain an algorithm for (2).}, keywords = {flow algorithms, Pipelined filter ordering, Query optimization, selection ordering}, isbn = {1549-6325}, doi = {10.1145/1497290.1497300}, url = {http://doi.acm.org/10.1145/1497290.1497300}, author = {Condon,Anne and Deshpande, Amol and Hellerstein,Lisa and Wu,Ning} } @conference {13385, title = {Minimizing Communication Cost in Distributed Multi-query Processing}, booktitle = {IEEE 25th International Conference on Data Engineering, 2009. ICDE {\textquoteright}09}, year = {2009}, month = {2009/04/29/March}, pages = {772 - 783}, publisher = {IEEE}, organization = {IEEE}, abstract = {Increasing prevalence of large-scale distributed monitoring and computing environments such as sensor networks, scientific federations, Grids etc., has led to a renewed interest in the area of distributed query processing and optimization. In this paper we address a general, distributed multi-query processing problem motivated by the need to minimize the communication cost in these environments. Specifically we address the problem of optimally sharing data movement across the communication edges in a distributed communication network given a set of overlapping queries and query plans for them (specifying the operations to be executed). Most of the problem variations of our general problem can be shown to be NP-Hard by a reduction from the Steiner tree problem. However, we show that the problem can be solved optimally if the communication network is a tree, and present a novel algorithm for finding an optimal data movement plan. For general communication networks, we present efficient approximation algorithms for several variations of the problem. Finally, we present an experimental study over synthetic datasets showing both the need for exploiting the sharing of data movement and the effectiveness of our algorithms at finding such plans.}, keywords = {Approximation algorithms, Communication networks, Computer science, Cost function, Data engineering, distributed communication network, distributed databases, distributed multi-query processing, grid computing, Large-scale systems, NP-hard, optimisation, Polynomials, Publish-subscribe, publish-subscribe systems, Query optimization, Query processing, sensor networks, Steiner tree problem, Tree graphs, trees (mathematics)}, isbn = {978-1-4244-3422-0}, doi = {10.1109/ICDE.2009.85}, author = {Li,Jian and Deshpande, Amol and Khuller, Samir} } @article {17870, title = {Active semantic caching to optimize multidimensional data analysis in parallel and distributed environments}, journal = {Parallel Computing}, volume = {33}, year = {2007}, month = {2007/08//}, pages = {497 - 520}, abstract = {In this paper, we present a multi-query optimization framework based on the concept of active semantic caching. The framework permits the identification and transparent reuse of data and computation in the presence of multiple queries (or query batches) that specify user-defined operators and aggregations originating from scientific data-analysis applications. We show how query scheduling techniques, coupled with intelligent cache replacement policies, can further improve the performance of query processing by leveraging the active semantic caching operators. We also propose a methodology for functionally decomposing complex queries in terms of primitives so that multiple reuse sites are exposed to the query optimizer, to increase the amount of reuse. The optimization framework and the database system implemented with it are designed to be efficient irrespective of the underlying parallel and/or distributed machine configuration. We present experimental results highlighting the performance improvements obtained by our methods using real scientific data-analysis applications on multiple parallel and distributed processing configurations (e.g., single symmetric multiprocessor (SMP) machine, cluster of SMP nodes, and a Grid computing configuration).}, keywords = {Active semantic caching, Parallel databases, Query optimization, Scientific data analysis}, isbn = {0167-8191}, doi = {10.1016/j.parco.2007.03.001}, url = {http://www.sciencedirect.com/science/article/pii/S0167819107000506}, author = {Andrade,Henrique and Kurc,Tahsin and Sussman, Alan and Saltz,Joel} } @conference {13370, title = {Flow algorithms for two pipelined filter ordering problems}, booktitle = {Proceedings of the twenty-fifth ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems}, series = {PODS {\textquoteright}06}, year = {2006}, month = {2006///}, pages = {193 - 202}, publisher = {ACM}, organization = {ACM}, address = {New York, NY, USA}, abstract = {Pipelined filter ordering is a central problem in database query optimization, and has received renewed attention recently in the context of environments such as the web, continuous high-speed data streams and sensor networks. We present algorithms for two natural extensions of the classical pipelined filter ordering problem: (1) a distributional type problem where the filters run in parallel and the goal is to maximize throughput, and (2) an adversarial type problem where the goal is to minimize the expected value of multiplicative regret. We show that both problems can be solved using similar flow algorithms, which find an optimal ordering scheme in time O(n2), where n is the number of filters. Our algorithm for (1) improves on an earlier O(n3 log n) algorithm of Kodialam.}, keywords = {flow algorithms, Pipelined filter ordering, Query optimization, selection ordering}, isbn = {1-59593-318-2}, doi = {10.1145/1142351.1142379}, url = {http://doi.acm.org/10.1145/1142351.1142379}, author = {Condon,Anne and Deshpande, Amol and Hellerstein,Lisa and Wu,Ning} } @conference {13363, title = {Decoupled query optimization for federated database systems}, booktitle = {18th International Conference on Data Engineering, 2002. Proceedings}, year = {2002}, month = {2002///}, pages = {716 - 727}, publisher = {IEEE}, organization = {IEEE}, abstract = {We study the problem of query optimization in federated relational database systems. The nature of federated databases explicitly decouples many aspects of the optimization process, often making it imperative for the optimizer to consult underlying data sources while doing cost-based optimization. This not only increases the cost of optimization, but also changes the trade-offs involved in the optimization process significantly. The dominant cost in the decoupled optimization process is the "cost of costing" that traditionally has been considered insignificant. The optimizer can only afford a few rounds of messages to the underlying data sources and hence the optimization techniques in this environment must be geared toward gathering all the required cost information with minimal communication. In this paper, we explore the design space for a query optimizer in this environment and demonstrate the need for decoupling various aspects of the optimization process. We present minimum-communication decoupled variants of various query optimization techniques, and discuss tradeoffs in their performance in this scenario. We have implemented these techniques in the Cohera federated database system and our experimental results, somewhat surprisingly, indicate that a simple two-phase optimization scheme performs fairly well as long as the physical database design is known to the optimizer, though more aggressive algorithms are required otherwise}, keywords = {Algorithm design and analysis, Cohera federated database, Computer science, Corporate acquisitions, Cost function, Database systems, decoupled optimization, Design optimization, distributed databases, federated databases, federated relational database systems, Internet, Query optimization, query optimizer, Query processing, Relational databases, Space exploration}, isbn = {0-7695-1531-2}, doi = {10.1109/ICDE.2002.994788}, author = {Deshpande, Amol and Hellerstein,J. M} } @article {17820, title = {Probabilistic object bases}, journal = {ACM Trans. Database Syst.}, volume = {26}, year = {2001}, month = {2001/09//}, pages = {264 - 312}, abstract = {Although there are many applications where an object-oriented data model is a good way of representing and querying data, current object database systems are unable to handle objects whose attributes are uncertain. In this article, we extend previous work by Kornatzky and Shimony to develop an algebra to handle object bases with uncertainty. We propose concepts of consistency for such object bases, together with an NP-completeness result, and classes of probabilistic object bases for which consistency is polynomially checkable. In addition, as certain operations involve conjunctions and disjunctions of events, and as the probability of conjunctive and disjunctive events depends both on the probabilities of the primitive events involved as well as on what is known (if anything) about the relationship between the events, we show how all our algebraic operations may be performed under arbitrary probabilistic conjunction and disjunction strategies. We also develop a host of equivalence results in our algebra, which may be used as rewrite rules for query optimization. Last but not least, we have developed a prototype probabilistic object base server on top of ObjectStore. We describe experiments to assess the efficiency of different possible rewrite rules.}, keywords = {consistency, object-oriented database, probabilistic object algebra, probabilistic object base, probability, query language, Query optimization}, isbn = {0362-5915}, doi = {10.1145/502030.502031}, url = {http://doi.acm.org/10.1145/502030.502031}, author = {Eiter,Thomas and Lu,James J. and Lukasiewicz,Thomas and V.S. Subrahmanian} } @article {16479, title = {Logic-based query optimization for object databases}, journal = {IEEE Transactions on Knowledge and Data Engineering}, volume = {12}, year = {2000}, month = {2000/08//Jul}, pages = {529 - 547}, abstract = {We present a technique for transferring query optimization techniques, developed for relational databases, into object databases. We demonstrate this technique for ODMG database schemas defined in ODL and object queries expressed in OQL. The object schema is represented using a logical representation (Datalog). Semantic knowledge about the object data model, e.g., class hierarchy information, relationship between objects, etc., as well as semantic knowledge about a particular schema and application domain are expressed as integrity constraints. An OQL object query is represented as a logic query and query optimization is performed in the Datalog representation. We obtain equivalent (optimized) logic queries, and subsequently obtain equivalent (optimized) OQL queries for each equivalent logic query. We present one optimization technique for semantic query optimization (SQO) based on the residue technique of U. Charavarthy et al. (1990; 1986; 1988). We show that our technique generalizes previous research on SQO for object databases. We handle a large class of OQL queries, including queries with constructors and methods. We demonstrate how SQO can be used to eliminate queries which contain contradictions and simplify queries, e.g., by eliminating joins, or by reducing the access scope for evaluating a query to some specific subclass(es). We also demonstrate how the definition of a method or integrity constraints describing the method, can be used in optimizing a query with a method}, keywords = {access scope, application domain, class hierarchy information, Constraint optimization, data integrity, Data models, Datalog, Datalog representation, deductive databases, equivalent logic query, integrity constraints, Lifting equipment, Logic, logic based query optimization, logic programming, logic queries, logic query, logical representation, object data model, object databases, object queries, object schema, object-oriented databases, ODL, ODMG database schemas, optimization technique, optimized OQL queries, OQL object query, query languages, Query optimization, query optimization techniques, Query processing, Relational databases, residue technique, semantic knowledge, semantic query optimization}, isbn = {1041-4347}, doi = {10.1109/69.868906}, author = {Grant,J. and Gryz,J. and Minker, Jack and Raschid, Louiqa} }