@conference {19505, title = {The Provenance of WINE}, booktitle = {2012 Ninth European Dependable Computing Conference (EDCC)}, year = {2012}, pages = {126 - 131}, abstract = {The results of cyber security experiments are often impossible to reproduce, owing to the lack of adequate descriptions of the data collection and experimental processes. Such provenance information is difficult to record consistently when collecting data from distributed sensors and when sharing raw data among research groups with variable standards for documenting the steps that produce the final experimental result. In the WINE benchmark, which provides field data for cyber security experiments, we aim to make the experimental process self-documenting. The data collected includes provenance information -- such as when, where and how an attack was first observed or detected -- and allows researchers to gauge information quality. Experiments are conducted on a common test bed, which provides tools for recording each procedural step. The ability to understand the provenance of research results enables rigorous cyber security experiments, conducted at scale.}, keywords = {Benchmark testing, cyber security, cyber security experiments, data attacks, data collection, dependability benchmarking, distributed databases, distributed sensors, experimental research, field data, information quality, malware, Pipelines, provenance, provenance information, raw data sharing, research groups, security of data, self-documenting experimental process, sensor fusion, software, variable standards, WINE, WINE benchmark}, author = {Dumitras, Tudor and Efstathopoulos, P.} } @article {16495, title = {A Dual Framework and Algorithms for Targeted Online Data Delivery}, journal = {IEEE Transactions on Knowledge and Data Engineering}, volume = {23}, year = {2011}, pages = {5 - 21}, abstract = {A variety of emerging online data delivery applications challenge existing techniques for data delivery to human users, applications, or middleware that are accessing data from multiple autonomous servers. In this paper, we develop a framework for formalizing and comparing pull-based solutions and present dual optimization approaches. The first approach, most commonly used nowadays, maximizes user utility under the strict setting of meeting a priori constraints on the usage of system resources. We present an alternative and more flexible approach that maximizes user utility by satisfying all users while minimizing the usage of system resources. We discuss the benefits of this latter approach and develop an adaptive monitoring solution, Satisfy User Profiles (SUP). Through formal analysis, we identify sufficient optimality conditions for SUP. Using real (RSS feeds) and synthetic traces, we empirically analyze the behavior of SUP under varying conditions. Our experiments show that SUP achieves a high degree of user utility when its estimations closely track the real event stream, and that it has the potential to save a significant amount of system resources.
We further show that SUP can exploit feedback to improve user utility with only a moderate increase in resource utilization.}, keywords = {client/server multitier systems, distributed databases, online data delivery, online information services}, issn = {1041-4347}, doi = {10.1109/TKDE.2010.15}, author = {Roitman, Haggai and Gal, Avigdor and Raschid, Louiqa} } @conference {13385, title = {Minimizing Communication Cost in Distributed Multi-query Processing}, booktitle = {IEEE 25th International Conference on Data Engineering, 2009. ICDE '09}, year = {2009}, month = {March}, pages = {772 - 783}, publisher = {IEEE}, organization = {IEEE}, abstract = {The increasing prevalence of large-scale distributed monitoring and computing environments, such as sensor networks, scientific federations, and Grids, has led to a renewed interest in the area of distributed query processing and optimization. In this paper, we address a general distributed multi-query processing problem motivated by the need to minimize the communication cost in these environments. Specifically, we address the problem of optimally sharing data movement across the communication edges in a distributed communication network, given a set of overlapping queries and query plans for them (specifying the operations to be executed). Most variations of our general problem can be shown to be NP-hard by a reduction from the Steiner tree problem. However, we show that the problem can be solved optimally if the communication network is a tree, and we present a novel algorithm for finding an optimal data movement plan. For general communication networks, we present efficient approximation algorithms for several variations of the problem. Finally, we present an experimental study over synthetic datasets showing both the need for exploiting the sharing of data movement and the effectiveness of our algorithms at finding such plans.}, keywords = {Approximation algorithms, Communication networks, Computer science, Cost function, Data engineering, distributed communication network, distributed databases, distributed multi-query processing, grid computing, Large-scale systems, NP-hard, optimisation, Polynomials, Publish-subscribe, publish-subscribe systems, Query optimization, Query processing, sensor networks, Steiner tree problem, Tree graphs, trees (mathematics)}, isbn = {978-1-4244-3422-0}, doi = {10.1109/ICDE.2009.85}, author = {Li, Jian and Deshpande, Amol and Khuller, Samir} } @conference {17888, title = {Spatial indexing of distributed multidimensional datasets}, booktitle = {IEEE International Symposium on Cluster Computing and the Grid, 2005. CCGrid 2005}, volume = {2}, year = {2005}, month = {May}, pages = {743 - 750}, publisher = {IEEE}, organization = {IEEE}, abstract = {While declustering methods for distributed multidimensional indexing of large datasets have been researched widely in the past, replication techniques for multidimensional indexes have not been investigated deeply. In general, a centralized index server, rather than the data servers, may become the performance bottleneck in a wide area network, since the index is likely to be accessed more often than any of the datasets in the servers. In this paper, we present two different multidimensional indexing algorithms for a distributed environment: a centralized global index and a two-level hierarchical index.
Our experimental results show that the centralized scheme does not scale well for either inserting into or searching the index. To improve the scalability of the index server, we have employed a replication protocol, for both the centralized and two-level index schemes, that allows some inconsistency between replicas without affecting correctness. Our experiments show that the two-level hierarchical index scheme scales better than the non-replicated centralized index for both building and searching the index, but replication can make the centralized index faster than the two-level hierarchical index for searching in some cases.}, keywords = {centralized global index algorithm, centralized index server, Computer science, database indexing, distributed databases, distributed multidimensional dataset, Educational institutions, File servers, Indexing, Large-scale systems, Multidimensional systems, Network servers, replication protocol, replication techniques, scalability, Sensor systems, spatial data structures, spatial indexing, two-level hierarchical index algorithm, wide area networks}, isbn = {0-7803-9074-1}, doi = {10.1109/CCGRID.2005.1558637}, author = {Nam, B. and Sussman, Alan} } @conference {16462, title = {Exploiting multiple paths to express scientific queries}, booktitle = {16th International Conference on Scientific and Statistical Database Management, 2004. Proceedings}, year = {2004}, month = {June}, pages = {357 - 360}, publisher = {IEEE}, organization = {IEEE}, abstract = {The purpose of this demonstration is to present the main features of the BioNavigation system. The scientific data collection needed in various stages of scientific discovery is typically performed manually. For each scientific object of interest (e.g., a gene, a sequence), scientists query a succession of Web resources, following links between retrieved entries. Each of the steps provides part of the intended characterization of the scientific object. This process is sometimes partially supported by hard-coded scripts or by complex queries evaluated by a mediation-based data integration system or against a data warehouse. These approaches fail to guide the scientists during the collection process. In contrast, the BioNavigation approach presented in the paper provides the scientists with information on the available alternative resources, their provenance, and the costs of data collection. The BioNavigation system enhances a mediation-based integration system and provides scientists with support for the following: asking queries at a high conceptual level; visualizing the multiple alternative resources that may be exploited to execute their data collection queries; and choosing the final execution path to evaluate their queries.}, keywords = {access protocols, biology computing, BioNavigation system, complex queries, Costs, Data analysis, data handling, Data visualization, data warehouse, Data warehouses, Databases, diseases, distributed databases, hard-coded scripts, information resources, Information retrieval, mediation-based data integration system, multiple paths, query evaluation, Query processing, scientific data collection, scientific discovery, scientific information, scientific information systems, scientific object of interest, scientific queries, sequences, Web resources}, isbn = {0-7695-2146-0}, doi = {10.1109/SSDM.2004.1311231}, author = {Lacroix, Z. and Moths, T. and Parekh, K. and Raschid, Louiqa and Vidal, M.-E.} }
@conference {17878, title = {Improving access to multi-dimensional self-describing scientific datasets}, booktitle = {3rd IEEE/ACM International Symposium on Cluster Computing and the Grid, 2003. Proceedings. CCGrid 2003}, year = {2003}, month = {May}, pages = {172 - 179}, publisher = {IEEE}, organization = {IEEE}, abstract = {Applications that query very large multidimensional datasets are becoming more common. Many self-describing scientific data file formats have also emerged, which have structural metadata to help navigate the multi-dimensional arrays that are stored in the files. The files may also contain application-specific semantic metadata. In this paper, we discuss efficient methods for performing searches for subsets of multi-dimensional data objects, using semantic information to build multidimensional indexes and grouping data items into properly sized chunks to maximize disk I/O bandwidth. This work is the first step in the design and implementation of a generic indexing library that will work with various high-dimensional scientific data file formats containing semantic information about the stored data. To validate the approach, we have implemented indexing structures for NASA remote sensing data stored in the HDF format with a specific schema (HDF-EOS), and we show the performance improvements gained from indexing the datasets, compared to using the existing HDF library for accessing the data.}, keywords = {Application software, application-specific semantic metadata, Bandwidth, Computer science, database indexing, disk I/O bandwidth, distributed databases, Educational institutions, Indexing, indexing structures, Libraries, meta data, Middleware, multidimensional arrays, multidimensional datasets, Multidimensional systems, NASA, NASA remote sensing data, Navigation, query formulation, self-describing scientific data file formats, structural metadata, very large databases}, isbn = {0-7695-1919-9}, doi = {10.1109/CCGRID.2003.1199366}, author = {Nam, B. and Sussman, Alan} } @conference {13363, title = {Decoupled query optimization for federated database systems}, booktitle = {18th International Conference on Data Engineering, 2002. Proceedings}, year = {2002}, pages = {716 - 727}, publisher = {IEEE}, organization = {IEEE}, abstract = {We study the problem of query optimization in federated relational database systems. The nature of federated databases explicitly decouples many aspects of the optimization process, often making it imperative for the optimizer to consult underlying data sources while doing cost-based optimization. This not only increases the cost of optimization, but also significantly changes the trade-offs involved in the optimization process. The dominant cost in the decoupled optimization process is the "cost of costing" that traditionally has been considered insignificant. The optimizer can only afford a few rounds of messages to the underlying data sources, and hence the optimization techniques in this environment must be geared toward gathering all the required cost information with minimal communication. In this paper, we explore the design space for a query optimizer in this environment and demonstrate the need for decoupling various aspects of the optimization process. We present minimum-communication decoupled variants of various query optimization techniques, and discuss trade-offs in their performance in this scenario.
We have implemented these techniques in the Cohera federated database system, and our experimental results, somewhat surprisingly, indicate that a simple two-phase optimization scheme performs fairly well as long as the physical database design is known to the optimizer, though more aggressive algorithms are required otherwise.}, keywords = {Algorithm design and analysis, Cohera federated database, Computer science, Corporate acquisitions, Cost function, Database systems, decoupled optimization, Design optimization, distributed databases, federated databases, federated relational database systems, Internet, Query optimization, query optimizer, Query processing, Relational databases, Space exploration}, isbn = {0-7695-1531-2}, doi = {10.1109/ICDE.2002.994788}, author = {Deshpande, Amol and Hellerstein, J. M.} } @conference {16779, title = {Integrating distributed scientific data sources with MOCHA and XRoaster}, booktitle = {Thirteenth International Conference on Scientific and Statistical Database Management, 2001. SSDBM 2001. Proceedings}, year = {2001}, pages = {263 - 266}, publisher = {IEEE}, organization = {IEEE}, abstract = {MOCHA is a novel middleware system, developed at the University of Maryland, for integrating distributed data sources. MOCHA is based on the idea that the code that implements user-defined types and functions should be automatically deployed to remote sites by the middleware system itself. To this end, we have developed an XML-based framework to specify metadata about data sites, data sets, and user-defined types and functions. XRoaster is a graphical tool that we have developed to help the user create all the XML metadata elements to be used in MOCHA.}, keywords = {client-server systems, data sets, data sites, Databases, Distributed computing, distributed databases, distributed scientific data source integration, Educational institutions, graphical tool, hypermedia markup languages, IP networks, java, Large-scale systems, Maintenance engineering, meta data, metadata, Middleware, middleware system, MOCHA, Query processing, remote sites, scientific information systems, user-defined types, visual programming, XML, XML metadata elements, XML-based framework, XRoaster}, isbn = {0-7695-1218-6}, doi = {10.1109/SSDM.2001.938560}, author = {Rodriguez-Martinez, M. and Roussopoulos, Nick and McGann, J. M. and Kelley, S. and Mokwa, J. and White, B. and Jala, J.} } @article {16769, title = {Techniques for update handling in the enhanced client-server DBMS}, journal = {IEEE Transactions on Knowledge and Data Engineering}, volume = {10}, year = {1998}, month = {May}, pages = {458 - 476}, abstract = {The client-server computing paradigm has significantly influenced the way modern database management systems are designed and built. In such systems, clients maintain data pages in their main-memory caches, originating from the server's database. The Enhanced Client-Server architecture takes advantage of all the available client resources, including their long-term memory. Clients can cache server data into their own disk units if these data are part of their operational spaces. However, when updates occur at the server, a number of clients may need not only to be notified about these changes, but also to obtain portions of the updates. In this paper, we examine the problem of managing server-imposed updates that affect data cached on client disk managers.
We propose a number of server update propagation techniques in the context of the Enhanced Client-Server DBMS architecture, and examine the performance of these strategies through detailed simulation experiments. In addition, we study how the various settings of the network affect the performance of these policies.}, keywords = {client disk managers, client resources, client-server computing paradigm, client-server systems, Computational modeling, Computer architecture, concurrency control, data pages, Database systems, distributed databases, enhanced client-server DBMS, Hardware, Local area networks, long-term memory, main-memory caches, Network servers, operational spaces, Personal communication networks, server update propagation techniques, Transaction databases, update handling, Workstations, Yarn}, issn = {1041-4347}, doi = {10.1109/69.687978}, author = {Delis, A. and Roussopoulos, Nick} } @article {16740, title = {ADMS: a testbed for incremental access methods}, journal = {IEEE Transactions on Knowledge and Data Engineering}, volume = {5}, year = {1993}, month = {October}, pages = {762 - 774}, abstract = {ADMS is an advanced database management system developed to experiment with incremental access methods for large and distributed databases. It has been developed over the past eight years at the University of Maryland. The paper provides an overview of ADMS, and describes its capabilities and the performance attained by its incremental access methods. The paper also describes an enhanced client-server architecture that allows incremental gateway access to multiple heterogeneous commercial database management systems.}, keywords = {Add-drop multiplexers, ADMS, advanced database management system, client-server architecture, commercial database management systems, Computational modeling, Database systems, distributed databases, heterogeneous DBMS, incremental access methods, incremental gateway, Information retrieval, interoperability, join index, large databases, Navigation, network operating systems, Object oriented databases, Object oriented modeling, Query processing, System testing, very large databases, view index, Workstations}, issn = {1041-4347}, doi = {10.1109/69.243508}, author = {Roussopoulos, Nick and Economou, N. and Stamenas, A.} } @conference {16781, title = {An algebra and calculus for relational multidatabase systems}, booktitle = {First International Workshop on Interoperability in Multidatabase Systems, 1991. IMS '91. Proceedings}, year = {1991}, month = {April}, pages = {118 - 124}, publisher = {IEEE}, organization = {IEEE}, abstract = {With the existence of many autonomous databases widely accessible through computer networks, users will require the capability to jointly manipulate data in different databases. A multidatabase system provides such a capability through a multidatabase manipulation language. The authors propose a theoretical foundation for such languages by presenting a multirelational algebra and calculus based on the relational algebra and calculus.
The proposal is illustrated by various queries on an example multidatabase.}, keywords = {Algebra, autonomous databases, Calculus, Computer networks, Computer science, Data models, Data structures, Database systems, database theory, distributed databases, Military computing, multidatabase manipulation language, multidatabase system, multirelational algebra, query languages, relational algebra, Relational databases, Spatial databases, theoretical foundation}, isbn = {0-8186-2205-9}, doi = {10.1109/IMS.1991.153694}, author = {Grant, J. and Litwin, W. and Roussopoulos, Nick and Sellis, T.} } @article {16768, title = {A pipeline N-way join algorithm based on the 2-way semijoin program}, journal = {IEEE Transactions on Knowledge and Data Engineering}, volume = {3}, year = {1991}, month = {December}, pages = {486 - 495}, abstract = {The semijoin has been used as an effective operator for reducing data transmission and processing over a network; it allows forward size reduction of relations and of intermediate results generated during the processing of a distributed query. The authors propose a relational operator, the two-way semijoin, which enhances the semijoin with a backward size reduction capability for more cost-effective query processing. A pipeline N-way join algorithm for joining the reduced relations residing on N sites is introduced. The main advantage of this algorithm is that it eliminates the need for transferring and storing intermediate results among the sites. A set of experiments showing that the proposed algorithm outperforms all known conventional join algorithms that generate intermediate results is included.}, keywords = {2-way semijoin program, backward size reduction, Bandwidth, Computer networks, Costs, Data communication, data transmission, Database systems, database theory, Delay, distributed databases, distributed query, forward size reduction, intermediate results, Local area networks, network, Parallel algorithms, pipeline N-way join algorithm, pipeline processing, Pipelines, programming theory, Query processing, Relational databases, relational operator, sites, Workstations}, issn = {1041-4347}, doi = {10.1109/69.109109}, author = {Roussopoulos, Nick and Kang, H.} }