@inproceedings{13381,
  title     = {Lineage processing over correlated probabilistic databases},
  booktitle = {Proceedings of the 2010 international conference on Management of data},
  series    = {SIGMOD {\textquoteright}10},
  year      = {2010},
  pages     = {675--686},
  publisher = {ACM},
  organization = {ACM},
  address   = {New York, NY, USA},
  abstract  = {In this paper, we address the problem of scalably evaluating conjunctive queries over correlated probabilistic databases containing tuple or attribute uncertainties. Like previous work, we adopt a two-phase approach where we first compute lineages of the output tuples, and then compute the probabilities of the lineage formulas. However unlike previous work, we allow for arbitrary and complex correlations to be present in the data, captured via a forest of junction trees. We observe that evaluating even read-once (tree structured) lineages (e.g., those generated by hierarchical conjunctive queries), polynomially computable over tuple independent probabilistic databases, is $\#$P-complete for lightly correlated probabilistic databases like Markov sequences. We characterize the complexity of exact computation of the probability of the lineage formula on a correlated database using a parameter called lwidth (analogous to the notion of treewidth). For lineages that result in low lwidth, we compute exact probabilities using a novel message passing algorithm, and for lineages that induce large lwidths, we develop approximate Monte Carlo algorithms to estimate the result probabilities. We scale our algorithms to very large correlated probabilistic databases using the previously proposed INDSEP data structure. To mitigate the complexity of lineage evaluation, we develop optimization techniques to process a batch of lineages by sharing computation across formulas, and to exploit any independence relationships that may exist in the data. Our experimental study illustrates the benefits of using our algorithms for processing lineage formulas over correlated probabilistic databases.},
  keywords  = {conjunctive queries, Indexing, junction trees, lineage, Probabilistic databases},
  isbn      = {978-1-4503-0032-2},
  doi       = {10.1145/1807167.1807241},
  url       = {http://doi.acm.org/10.1145/1807167.1807241},
  author    = {Kanagal, Bhargav and Deshpande, Amol}
}

@inproceedings{13375,
  title     = {Indexing correlated probabilistic databases},
  booktitle = {Proceedings of the 35th SIGMOD international conference on Management of data},
  series    = {SIGMOD {\textquoteright}09},
  year      = {2009},
  pages     = {455--468},
  publisher = {ACM},
  organization = {ACM},
  address   = {New York, NY, USA},
  abstract  = {With large amounts of correlated probabilistic data being generated in a wide range of application domains including sensor networks, information extraction, event detection etc., effectively managing and querying them has become an important research direction. While there is an exhaustive body of literature on querying independent probabilistic data, supporting efficient queries over large-scale, correlated databases remains a challenge. In this paper, we develop efficient data structures and indexes for supporting inference and decision support queries over such databases. Our proposed hierarchical data structure is suitable both for in-memory and disk-resident databases. We represent the correlations in the probabilistic database using a junction tree over the tuple-existence or attribute-value random variables, and use tree partitioning techniques to build an index structure over it. We show how to efficiently answer inference and aggregation queries using such an index, resulting in orders of magnitude performance benefits in most cases. In addition, we develop novel algorithms for efficiently keeping the index structure up-to-date as changes (inserts, updates) are made to the probabilistic database. We present a comprehensive experimental study illustrating the benefits of our approach to query processing in probabilistic databases.},
  keywords  = {caching, Indexing, inference queries, junction trees, Probabilistic databases},
  isbn      = {978-1-60558-551-2},
  doi       = {10.1145/1559845.1559894},
  url       = {http://doi.acm.org/10.1145/1559845.1559894},
  author    = {Kanagal, Bhargav and Deshpande, Amol}
}

@inproceedings{14251,
  title     = {Real-time shape retrieval for robotics using skip {Tri-Grams}},
  booktitle = {IEEE/RSJ International Conference on Intelligent Robots and Systems, 2009. IROS 2009},
  year      = {2009},
  month     = oct,
  pages     = {4731--4738},
  publisher = {IEEE},
  organization = {IEEE},
  abstract  = {The real time requirement is an additional constraint on many intelligent applications in robotics, such as shape recognition and retrieval using a mobile robot platform. In this paper, we present a scalable approach for efficiently retrieving closed contour shapes. The contour of an object is represented by piecewise linear segments. A skip Tri-Gram is obtained by selecting three segments in the clockwise order while allowing a constant number of segments to be {\textquotedblleft}skipped{\textquotedblright} in between. The main idea is to use skip Tri-Grams of the segments to implicitly encode the distant dependency of the shape. All skip Tri-Grams are used for efficiently retrieving closed contour shapes without pairwise matching feature points from two shapes. The retrieval is at least an order of magnitude faster than other state-of-the-art algorithms. We score 80\% in the Bullseye retrieval test on the whole MPEG 7 shape dataset. We further test the algorithm using a mobile robot platform in an indoor environment. 8 objects are used for testing from different viewing directions, and we achieve 82\% accuracy.},
  keywords  = {Bullseye retrieval test, Clocks, closed contour shape retrieval, Image retrieval, Image segmentation, Indexing, Information retrieval, Intelligent robots, Jacobian matrices, mobile robot, Mobile robots, MPEG 7 shape dataset, piecewise linear segments, Piecewise linear techniques, Real time systems, real-time shape retrieval, robot vision, SHAPE, shape recognition, shape representation, skip Tri-Grams, Testing},
  isbn      = {978-1-4244-3803-7},
  doi       = {10.1109/IROS.2009.5354738},
  author    = {Li, Yi and Bitsakos, K. and Ferm{\"u}ller, Cornelia and Aloimonos, J.}
}

@inproceedings{17877,
  title     = {{DiST}: fully decentralized indexing for querying distributed multidimensional datasets},
  booktitle = {Parallel and Distributed Processing Symposium, 2006. IPDPS 2006. 20th International},
  year      = {2006},
  month     = apr,
  publisher = {IEEE},
  organization = {IEEE},
  abstract  = {Grid computing and peer-to-peer (P2P) systems are emerging as new paradigms for managing large scale distributed resources across wide area networks. While grid computing focuses on managing heterogeneous resources and relies on centralized managers for resource and data discovery, P2P systems target scalable, decentralized methods for publishing and searching for data. In large distributed systems, a centralized resource manager is a potential performance bottleneck and decentralization can help avoid this bottleneck, as is done in P2P systems. However, the query functionality provided by most existing P2P systems is very rudimentary, and is not directly applicable to grid resource management. In this paper, we propose a fully decentralized multidimensional indexing structure, called DiST, that operates in a fully distributed environment with no centralized control. In DiST, each data server only acquires information about data on other servers from executing and routing queries. We describe the DiST algorithms for maintaining the decentralized network of data servers, including adding and deleting servers, the query routing algorithm, and failure recovery algorithms. We also evaluate the performance of the decentralized scheme against a more structured hierarchical indexing scheme that we have previously shown to perform well in distributed grid environments},
  keywords  = {Computer network management, distributed multidimensional dataset querying, failure recovery, fault tolerant computing, fully decentralized multidimensional indexing, grid computing, Indexing, large scale distributed resource management, Large-scale systems, Multidimensional systems, Network servers, P2P systems, Peer to peer computing, peer-to-peer computing, peer-to-peer systems, Publishing, Query processing, query routing, resource allocation, Resource management, telecommunication network routing, wide area networks},
  isbn      = {1-4244-0054-6},
  doi       = {10.1109/IPDPS.2006.1639280},
  author    = {Nam, Beomseok and Sussman, Alan}
}

@inproceedings{17888,
  title     = {Spatial indexing of distributed multidimensional datasets},
  booktitle = {IEEE International Symposium on Cluster Computing and the Grid, 2005. CCGrid 2005},
  volume    = {2},
  year      = {2005},
  month     = may,
  pages     = {743--750},
  publisher = {IEEE},
  organization = {IEEE},
  abstract  = {While declustering methods for distributed multidimensional indexing of large datasets have been researched widely in the past, replication techniques for multidimensional indexes have not been investigated deeply. In general, a centralized index server may become the performance bottleneck in a wide area network rather than the data servers, since the index is likely to be accessed more often than any of the datasets in the servers. In this paper, we present two different multidimensional indexing algorithms for a distributed environment - a centralized global index and a two-level hierarchical index. Our experimental results show that the centralized scheme does not scale well for either insertion or searching the index. In order to improve the scalability of the index server, we have employed a replication protocol for both the centralized and two-level index schemes that allows some inconsistency between replicas without affecting correctness. Our experiments show that the two-level hierarchical index scheme shows better scalability for both building and searching the index than the non-replicated centralized index, but replication can make the centralized index faster than the two-level hierarchical index for searching in some cases.},
  keywords  = {centralized global index algorithm, centralized index server, Computer science, database indexing, distributed databases, distributed multidimensional dataset, Educational institutions, File servers, Indexing, Large-scale systems, Multidimensional systems, Network servers, replication protocol, replication techniques, scalability, Sensor systems, spatial data structures, spatial indexing, two-level hierarchical index algorithm, wide area networks},
  isbn      = {0-7803-9074-1},
  doi       = {10.1109/CCGRID.2005.1558637},
  author    = {Nam, Beomseok and Sussman, Alan}
}

@inproceedings{15029,
  title     = {Strategies for exploring large scale data},
  booktitle = {Parallel Architectures, Algorithms and Networks, 2004. Proceedings. 7th International Symposium on},
  year      = {2004},
  month     = may,
  pages     = {2},
  abstract  = {We consider the problem of querying large scale multidimensional time series data to discover events of interest, test and validate hypotheses, or to associate temporal patterns with specific events. This type of data currently dominates most other types of available data, and will very likely become even more prevalent in the future given the current trends in collecting time series of business, scientific, demographic, and simulation data. The ability to explore such collections interactively, even at a coarse level, will be critical in discovering the information and knowledge embedded in such collections. We develop indexing techniques and search algorithms to efficiently handle temporal range value querying of multidimensional time series data. Our indexing uses linear space data structures that enable the handling of queries in I/O time that is essentially the same as that of handling a single time slice, assuming the availability of a logarithmic number of processors as a function of the temporal window. A data structure with provably almost optimal asymptotic bounds is also presented for the case when the number of multidimensional objects is relatively small. These techniques improve significantly over standard techniques for either serial or parallel processing, and are evaluated by extensive experimental results that confirm their superior performance.},
  keywords  = {algorithms;, association;, asymptotic, bounds;, business, data, data;, database, databases;, demographic, discovery;, Indexing, indexing;, information, knowledge, large, linear, mining;, multidimensional, objects;, optimal, Parallel, pattern, processing;, query, querying;, range, scale, scientific, search, serial, series, series;, simulation, space, structure;, structures;, techniques;, temporal, TIME, value, very, window;},
  doi       = {10.1109/ISPAN.2004.1300447},
  author    = {JaJa, Joseph F.}
}

@inproceedings{17878,
  title     = {Improving access to multi-dimensional self-describing scientific datasets},
  booktitle = {3rd IEEE/ACM International Symposium on Cluster Computing and the Grid, 2003. Proceedings. CCGrid 2003},
  year      = {2003},
  month     = may,
  pages     = {172--179},
  publisher = {IEEE},
  organization = {IEEE},
  abstract  = {Applications that query into very large multidimensional datasets are becoming more common. Many self-describing scientific data file formats have also emerged, which have structural metadata to help navigate the multi-dimensional arrays that are stored in the files. The files may also contain application-specific semantic metadata. In this paper, we discuss efficient methods for performing searches for subsets of multi-dimensional data objects, using semantic information to build multidimensional indexes, and group data items into properly sized chunks to maximize disk I/O bandwidth. This work is the first step in the design and implementation of a generic indexing library that will work with various high-dimension scientific data file formats containing semantic information about the stored data. To validate the approach, we have implemented indexing structures for NASA remote sensing data stored in the HDF format with a specific schema (HDF-EOS), and show the performance improvements that are gained from indexing the datasets, compared to using the existing HDF library for accessing the data.},
  keywords  = {Application software, application-specific semantic metadata, Bandwidth, Computer science, database indexing, disk I/O bandwidth, distributed databases, Educational institutions, Indexing, indexing structures, Libraries, meta data, Middleware, multidimensional arrays, multidimensional datasets, Multidimensional systems, NASA, NASA remote sensing data, Navigation, query formulation, self-describing scientific data file formats, structural metadata, very large databases},
  isbn      = {0-7695-1919-9},
  doi       = {10.1109/CCGRID.2003.1199366},
  author    = {Nam, Beomseok and Sussman, Alan}
}

@article{17842,
  title     = {Visualization of large data sets with the {Active Data Repository}},
  journal   = {IEEE Computer Graphics and Applications},
  volume    = {21},
  year      = {2001},
  month     = jul,
  pages     = {24--33},
  abstract  = {We implement ray-casting-based volume rendering and isosurface rendering methods using the Active Data Repository (ADR) for visualizing out-of-core data sets. We have developed the ADR object-oriented framework to provide support for applications that employ range queries with user-defined mapping and aggregation operations on large-scale multidimensional data. ADR targets distributed-memory parallel machines with one or more disks attached to each node. It is designed as a set of modular services implemented in C++, which can be customized for application-specific processing. The ADR runtime system supports common operations such as memory management, data retrieval, and scheduling of processing across a parallel machine},
  keywords  = {active data repository, ADR runtime system, Algorithm design and analysis, application-specific processing, C++, data mining, data retrieval, data visualisation, Data visualization, distributed-memory parallel machines, Indexing, Information retrieval, isosurface rendering, Isosurfaces, large data sets, large-scale multidimensional data, Memory management, modular services, out-of-core data sets, parallel machine, Parallel machines, Partitioning algorithms, ray-casting-based volume rendering, Rendering (computer graphics), Runtime, software libraries, storage management},
  issn      = {0272-1716},
  doi       = {10.1109/38.933521},
  author    = {Kurc, T. and Catalyurek, U. and Chang, Chialin and Sussman, Alan and Saltz, J.}
}

@inproceedings{17080,
  title     = {Developing the next generation of {Earth} science data systems: the {Global Land Cover Facility}},
  booktitle = {Geoscience and Remote Sensing Symposium, 1999. IGARSS {\textquoteright}99 Proceedings. IEEE 1999 International},
  volume    = {1},
  year      = {1999},
  pages     = {616--618},
  publisher = {IEEE},
  organization = {IEEE},
  abstract  = {A recent initiative by NASA has resulted in the formation of a federation of Earth science data partners. These Earth Science Information Partners (ESIPs) have been tasked with creating novel Earth science data products and services as well as distributing new and existing data sets to the Earth science community and the general public. The University of Maryland established its ESIP activities with the creation of the Global Land Cover Facility (GLCF). This joint effort of the Institute for Advanced Computer Studies (UMIACS) and the Department of Geography has developed an operational data archiving and distribution system aimed at advancing current land cover research efforts. The success of the GLCF is tied closely to assessing user needs as well as the timely delivery of data products to the research community. This paper discusses the development and implementation of a web-based interface that allows users to query the authors{\textquoteright} data holdings and perform user requested processing tasks on demand. The GLCF takes advantage of a scaleable, high performance computing architecture for the manipulation of very large remote sensing data sets and the rapid spatial indexing of multiple format data types. The user interface has been developed with the cooperation of the Human-Computer Interaction Laboratory (HCIL) and demonstrates advances in spatial and temporal querying tools as well as the ability to overlay multiple raster and vector data sets. Their work provides one perspective concerning how critical earth science data may be handled in the near future by a coalition of distributed data centers},
  keywords  = {Computer architecture, data archiving, data distribution system, Data systems, Distributed computing, Earth science data products, Earth science data system, ESIP, geographic information system, geographic information systems, Geography, geophysical measurement technique, geophysical signal processing, geophysical techniques, Geoscience, GIS, GLCF, Global Land Cover Facility, High performance computing, Indexing, information service, Information services, Institute for Advanced Computer Studies, land cover, NASA, next generation, PACS, Remote sensing, terrain mapping, UMIACS, University of Maryland, User interfaces, web-based interface},
  isbn      = {0-7803-5207-6},
  doi       = {10.1109/IGARSS.1999.773583},
  author    = {Lindsay, F. E. and Townshend, J. R. G. and JaJa, Joseph F. and Humphries, J. and Plaisant, Catherine and Shneiderman, Ben}
}