@conference {13385, title = {Minimizing Communication Cost in Distributed Multi-query Processing}, booktitle = {IEEE 25th International Conference on Data Engineering, 2009. ICDE {\textquoteright}09}, year = {2009}, month = {2009/03/29/}, pages = {772 - 783}, publisher = {IEEE}, organization = {IEEE}, abstract = {Increasing prevalence of large-scale distributed monitoring and computing environments such as sensor networks, scientific federations, Grids etc., has led to a renewed interest in the area of distributed query processing and optimization. In this paper we address a general, distributed multi-query processing problem motivated by the need to minimize the communication cost in these environments. Specifically we address the problem of optimally sharing data movement across the communication edges in a distributed communication network given a set of overlapping queries and query plans for them (specifying the operations to be executed). Most of the problem variations of our general problem can be shown to be NP-Hard by a reduction from the Steiner tree problem. However, we show that the problem can be solved optimally if the communication network is a tree, and present a novel algorithm for finding an optimal data movement plan. For general communication networks, we present efficient approximation algorithms for several variations of the problem. Finally, we present an experimental study over synthetic datasets showing both the need for exploiting the sharing of data movement and the effectiveness of our algorithms at finding such plans.}, keywords = {Approximation algorithms, Communication networks, Computer science, Cost function, Data engineering, distributed communication network, distributed databases, distributed multi-query processing, grid computing, Large-scale systems, NP-hard, optimisation, Polynomials, Publish-subscribe, publish-subscribe systems, Query optimization, Query processing, sensor networks, Steiner tree problem, Tree graphs, trees (mathematics)}, isbn = {978-1-4244-3422-0}, doi = {10.1109/ICDE.2009.85}, author = {Li,Jian and Deshpande, Amol and Khuller, Samir} } @conference {17877, title = {DiST: fully decentralized indexing for querying distributed multidimensional datasets}, booktitle = {Parallel and Distributed Processing Symposium, 2006. IPDPS 2006. 20th International}, year = {2006}, month = {2006/04//}, publisher = {IEEE}, organization = {IEEE}, abstract = {Grid computing and peer-to-peer (P2P) systems are emerging as new paradigms for managing large scale distributed resources across wide area networks. While grid computing focuses on managing heterogeneous resources and relies on centralized managers for resource and data discovery, P2P systems target scalable, decentralized methods for publishing and searching for data. In large distributed systems, a centralized resource manager is a potential performance bottleneck and decentralization can help avoid this bottleneck, as is done in P2P systems. However, the query functionality provided by most existing P2P systems is very rudimentary, and is not directly applicable to grid resource management. In this paper, we propose a fully decentralized multidimensional indexing structure, called DiST, that operates in a fully distributed environment with no centralized control. In DiST, each data server only acquires information about data on other servers from executing and routing queries. 
We describe the DiST algorithms for maintaining the decentralized network of data servers, including adding and deleting servers, the query routing algorithm, and failure recovery algorithms. We also evaluate the performance of the decentralized scheme against a more structured hierarchical indexing scheme that we have previously shown to perform well in distributed grid environments}, keywords = {Computer network management, distributed multidimensional dataset querying, failure recovery, fault tolerant computing, fully decentralized multidimensional indexing, grid computing, Indexing, large scale distributed resource management, Large-scale systems, Multidimensional systems, Network servers, P2P systems, Peer to peer computing, peer-to-peer computing, peer-to-peer systems, Publishing, Query processing, query routing, resource allocation, Resource management, telecommunication network routing, wide area networks}, isbn = {1-4244-0054-6}, doi = {10.1109/IPDPS.2006.1639280}, author = {Nam,Beomseok and Sussman, Alan} } @conference {17888, title = {Spatial indexing of distributed multidimensional datasets}, booktitle = {IEEE International Symposium on Cluster Computing and the Grid, 2005. CCGrid 2005}, volume = {2}, year = {2005}, month = {2005/05/09/12}, pages = {743 - 750}, publisher = {IEEE}, organization = {IEEE}, abstract = {While declustering methods for distributed multidimensional indexing of large datasets have been researched widely in the past, replication techniques for multidimensional indexes have not been investigated deeply. In general, a centralized index server may become the performance bottleneck in a wide area network rather than the data servers, since the index is likely to be accessed more often than any of the datasets in the servers. In this paper, we present two different multidimensional indexing algorithms for a distributed environment - a centralized global index and a two-level hierarchical index. Our experimental results show that the centralized scheme does not scale well for either insertion or searching the index. In order to improve the scalability of the index server, we have employed a replication protocol for both the centralized and two-level index schemes that allows some inconsistency between replicas without affecting correctness. Our experiments show that the two-level hierarchical index scheme shows better scalability for both building and searching the index than the non-replicated centralized index, but replication can make the centralized index faster than the two-level hierarchical index for searching in some cases.}, keywords = {centralized global index algorithm, centralized index server, Computer science, database indexing, distributed databases, distributed multidimensional dataset, Educational institutions, File servers, Indexing, Large-scale systems, Multidimensional systems, Network servers, replication protocol, replication techniques, scalability, Sensor systems, spatial data structures, spatial indexing, two-level hierarchical index algorithm, wide area networks}, isbn = {0-7803-9074-1}, doi = {10.1109/CCGRID.2005.1558637}, author = {Nam,B. and Sussman, Alan} } @conference {17588, title = {Efficient algorithms for location and sizing problems in network design}, booktitle = {IEEE Global Telecommunications Conference, 2001. 
GLOBECOM {\textquoteright}01}, volume = {4}, year = {2001}, month = {2001///}, pages = {2586 - 2590}, publisher = {IEEE}, organization = {IEEE}, abstract = {Large-scale location, sizing and homing problems of distributed network elements, have received much attention recently due to the massive deployment of broadband communication networks for services like Internet telephony and Web caching. Key considerations in designing these networks include modularity of capacity, economies of scale in cost, and reliability. We formulate a general class of such network design problems as Mixed-Integer Programs. These problems are computationally intractable in general; under various asymptotic conditions, we show how to compute near-optimal solutions. To deal with arbitrary instances, we develop new algorithms based on linear programming, as well as greedy randomized adaptive search. These algorithms achieved near-optimal solutions with reasonable computation time for our experiments}, keywords = {Algorithm design and analysis, Broadband communication, broadband communication networks, broadband networks, capacity modularity, Communication networks, computer network reliability, distributed network elements, Economies of scale, greedy randomized adaptive search, homing, integer programming, Intelligent networks, Internet telephony, large-scale location problems, Large-scale systems, Linear programming, mixed-integer programs, near-optimal solutions, network design, Optical switches, randomised algorithms, reliability, search problems, sizing, Technological innovation, telecommunication network planning, Web caching}, isbn = {0-7803-7206-9}, doi = {10.1109/GLOCOM.2001.966243}, author = {Kumaran,K. and Srinivasan, Aravind and Qiong Wang and Lanning,S. and Ramakrishnan,K. G} } @conference {16779, title = {Integrating distributed scientific data sources with MOCHA and XRoaster}, booktitle = {Thirteenth International Conference on Scientific and Statistical Database Management, 2001. SSDBM 2001. Proceedings}, year = {2001}, month = {2001///}, pages = {263 - 266}, publisher = {IEEE}, organization = {IEEE}, abstract = {MOCHA is a novel middleware system for integrating distributed data sources that we have developed at the University of Maryland. MOCHA is based on the idea that the code that implements user-defined types and functions should be automatically deployed to remote sites by the middleware system itself. To this end, we have developed an XML-based framework to specify metadata about data sites, data sets, and user-defined types and functions. XRoaster is a graphical tool that we have developed to help the user create all the XML metadata elements to be used in MOCHA}, keywords = {client-server systems, data sets, data sites, Databases, Distributed computing, distributed databases, distributed scientific data source integration, Educational institutions, graphical tool, hypermedia markup languages, IP networks, java, Large-scale systems, Maintenance engineering, meta data, metadata, Middleware, middleware system, MOCHA, Query processing, remote sites, scientific information systems, user-defined types, visual programming, XML, XML metadata elements, XML-based framework, XRoaster}, isbn = {0-7695-1218-6}, doi = {10.1109/SSDM.2001.938560}, author = {Rodriguez-Martinez,M. and Roussopoulos, Nick and McGann,J. M and Kelley,S. and Mokwa,J. and White,B. 
and Jala,J.} } @article {16297, title = {An experiment to assess the cost-benefits of code inspections in large scale software development}, journal = {IEEE Transactions on Software Engineering}, volume = {23}, year = {1997}, month = {1997/06//}, pages = {329 - 346}, abstract = {We conducted a long term experiment to compare the costs and benefits of several different software inspection methods. These methods were applied by professional developers to a commercial software product they were creating. Because the laboratory for this experiment was a live development effort, we took special care to minimize cost and risk to the project, while maximizing our ability to gather useful data. The article has several goals: (1) to describe the experiment{\textquoteright}s design and show how we used simulation techniques to optimize it; (2) to present our results and discuss their implications for both software practitioners and researchers; and (3) to discuss several new questions raised by our findings. For each inspection, we randomly assigned three independent variables: (1) the number of reviewers on each inspection team (1, 2, or 4); (2) the number of teams inspecting the code unit (1 or 2); and (3) the requirement that defects be repaired between the first and second team{\textquoteright}s inspections. The reviewers for each inspection were randomly selected without replacement from a pool of 11 experienced software developers. The dependent variables for each inspection included inspection interval (elapsed time), total effort, and the defect detection rate. Our results showed that these treatments did not significantly influence the defect detection effectiveness, but that certain combinations of changes dramatically increased the inspection interval}, keywords = {Analysis of variance, code inspection cost benefits, code unit, commercial software product, Computer Society, cost-benefit analysis, Costs, defect detection effectiveness, defect detection rate, Design optimization, experienced software developers, experiment design, independent variables, Inspection, inspection interval, inspection team, Laboratories, large scale software development, Large-scale systems, live development effort, long term experiment, professional aspects, professional developers, Programming, reviewers, simulation techniques, software cost estimation, software inspection methods, software practitioners, Software quality, Switches}, isbn = {0098-5589}, doi = {10.1109/32.601071}, author = {Porter, Adam and Siy,H. P and Toman,C. A and Votta,L. G.} } @article {14816, title = {The Paradyn parallel performance measurement tool}, journal = {Computer}, volume = {28}, year = {1995}, month = {1995/11//}, pages = {37 - 46}, abstract = {Paradyn is a tool for measuring the performance of large-scale parallel programs. Our goal in designing a new performance tool was to provide detailed, flexible performance information without incurring the space (and time) overhead typically associated with trace-based tools. Paradyn achieves this goal by dynamically instrumenting the application and automatically controlling this instrumentation in search of performance problems. 
Dynamic instrumentation lets us defer insertion until the moment it is needed (and remove it when it is no longer needed); Paradyn{\textquoteright}s Performance Consultant decides when and where to insert instrumentation}, keywords = {Aerodynamics, Automatic control, automatic instrumentation control, Debugging, dynamic instrumentation, flexible performance information, high level languages, insertion, Instruments, large-scale parallel program, Large-scale systems, measurement, Paradyn parallel performance measurement tool, Parallel machines, parallel programming, Performance Consultant, Programming profession, scalability, software performance evaluation, software tools}, isbn = {0018-9162}, doi = {10.1109/2.471178}, author = {Miller, B. P and Callaghan, M. D and Cargille, J. M and Hollingsworth, Jeffrey K and Irvin, R. B and Karavanic, K. L and Kunchithapadam, K. and Newhall, T.} } @conference {14760, title = {Dynamic program instrumentation for scalable performance tools}, booktitle = {Scalable High-Performance Computing Conference, 1994., Proceedings of the}, year = {1994}, month = {1994/05//}, pages = {841 - 850}, publisher = {IEEE}, organization = {IEEE}, abstract = {Presents a new technique called {\textquoteleft}dynamic instrumentation{\textquoteright} that provides efficient, scalable, yet detailed data collection for large-scale parallel applications. Our approach is unique because it defers inserting any instrumentation until the application is in execution. We can insert or change instrumentation at any time during execution by modifying the application{\textquoteright}s binary image. Only the instrumentation required for the currently selected analysis or visualization is inserted. As a result, our technique collects several orders of magnitude less data than traditional data collection approaches. We have implemented a prototype of our dynamic instrumentation on the CM-5, and present results for several real applications. In addition, we include recommendations to operating system designers, compiler writers, and computer architects about the features necessary to permit efficient monitoring of large-scale parallel systems}, keywords = {Application software, binary image, compiler writing, Computer architecture, Computer displays, Computerized monitoring, Concurrent computing, data acquisition, data collection, data visualisation, Data visualization, dynamic program instrumentation, efficient monitoring, executing program, Instruments, large-scale parallel applications, Large-scale systems, operating system design, Operating systems, parallel programming, program analysis, program diagnostics, program visualization, Programming profession, Sampling methods, scalable performance tools, software tools}, isbn = {0-8186-5680-8}, doi = {10.1109/SHPCC.1994.296728}, author = {Hollingsworth, Jeffrey K and Miller, B. P and Cargille, J.} } @article {14189, title = {A syntactic approach to scale-space-based corner description}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, volume = {16}, year = {1994}, month = {1994/07//}, pages = {748 - 751}, abstract = {Planar curves are described by information about corners integrated over various levels of resolution. The detection of corners takes place on a digital representation. 
To compensate for ambiguities arising from sampling problems due to the discreteness, results about the local behavior of curvature extrema in continuous scale-space are employed}, keywords = {Computer vision, corner detection, curvature extrema, edge detection, IMAGE PROCESSING, image resolution, Image segmentation, Laboratories, Large-scale systems, PARALLEL PROCESSING, pattern recognition, planar curves, resolution, Sampling methods, sampling problems, scale space based corner description, SHAPE, Smoothing methods, syntactic approach}, isbn = {0162-8828}, doi = {10.1109/34.297957}, author = {Ferm{\"u}ller, Cornelia and Kropatsch,W.} } @conference {16320, title = {Software metric classification trees help guide the maintenance of large-scale systems}, booktitle = {Conference on Software Maintenance, 1989., Proceedings}, year = {1989}, month = {1989/10/16/19}, pages = {116 - 123}, publisher = {IEEE}, organization = {IEEE}, abstract = {The 80:20 rule states that approximately 20\% of a software system is responsible for 80\% of its errors. The authors propose an automated method for generating empirically-based models of error-prone software objects. These models are intended to help localize the troublesome 20\%. The method uses a recursive algorithm to automatically generate classification trees whose nodes are multivalued functions based on software metrics. The purpose of the classification trees is to identify components that are likely to be error prone or costly, so that developers can focus their resources accordingly. A feasibility study was conducted using 16 NASA projects. On average, the classification trees correctly identified 79.3\% of the software modules that had high development effort or faults}, keywords = {automated method, automatic programming, classification, Classification tree analysis, classification trees, Computer errors, empirically-based models, error-prone software objects, Fault diagnosis, feasibility study, high development effort, Large-scale systems, multivalued functions, NASA, NASA projects, recursive algorithm, Software algorithms, software engineering, Software maintenance, Software measurement, software metrics, software modules, Software systems, trees (mathematics)}, isbn = {0-8186-1965-1}, doi = {10.1109/ICSM.1989.65202}, author = {Selby,R. W and Porter, Adam} }