% ICCCD 2007 conference paper introducing T-REX ("The RDF EXtractor").
@inproceedings{17781,
  author    = {Albanese, M. and Subrahmanian, V. S.},
  title     = {{T-REX}: A Domain-Independent System for Automated Cultural Information Extraction},
  booktitle = {Proceedings of the First International Conference on Computational Cultural Dynamics ({ICCCD} 2007)},
  year      = {2007},
  abstract  = {RDF (Resource Description Framework) is a web standard defined by the World Wide Web Consortium. In RDF, we can define schemas of interest. For example, we can define a schema about tribes on the Pakistan-Afghanistan borderland, or a schema about violent events. An RDF instance is a set of facts that are compatible with the schema. The principal contribution of this paper is the development of a scalable system called T-REX (short for {\textquotedblleft}The RDF EXtractor{\textquotedblright}) that allows us to extract instances associated with a user-specified schema, independently of the domain about which we wish to extract data. Using T-REX, we have successfully extracted information about various aspects of about 20 tribes living in the Pakistan-Afghanistan border. Moreover, we have used T-REX to successfully extract occurrences of violent events from a set of 80 news sites in approximately 50 countries. T-REX scales well {\textendash} it has processed approximately 45,000 web pages per day for the last 6 months.},
}