% Conference paper (ICIP 2008): unsupervised learning of action-phrase vocabularies for video summarization.
@inproceedings{12543,
  author    = {Turaga, Pavan and Chellappa, Rama},
  title     = {Learning action dictionaries from video},
  booktitle = {15th {IEEE} International Conference on Image Processing ({ICIP} 2008)},
  year      = {2008},
  month     = oct,
  pages     = {1704--1707},
  doi       = {10.1109/ICIP.2008.4712102},
  keywords  = {(artificial, action, action-phrases;learning, automated, decomposition;video, dictionaries;spatial, intelligence);video, segment, segmentation;learning, sequence;computer, Surveillance, surveillance;, systems;computer, transforms;video, vision;image, vision;independent},
  abstract  = {Summarizing the contents of a video containing human activities is an important problem in computer vision and has important applications in automated surveillance systems. Summarizing a video requires one to identify and learn a {\textquoteleft}vocabulary{\textquoteright} of action-phrases corresponding to specific events and actions occurring in the video. We propose a generative model for dynamic scenes containing human activities as a composition of independent action-phrases - each of which is derived from an underlying vocabulary. Given a long video sequence, we propose a completely unsupervised approach to learn the vocabulary. Once the vocabulary is learnt, a video segment can be decomposed into a collection of phrases for summarization. We then describe methods to learn the correlations between activities and sequentiality of events. We also propose a novel method for building invariances to spatial transforms in the summarization scheme.},
}