% Journal article exported while flagged "Just Accepted" (see the note field);
% this entry carries no volume, number, or pages fields.
% month uses the predefined BibTeX macro (unbraced) so styles can expand/localise it.
@article{10.1145/3687129,
  author    = {Panda, Aditya and Mukherjee, Dipti Prasad},
  title     = {Knowledge Guided Transformer Network for Compositional Zero-shot Learning},
  journal   = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  year      = {2024},
  month     = aug,
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  issn      = {1551-6857},
  doi       = {10.1145/3687129},
  url       = {https://doi.org/10.1145/3687129},
  note      = {Just Accepted},
  keywords  = {Compositionality, Compositional zero-shot learning, state-object composition, partial association},
  abstract  = {Compositional Zero-shot Learning (CZSL) attempts to recognise images of new compositions of states and objects, when images of only a subset of state-object compositions are available as training data. An example of CZSL is to recognise images of peeled apple by a model when it is trained using images of peeled orange, ripe apple and ripe orange. There are two major challenges in solving CZSL. First, the visual features of state vary depending on the context in a state-object composition. For example state like ripe produces distinct visual properties in the compositions ripe orange and ripe banana. Hence, understanding the context dependency of state features is a necessary requirement to solve CZSL. Second, the extent of association between the features of a state and an object vary significantly in different images of same composition. For example in different images of peeled orange, the oranges may be peeled to different extents. As a consequence, the visual features of images of the class peeled orange may vary. Hence there exists significant amount of intra-class variability among the visual features of different images of a composition. Existing approaches merely look for existence or absence of features of a particular state or object in a composition. Our approach not only looks for the existence of a particular state features or object features but also the extent of association of state features and object features to better tackle the intra-class variability in visual features of compositional images. The proposed architecture is constructed using a novel Knowledge Guided Transformer. The transformer based framework is utilised for processing larger context dependency between the state and object. Extensive experiments on C-GQA, MIT-States and UT-Zappos50k datasets demonstrate the superiority of the proposed approach in comparison with the state-of-the-arts in both open-world and closed-world CZSL settings.},
}