% Journal article record. Author names are joined with the keyword "and";
% inside one name a comma only separates the family name from the given name,
% so the list below is parsed as five distinct people.
@Article{cmc.2020.011886,
  AUTHOR = {Li, Yugang and Sun, Haibo and Chen, Zhe and Ding, Yudan and Zhou, Siqi},
  TITLE = {Stacked Attention Networks for Referring Expressions Comprehension},
  JOURNAL = {Computers, Materials \& Continua},
  VOLUME = {65},
  YEAR = {2020},
  NUMBER = {3},
  PAGES = {2529--2541},
  URL = {http://www.techscience.com/cmc/v65n3/40185},
  ISSN = {1546-2226},
  ABSTRACT = {Referring expressions comprehension is the task of locating the image region
described by a natural language expression, which refer to the properties of the region or
the relationships with other regions. Most previous work handles this problem by
selecting the most relevant regions from a set of candidate regions, when there are many
candidate regions in the set these methods are inefficient. Inspired by recent success of
image captioning by using deep learning methods, in this paper we proposed a framework
to understand the referring expressions by multiple steps of reasoning. We present a
model for referring expressions comprehension by selecting the most relevant region
directly from the image. The core of our model is a recurrent attention network which can
be seen as an extension of Memory Network. The proposed model capable of improving
the results by multiple computational hops. We evaluate the proposed model on two
referring expression datasets: Visual Genome and Flickr30k Entities. The experimental
results demonstrate that the proposed model outperform previous state-of-the-art methods
both in accuracy and efficiency. We also conduct an ablation experiment to show that the
performance of the model is not getting better with the increase of the attention layers.},
  DOI = {10.32604/cmc.2020.011886}
}