@article{csse.2023.035119,
  author   = {Zhang, Xiaorui and Zeng, Xianglong and Sun, Wei and Ren, Yongjun and Xu, Tong},
  title    = {Multimodal Spatiotemporal Feature Map for Dynamic Gesture Recognition},
  journal  = {Computer Systems Science and Engineering},
  year     = {2023},
  volume   = {46},
  number   = {1},
  pages    = {671--686},
  doi      = {10.32604/csse.2023.035119},
  url      = {http://www.techscience.com/csse/v46n1/51347},
  abstract = {Gesture recognition technology enables machines to read human gestures and has significant application prospects in the fields of human-computer interaction and sign language translation. Existing researches usually use convolutional neural networks to extract features directly from raw gesture data for gesture recognition, but the networks are affected by much interference information in the input data and thus fit to some unimportant features. In this paper, we proposed a novel method for encoding spatio-temporal information, which can enhance the key features required for gesture recognition, such as shape, structure, contour, position and hand motion of gestures, thereby improving the accuracy of gesture recognition. This encoding method can encode arbitrarily multiple frames of gesture data into a single frame of the spatio-temporal feature map and use the spatio-temporal feature map as the input to the neural network. This can guide the model to fit important features while avoiding the use of complex recurrent network structures to extract temporal features. In addition, we designed two sub-networks and trained the model using a sub-network pre-training strategy that trains the sub-networks first and then the entire network, so as to avoid the sub-networks focusing too much on the information of a single category feature and being overly influenced by each other's features. Experimental results on two public gesture datasets show that the proposed spatio-temporal information encoding method achieves advanced accuracy.},
}