Please refer to Google Scholar for the full publication list.
2026
Preprint
World Model for Robot Learning: A Comprehensive Survey
Bohan Hou*, Gen Li*, Jindou Jia*, Tuo An*, Xinying Guo*, Sicong Leng, Haoran Geng, Yanjie Ze, Tatsuya Harada, Philip Torr, Oier Mees, Marc Pollefeys, Zhuang Liu, Jiajun Wu, Pieter Abbeel, Jitendra Malik, Yilun Du, and Jianfei Yang
@article{wm-survey,title={World Model for Robot Learning: A Comprehensive Survey},author={Hou, Bohan and Li, Gen and Jia, Jindou and An, Tuo and Guo, Xinying and Leng, Sicong and Geng, Haoran and Ze, Yanjie and Harada, Tatsuya and Torr, Philip and Mees, Oier and Pollefeys, Marc and Liu, Zhuang and Wu, Jiajun and Abbeel, Pieter and Malik, Jitendra and Du, Yilun and Yang, Jianfei},year={2026},journal={arXiv},}
Preprint
CompassAD: Intent-Driven 3D Affordance Grounding in Functionally Competing Objects
Jingliang Li, Jindou Jia, Tuo An, Chuhao Zhou, Xiangyu Chen, Shilin Shan, Boyu Ma, Bofan Lyu, Gen Li†, and Jianfei Yang†
@article{compassad,title={CompassAD: Intent-Driven 3D Affordance Grounding in Functionally Competing Objects},author={Li, Jingliang and Jia, Jindou and An, Tuo and Zhou, Chuhao and Chen, Xiangyu and Shan, Shilin and Ma, Boyu and Lyu, Bofan and Li, Gen and Yang, Jianfei},year={2026},journal={arXiv},}
Preprint
Evo-0: Vision-Language-Action Model with Implicit Spatial Understanding
Tao Lin*, Gen Li*, Yilei Zhong, Yanwen Zou, Yuxin Du, Jiting Liu, Encheng Gu, and Bo Zhao
@article{evo0,title={Evo-0: Vision-Language-Action Model with Implicit Spatial Understanding},author={Lin, Tao and Li, Gen and Zhong, Yilei and Zou, Yanwen and Du, Yuxin and Liu, Jiting and Gu, Encheng and Zhao, Bo},year={2026},journal={arXiv},}
RSS’26
Action-to-Action Flow Matching
Jindou Jia*, Gen Li*, Xiangyu Chen, Tuo An, Yuxuan Hu, Jingliang Li, Xinying Guo, and Jianfei Yang
@inproceedings{a2a,title={Action-to-Action Flow Matching},author={Jia, Jindou and Li, Gen and Chen, Xiangyu and An, Tuo and Hu, Yuxuan and Li, Jingliang and Guo, Xinying and Yang, Jianfei},year={2026},booktitle={Robotics: Science and Systems},}
CVPR’26
Evo-1: Lightweight Vision-Language-Action Model with Preserved Semantic Alignment
Tao Lin, Yilei Zhong, Yuxin Du, Jingjing Zhang, Jiting Liu, Yinxinyu Chen, Encheng Gu, Ziyan Liu, Hongyi Cai, Yanwen Zou, Lixing Zou, Zhaoye Zhou, Gen Li†, and Bo Zhao†
In IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2026
@inproceedings{evo1,title={Evo-1: Lightweight Vision-Language-Action Model with Preserved Semantic Alignment},author={Lin, Tao and Zhong, Yilei and Du, Yuxin and Zhang, Jingjing and Liu, Jiting and Chen, Yinxinyu and Gu, Encheng and Liu, Ziyan and Cai, Hongyi and Zou, Yanwen and Zou, Lixing and Zhou, Zhaoye and Li, Gen and Zhao, Bo},year={2026},booktitle={IEEE/CVF Conference on Computer Vision and Pattern Recognition},}
CVPR’26
PALM: Progress-Aware Policy Learning via Affordance Reasoning for Long-Horizon Robotic Manipulation
Yuanzhe Liu, Jingyuan Zhu, Yuchen Mo, Gen Li, Xu Cao, Jin Jin, Yifan Shen, Zhengyuan Li, Tianjiao Yu, Wenzhen Yuan, Fangqiang Ding, and Ismini Lourentzou
In IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2026
@inproceedings{palm,title={PALM: Progress-Aware Policy Learning via Affordance Reasoning for Long-Horizon Robotic Manipulation},author={Liu, Yuanzhe and Zhu, Jingyuan and Mo, Yuchen and Li, Gen and Cao, Xu and Jin, Jin and Shen, Yifan and Li, Zhengyuan and Yu, Tianjiao and Yuan, Wenzhen and Ding, Fangqiang and Lourentzou, Ismini},year={2026},booktitle={IEEE/CVF Conference on Computer Vision and Pattern Recognition},}
AAAI’26
Mask2IV: Interaction-Centric Video Generation via Mask Trajectories
Gen Li, Bo Zhao, Jianfei Yang, and Laura Sevilla-Lara
In AAAI Conference on Artificial Intelligence, 2026
@inproceedings{Mask2IV,title={Mask2IV: Interaction-Centric Video Generation via Mask Trajectories},author={Li, Gen and Zhao, Bo and Yang, Jianfei and Sevilla-Lara, Laura},year={2026},booktitle={AAAI Conference on Artificial Intelligence},}
2025
ACM MM’25
Dual Enhancement on 3D Vision-Language Perception for Monocular 3D Visual Grounding
Yuzhen Li, Min Liu, Yuan Bian, Xueping Wang, Zhaoyang Li, Gen Li, and Yaonan Wang
In Proceedings of the 33rd ACM International Conference on Multimedia, 2025
@inproceedings{li2025dual,title={Dual Enhancement on 3D Vision-Language Perception for Monocular 3D Visual Grounding},author={Li, Yuzhen and Liu, Min and Bian, Yuan and Wang, Xueping and Li, Zhaoyang and Li, Gen and Wang, Yaonan},booktitle={Proceedings of the 33rd ACM International Conference on Multimedia},year={2025},}
ICCV’25
Learning Precise Affordances from Egocentric Videos for Robotic Manipulation
Gen Li, Nikolaos Tsagkas, Jifei Song, Ruaridh Mon-Williams, Sethu Vijayakumar, Kun Shao, and Laura Sevilla-Lara
In IEEE/CVF International Conference on Computer Vision, 2025
@inproceedings{Aff-Grasp,title={Learning Precise Affordances from Egocentric Videos for Robotic Manipulation},author={Li, Gen and Tsagkas, Nikolaos and Song, Jifei and Mon-Williams, Ruaridh and Vijayakumar, Sethu and Shao, Kun and Sevilla-Lara, Laura},year={2025},booktitle={IEEE/CVF International Conference on Computer Vision},}
ICCV’25
Principles of Visual Tokens for Efficient Video Understanding
Xinyue Hao, Gen Li, Shreyank N Gowda, Robert B Fisher, Jonathan Huang, Anurag Arnab, and Laura Sevilla-Lara
In IEEE/CVF International Conference on Computer Vision, 2025
@inproceedings{hao2024principles,title={Principles of Visual Tokens for Efficient Video Understanding},author={Hao, Xinyue and Li, Gen and Gowda, Shreyank N and Fisher, Robert B and Huang, Jonathan and Arnab, Anurag and Sevilla-Lara, Laura},booktitle={IEEE/CVF International Conference on Computer Vision},year={2025},}
IROS’25
Resource-Efficient Affordance Grounding with Complementary Depth and Semantic Prompts
Yizhou Huang, Fan Yang, Guoliang Zhu, Gen Li, Hao Shi, Yukun Zuo, Wenrui Chen, Zhiyong Li, and Kailun Yang
In International Conference on Intelligent Robots and Systems, 2025
@inproceedings{huang2025resource,title={Resource-Efficient Affordance Grounding with Complementary Depth and Semantic Prompts},author={Huang, Yizhou and Yang, Fan and Zhu, Guoliang and Li, Gen and Shi, Hao and Zuo, Yukun and Chen, Wenrui and Li, Zhiyong and Yang, Kailun},booktitle={International Conference on Intelligent Robots and Systems},year={2025},}
@article{ELLMER,title={Embodied Large Language Models Enable Robots to Complete Complex Tasks in Unpredictable Environments},author={Mon-Williams, Ruaridh and Li, Gen and Long, Ran and Du, Wenqian and Lucas, Chris},journal={Nature Machine Intelligence},year={2025},}
2024
ECCVW’24
Watt for what: Rethinking deep learning’s energy-performance relationship
Shreyank N Gowda, Xinyue Hao, Gen Li, Shashank Narayana Gowda, Xiaobo Jin, and Laura Sevilla-Lara
In European Conference on Computer Vision Workshop, 2024
@inproceedings{gowda2025watt,title={Watt for what: Rethinking deep learning’s energy-performance relationship},author={Gowda, Shreyank N and Hao, Xinyue and Li, Gen and Gowda, Shashank Narayana and Jin, Xiaobo and Sevilla-Lara, Laura},booktitle={European Conference on Computer Vision Workshop},pages={388--405},year={2024},organization={Springer},}
CVPR’24
One-Shot Open Affordance Learning with Foundation Models
Gen Li, Deqing Sun, Laura Sevilla-Lara, and Varun Jampani
In IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2024
@inproceedings{OOAL,title={One-Shot Open Affordance Learning with Foundation Models},author={Li, Gen and Sun, Deqing and Sevilla-Lara, Laura and Jampani, Varun},booktitle={IEEE/CVF Conference on Computer Vision and Pattern Recognition},year={2024},}
2023
IJCNN’23
Referenceless User Controllable Semantic Image Synthesis
Jonghyun Kim, Gen Li, and Joongkyu Kim
In International Joint Conference on Neural Networks, 2023
@inproceedings{Refer,title={Referenceless User Controllable Semantic Image Synthesis},author={Kim, Jonghyun and Li, Gen and Kim, Joongkyu},booktitle={International Joint Conference on Neural Networks},year={2023},}
CVPR’23
LOCATE: Localize and Transfer Object Parts for Weakly Supervised Affordance Grounding
Gen Li, Varun Jampani, Deqing Sun, and Laura Sevilla-Lara
In IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2023
@inproceedings{LOCATE,title={LOCATE: Localize and Transfer Object Parts for Weakly Supervised Affordance Grounding},author={Li, Gen and Jampani, Varun and Sun, Deqing and Sevilla-Lara, Laura},booktitle={IEEE/CVF Conference on Computer Vision and Pattern Recognition},year={2023},}
2021
CVPR’21
Adaptive Prototype Learning and Allocation for Few-Shot Segmentation
Gen Li, Varun Jampani, Laura Sevilla-Lara, Deqing Sun, Jonghyun Kim, and Joongkyu Kim
In IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2021
@inproceedings{ASGNet,title={Adaptive Prototype Learning and Allocation for Few-Shot Segmentation},author={Li, Gen and Jampani, Varun and Sevilla-Lara, Laura and Sun, Deqing and Kim, Jonghyun and Kim, Joongkyu},booktitle={IEEE/CVF Conference on Computer Vision and Pattern Recognition},pages={8334--8343},year={2021},}
BMVC’21
SuperStyleNet: Deep Image Synthesis with Superpixel Based Style Encoder
Jonghyun Kim, Gen Li, Cheolkon Jung, and Joongkyu Kim
@inproceedings{kim2021superstylenet,title={SuperStyleNet: Deep Image Synthesis with Superpixel Based Style Encoder},author={Kim, Jonghyun and Li, Gen and Jung, Cheolkon and Kim, Joongkyu},booktitle={British Machine Vision Conference},year={2021},}
PR
Weakly-supervised temporal attention 3D network for human action recognition
Jonghyun Kim, Gen Li, Inyong Yun, Cheolkon Jung, and Joongkyu Kim
@article{KIM2021108068,title={Weakly-supervised temporal attention 3D network for human action recognition},journal={Pattern Recognition},volume={119},pages={108068},year={2021},issn={0031-3203},author={Kim, Jonghyun and Li, Gen and Yun, Inyong and Jung, Cheolkon and Kim, Joongkyu},keywords={Action recognition, Temporal attention, Convolutional neural network, Weakly-supervised learning, Video analysis, Video classification},}
Neurocom
Edge and identity preserving network for face super-resolution
Jonghyun Kim, Gen Li, Inyong Yun, Cheolkon Jung, and Joongkyu Kim
@article{KIM202111,title={Edge and identity preserving network for face super-resolution},journal={Neurocomputing},volume={446},pages={11-22},year={2021},issn={0925-2312},author={Kim, Jonghyun and Li, Gen and Yun, Inyong and Jung, Cheolkon and Kim, Joongkyu},keywords={Super-resolution, Face hallucination, Edge block, Identity loss, Image enhancement},}
2020
Access
Depth-Wise Asymmetric Bottleneck With Point-Wise Aggregation Decoder for Real-Time Semantic Segmentation in Urban Scenes
Gen Li, Shenlu Jiang, Inyong Yun, Jonghyun Kim, and Joongkyu Kim
@article{dab_access,title={Depth-Wise Asymmetric Bottleneck With Point-Wise Aggregation Decoder for Real-Time Semantic Segmentation in Urban Scenes},author={Li, Gen and Jiang, Shenlu and Yun, Inyong and Kim, Jonghyun and Kim, Joongkyu},journal={IEEE Access},year={2020},volume={8},number={},pages={27495-27506},}
2019
BMVC’19
DABNet: Depth-wise asymmetric bottleneck for real-time semantic segmentation
@inproceedings{DABNet,title={DABNet: Depth-wise asymmetric bottleneck for real-time semantic segmentation},author={Li, Gen and Kim, Joongkyu},booktitle={British Machine Vision Conference},year={2019},}