publications | Gen Li

Please refer to Google Scholar for the full publication list.

2026

Preprint

World Model for Robot Learning: A Comprehensive Survey

Bohan Hou^*, Gen Li^*, Jindou Jia^*, Tuo An^*, Xinying Guo^*, Sicong Leng, Haoran Geng, Yanjie Ze, Tatsuya Harada, Philip Torr, Oier Mees, Marc Pollefeys, Zhuang Liu, Jiajun Wu, Pieter Abbeel, Jitendra Malik, Yilun Du, and Jianfei Yang

arXiv, 2026

arXiv Bib Code Website

@article{wm-survey,
  author  = {Hou, Bohan and Li, Gen and Jia, Jindou and An, Tuo and Guo, Xinying and Leng, Sicong and Geng, Haoran and Ze, Yanjie and Harada, Tatsuya and Torr, Philip and Mees, Oier and Pollefeys, Marc and Liu, Zhuang and Wu, Jiajun and Abbeel, Pieter and Malik, Jitendra and Du, Yilun and Yang, Jianfei},
  title   = {World Model for Robot Learning: A Comprehensive Survey},
  journal = {arXiv},
  year    = {2026},
}

Preprint

CompassAD: Intent-Driven 3D Affordance Grounding in Functionally Competing Objects

Jingliang Li, Jindou Jia, Tuo An, Chuhao Zhou, Xiangyu Chen, Shilin Shan, Boyu Ma, Bofan Lyu, Gen Li^†, and Jianfei Yang^†

arXiv, 2026

arXiv Bib Code Website

@article{compassad,
  author  = {Li, Jingliang and Jia, Jindou and An, Tuo and Zhou, Chuhao and Chen, Xiangyu and Shan, Shilin and Ma, Boyu and Lyu, Bofan and Li, Gen and Yang, Jianfei},
  title   = {{CompassAD}: Intent-Driven {3D} Affordance Grounding in Functionally Competing Objects},
  journal = {arXiv},
  year    = {2026},
}

Preprint

Evo-0: Vision-Language-Action Model with Implicit Spatial Understanding

Tao Lin^*, Gen Li^*, Yilei Zhong, Yanwen Zou, Yuxin Du, Jiting Liu, Encheng Gu, and Bo Zhao

arXiv, 2026

arXiv Bib Code Website

@article{evo0,
  author  = {Lin, Tao and Li, Gen and Zhong, Yilei and Zou, Yanwen and Du, Yuxin and Liu, Jiting and Gu, Encheng and Zhao, Bo},
  title   = {Evo-0: Vision-Language-Action Model with Implicit Spatial Understanding},
  journal = {arXiv},
  year    = {2026},
}

RSS’26

Action-to-Action Flow Matching

Jindou Jia^*, Gen Li^*, Xiangyu Chen, Tuo An, Yuxuan Hu, Jingliang Li, Xinying Guo, and Jianfei Yang

In Robotics: Science and Systems, 2026

arXiv Bib Code Website

@inproceedings{a2a,
  author    = {Jia, Jindou and Li, Gen and Chen, Xiangyu and An, Tuo and Hu, Yuxuan and Li, Jingliang and Guo, Xinying and Yang, Jianfei},
  title     = {Action-to-Action Flow Matching},
  booktitle = {Robotics: Science and Systems},
  year      = {2026},
}

CVPR’26

Evo-1: Lightweight Vision-Language-Action Model with Preserved Semantic Alignment

Tao Lin, Yilei Zhong, Yuxin Du, Jingjing Zhang, Jiting Liu, Yinxinyu Chen, Encheng Gu, Ziyan Liu, Hongyi Cai, Yanwen Zou, Lixing Zou, Zhaoye Zhou, Gen Li^†, and Bo Zhao^†

In IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2026

arXiv Bib Code Website

@inproceedings{evo1,
  author    = {Lin, Tao and Zhong, Yilei and Du, Yuxin and Zhang, Jingjing and Liu, Jiting and Chen, Yinxinyu and Gu, Encheng and Liu, Ziyan and Cai, Hongyi and Zou, Yanwen and Zou, Lixing and Zhou, Zhaoye and Li, Gen and Zhao, Bo},
  title     = {Evo-1: Lightweight Vision-Language-Action Model with Preserved Semantic Alignment},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2026},
}

CVPR’26

PALM: Progress-Aware Policy Learning via Affordance Reasoning for Long-Horizon Robotic Manipulation

Yuanzhe Liu, Jingyuan Zhu, Yuchen Mo, Gen Li, Xu Cao, Jin Jin, Yifan Shen, Zhengyuan Li, Tianjiao Yu, Wenzhen Yuan, Fangqiang Ding, and Ismini Lourentzou

In IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2026

arXiv Bib

@inproceedings{palm,
  author    = {Liu, Yuanzhe and Zhu, Jingyuan and Mo, Yuchen and Li, Gen and Cao, Xu and Jin, Jin and Shen, Yifan and Li, Zhengyuan and Yu, Tianjiao and Yuan, Wenzhen and Ding, Fangqiang and Lourentzou, Ismini},
  title     = {{PALM}: Progress-Aware Policy Learning via Affordance Reasoning for Long-Horizon Robotic Manipulation},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2026},
}

AAAI’26

Mask2IV: Interaction-Centric Video Generation via Mask Trajectories

Gen Li, Bo Zhao, Jianfei Yang, and Laura Sevilla-Lara

In AAAI Conference on Artificial Intelligence, 2026

arXiv Bib Code Website

@inproceedings{Mask2IV,
  author    = {Li, Gen and Zhao, Bo and Yang, Jianfei and Sevilla-Lara, Laura},
  title     = {{Mask2IV}: Interaction-Centric Video Generation via Mask Trajectories},
  booktitle = {AAAI Conference on Artificial Intelligence},
  year      = {2026},
}

2025

ACM MM’25

Dual Enhancement on 3D Vision-Language Perception for Monocular 3D Visual Grounding

Yuzhen Li, Min Liu, Yuan Bian, Xueping Wang, Zhaoyang Li, Gen Li, and Yaonan Wang

In Proceedings of the 33rd ACM International Conference on Multimedia, 2025

arXiv Bib

@inproceedings{li2025dual,
  author    = {Li, Yuzhen and Liu, Min and Bian, Yuan and Wang, Xueping and Li, Zhaoyang and Li, Gen and Wang, Yaonan},
  title     = {Dual Enhancement on {3D} Vision-Language Perception for Monocular {3D} Visual Grounding},
  booktitle = {Proceedings of the 33rd ACM International Conference on Multimedia},
  year      = {2025},
}

ICCV’25

Learning Precise Affordances from Egocentric Videos for Robotic Manipulation

Gen Li, Nikolaos Tsagkas, Jifei Song, Ruaridh Mon-Williams, Sethu Vijayakumar, Kun Shao, and Laura Sevilla-Lara

In IEEE/CVF International Conference on Computer Vision, 2025

arXiv Bib Code Website

@inproceedings{Aff-Grasp,
  author    = {Li, Gen and Tsagkas, Nikolaos and Song, Jifei and Mon-Williams, Ruaridh and Vijayakumar, Sethu and Shao, Kun and Sevilla-Lara, Laura},
  title     = {Learning Precise Affordances from Egocentric Videos for Robotic Manipulation},
  booktitle = {IEEE/CVF International Conference on Computer Vision},
  year      = {2025},
}

ICCV’25

Principles of Visual Tokens for Efficient Video Understanding

Xinyue Hao, Gen Li, Shreyank N Gowda, Robert B Fisher, Jonathan Huang, Anurag Arnab, and Laura Sevilla-Lara

In IEEE/CVF International Conference on Computer Vision, 2025

arXiv Bib

@inproceedings{hao2024principles,
  author    = {Hao, Xinyue and Li, Gen and Gowda, Shreyank N and Fisher, Robert B and Huang, Jonathan and Arnab, Anurag and Sevilla-Lara, Laura},
  title     = {Principles of Visual Tokens for Efficient Video Understanding},
  year      = {2025},
  booktitle = {IEEE/CVF International Conference on Computer Vision},
}

IROS’25

Resource-Efficient Affordance Grounding with Complementary Depth and Semantic Prompts

Yizhou Huang, Fan Yang, Guoliang Zhu, Gen Li, Hao Shi, Yukun Zuo, Wenrui Chen, Zhiyong Li, and Kailun Yang

In International Conference on Intelligent Robots and Systems, 2025

arXiv Bib Code

@inproceedings{huang2025resource,
  author    = {Huang, Yizhou and Yang, Fan and Zhu, Guoliang and Li, Gen and Shi, Hao and Zuo, Yukun and Chen, Wenrui and Li, Zhiyong and Yang, Kailun},
  title     = {Resource-Efficient Affordance Grounding with Complementary Depth and Semantic Prompts},
  year      = {2025},
  booktitle = {International Conference on Intelligent Robots and Systems},
}

NMI

Embodied Large Language Models Enable Robots to Complete Complex Tasks in Unpredictable Environments

Ruaridh Mon-Williams^†, Gen Li^†, Ran Long, Wenqian Du, and Chris Lucas

Nature Machine Intelligence, 2025

Bib PDF Video Code

@article{ELLMER,
  author  = {Mon-Williams, Ruaridh and Li, Gen and Long, Ran and Du, Wenqian and Lucas, Chris},
  title   = {Embodied Large Language Models Enable Robots to Complete Complex Tasks in Unpredictable Environments},
  year    = {2025},
  journal = {Nature Machine Intelligence},
}

2024

ECCVW’24

Watt for what: Rethinking deep learning’s energy-performance relationship

Shreyank N Gowda, Xinyue Hao, Gen Li, Shashank Narayana Gowda, Xiaobo Jin, and Laura Sevilla-Lara

In European Conference on Computer Vision Workshop, 2024

Bib PDF

@inproceedings{gowda2025watt,
  author    = {Gowda, Shreyank N and Hao, Xinyue and Li, Gen and Gowda, Shashank Narayana and Jin, Xiaobo and Sevilla-Lara, Laura},
  title     = {Watt for what: Rethinking deep learning's energy-performance relationship},
  booktitle = {European Conference on Computer Vision Workshop},
  pages     = {388--405},
  year      = {2024},
  publisher = {Springer},
}

CVPR’24

One-Shot Open Affordance Learning with Foundation Models

Gen Li, Deqing Sun, Laura Sevilla-Lara, and Varun Jampani

In IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2024

arXiv Bib Code Website

@inproceedings{OOAL,
  author    = {Li, Gen and Sun, Deqing and Sevilla-Lara, Laura and Jampani, Varun},
  title     = {One-Shot Open Affordance Learning with Foundation Models},
  year      = {2024},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition},
}

2023

IJCNN’23

Referenceless User Controllable Semantic Image Synthesis

Jonghyun Kim, Gen Li, and Joongkyu Kim

In International Joint Conference on Neural Networks, 2023

arXiv Bib Code

@inproceedings{Refer,
  author    = {Kim, Jonghyun and Li, Gen and Kim, Joongkyu},
  title     = {Referenceless User Controllable Semantic Image Synthesis},
  year      = {2023},
  booktitle = {International Joint Conference on Neural Networks},
}

CVPR’23

LOCATE: Localize and Transfer Object Parts for Weakly Supervised Affordance Grounding

Gen Li, Varun Jampani, Deqing Sun, and Laura Sevilla-Lara

In IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2023

arXiv Bib Code Website

@inproceedings{LOCATE,
  author    = {Li, Gen and Jampani, Varun and Sun, Deqing and Sevilla-Lara, Laura},
  title     = {{LOCATE}: Localize and Transfer Object Parts for Weakly Supervised Affordance Grounding},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2023},
}

2021

CVPR’21

Adaptive Prototype Learning and Allocation for Few-Shot Segmentation

Gen Li, Varun Jampani, Laura Sevilla-Lara, Deqing Sun, Jonghyun Kim, and Joongkyu Kim

In IEEE/CVF Conference on Computer Vision and Pattern Recognition, 2021

arXiv Bib Code Website

@inproceedings{ASGNet,
  author    = {Li, Gen and Jampani, Varun and Sevilla-Lara, Laura and Sun, Deqing and Kim, Jonghyun and Kim, Joongkyu},
  title     = {Adaptive Prototype Learning and Allocation for Few-Shot Segmentation},
  year      = {2021},
  pages     = {8334--8343},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition},
}

BMVC’21

SuperStyleNet: Deep Image Synthesis with Superpixel Based Style Encoder

Jonghyun Kim, Gen Li, Cheolkon Jung, and Joongkyu Kim

In British Machine Vision Conference, 2021

arXiv Bib Code

@inproceedings{kim2021superstylenet,
  author    = {Kim, Jonghyun and Li, Gen and Jung, Cheolkon and Kim, Joongkyu},
  title     = {{SuperStyleNet}: Deep Image Synthesis with Superpixel Based Style Encoder},
  booktitle = {British Machine Vision Conference},
  year      = {2021},
}

Weakly-supervised temporal attention 3D network for human action recognition

Jonghyun Kim, Gen Li, Inyong Yun, Cheolkon Jung, and Joongkyu Kim

Pattern Recognition, 2021

Bib PDF

@article{KIM2021108068,
  author   = {Kim, Jonghyun and Li, Gen and Yun, Inyong and Jung, Cheolkon and Kim, Joongkyu},
  title    = {Weakly-supervised temporal attention {3D} network for human action recognition},
  journal  = {Pattern Recognition},
  volume   = {119},
  pages    = {108068},
  year     = {2021},
  issn     = {0031-3203},
  keywords = {Action recognition, Temporal attention, Convolutional neural network, Weakly-supervised learning, Video analysis, Video classification},
}

Neurocom

Edge and identity preserving network for face super-resolution

Jonghyun Kim, Gen Li, Inyong Yun, Cheolkon Jung, and Joongkyu Kim

Neurocomputing, 2021

arXiv Bib PDF Code

@article{KIM202111,
  author   = {Kim, Jonghyun and Li, Gen and Yun, Inyong and Jung, Cheolkon and Kim, Joongkyu},
  title    = {Edge and identity preserving network for face super-resolution},
  journal  = {Neurocomputing},
  volume   = {446},
  pages    = {11--22},
  year     = {2021},
  issn     = {0925-2312},
  keywords = {Super-resolution, Face hallucination, Edge block, Identity loss, Image enhancement},
}

2020

Access

Depth-Wise Asymmetric Bottleneck With Point-Wise Aggregation Decoder for Real-Time Semantic Segmentation in Urban Scenes

Gen Li, Shenlu Jiang, Inyong Yun, Jonghyun Kim, and Joongkyu Kim

IEEE Access, 2020

Bib PDF

@article{dab_access,
  author  = {Li, Gen and Jiang, Shenlu and Yun, Inyong and Kim, Jonghyun and Kim, Joongkyu},
  title   = {Depth-Wise Asymmetric Bottleneck With Point-Wise Aggregation Decoder for Real-Time Semantic Segmentation in Urban Scenes},
  journal = {IEEE Access},
  volume  = {8},
  pages   = {27495--27506},
  year    = {2020},
}

2019

BMVC’19

DABNet: Depth-wise asymmetric bottleneck for real-time semantic segmentation

Gen Li and Joongkyu Kim

In British Machine Vision Conference, 2019

arXiv Bib Code

@inproceedings{DABNet,
  author    = {Li, Gen and Kim, Joongkyu},
  title     = {{DABNet}: Depth-wise asymmetric bottleneck for real-time semantic segmentation},
  booktitle = {British Machine Vision Conference},
  year      = {2019},
}