diff --git a/index.html b/index.html index d49000f..181d30d 100644 --- a/index.html +++ b/index.html @@ -6,7 +6,7 @@ content="We introduce token-wise consistency terms between the image content and object segmentation maps in training text-to-image models for enhanced multi-category instance composition and photorealism."> -
- Given a training prompt that faithfully describes an image, we adopt a POS tagger and Grounded SAM to extract all binary segmentation maps of the image corresponding to noun tokens from the prompt. Then, we jointly optimize the denoising U-Net of the diffusion model with both its original denoising and our grounding objective. + Given a training prompt that faithfully describes an image, we adopt a POS tagger and Grounded SAM to extract all binary segmentation maps of the image corresponding to noun tokens from the prompt. Then, we jointly optimize the denoising U-Net of the diffusion model with both its original denoising and our token-wise objective.
- @misc{wang2023tokencompose,
- title={TokenCompose: Grounding Diffusion with Token-level Supervision},
- author={Zirui Wang and Zhizhou Sha and Zheng Ding and Yilin Wang and Zhuowen Tu},
- year={2023},
- eprint={2312.03626},
- archivePrefix={arXiv},
- primaryClass={cs.CV}
+ @InProceedings{Wang2024TokenCompose,
+ author = {Wang, Zirui and Sha, Zhizhou and Ding, Zheng and Wang, Yilin and Tu, Zhuowen},
+ title = {{TokenCompose}: Text-to-Image Diffusion with Token-level Supervision},
+ booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = jun,
+ year = {2024},
+ pages = {8553--8564}
}