@Manual{blender,
title = {Blender - a 3D modelling and rendering package},
author = {Blender Online Community},
organization = {Blender Foundation},
address = {Stichting Blender Foundation, Amsterdam},
year = {2019},
url = {http://www.blender.org},
}
@Manual{visii,
title = {ViSII - A VIrtual Scene Imaging Interface},
author = {Nathan V. Morrical},
year = {2020},
url = {https://github.com/owl-project/ViSII},
}
@article{straub2019replica,
title={The Replica Dataset: A Digital Replica of Indoor Spaces},
author={Straub, Julian and Whelan, Thomas and Ma, Lingni and Chen, Yufan and Wijmans, Erik and Green, Simon and Engel, Jakob J and Mur-Artal, Raul and Ren, Carl and Verma, Shobhit and others},
journal={arXiv preprint arXiv:1906.05797},
doi = {10.48550/ARXIV.1906.05797},
year={2019}
}
@Article{suncg,
author = {Shuran Song and Fisher Yu and Andy Zeng and Angel X. Chang and Manolis Savva and Thomas Funkhouser},
title = {Semantic Scene Completion from a Single Depth Image},
date = {2016-11-28},
eprint = {1611.08974v1},
eprintclass = {cs.CV},
eprinttype = {arXiv},
doi = {10.48550/ARXIV.1611.08974},
}
@Article{Sundermeyer2019,
author = {Martin Sundermeyer and Zoltan-Csaba Marton and Maximilian Durner and Manuel Brucker and Rudolph Triebel},
title = {Implicit 3D Orientation Learning for 6D Object Detection from RGB Images},
date = {2019-02-04},
eprint = {1902.01275v2},
eprintclass = {cs.CV},
eprinttype = {arXiv},
doi = {10.48550/ARXIV.1902.01275},
}
@inproceedings{habitat,
title = {Habitat: {A} {P}latform for {E}mbodied {AI} {R}esearch},
author = {{Manolis Savva*} and {Abhishek Kadian*} and {Oleksandr Maksymets*} and Yili Zhao and Erik Wijmans and Bhavana Jain and Julian Straub and Jia Liu and Vladlen Koltun and Jitendra Malik and Devi Parikh and Dhruv Batra},
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
doi={10.1109/ICCV.2019.00943},
year = {2019}
}
@Article{Su2015,
author = {Hao Su and Charles R. Qi and Yangyan Li and Leonidas Guibas},
title = {Render for CNN: Viewpoint Estimation in Images Using CNNs Trained with Rendered 3D Model Views},
date = {2015-05-21},
eprint = {1505.05641v1},
eprintclass = {cs.CV},
eprinttype = {arXiv},
doi = {10.48550/ARXIV.1505.05641},
}
@misc{NDDS,
author = {Thang To and Jonathan Tremblay and Duncan McKay and Yukie Yamaguchi and Kirby Leung and Adrian Balanon and Jia Cheng and William Hodge and Stan Birchfield},
note= {\url{https://github.com/NVIDIA/Dataset_Synthesizer}},
title = {{NDDS}: {NVIDIA} Deep Learning Dataset Synthesizer},
year = {2018}
}
@Article{Hodan2019,
author = {Hodan, Tomas and Vineet, Vibhav and Gal, Ran and Shalev, Emanuel and Hanzelka, Jon and Connell, Treb and Urbina, Pedro and Sinha, Sudipta N and Guenter, Brian},
title = {Photorealistic Image Synthesis for Object Instance Detection},
journal = {arXiv preprint arXiv:1902.03334},
doi = {10.48550/ARXIV.1902.03334},
year = {2019},
}
@Article{Stillleben,
author = {Max Schwarz and Sven Behnke},
title = {Stillleben: Realistic Scene Synthesis for Deep Learning in Robotics},
date = {2020-05-12},
eprint = {2005.05659v1},
eprintclass = {cs.CV},
eprinttype = {arXiv},
doi = {10.48550/ARXIV.2005.05659},
}
@inproceedings{Front3D,
author = {Huan Fu and Bowen Cai and Lin Gao and Ling-Xiao Zhang and Jiaming Wang and Cao Li and Qixun Zeng and Chengyue Sun and Rongfei Jia and Binqiang Zhao and others},
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
pages = {10933--10942},
title = {3D-FRONT: 3D furnished rooms with layouts and semantics},
doi = {10.48550/ARXIV.2011.09127},
year = {2021}
}
@techreport{shapenet,
author = {Angel X. Chang and Thomas Funkhouser and Leonidas Guibas and Pat Hanrahan and Qixing Huang and Zimo Li and Silvio Savarese and Manolis Savva and Shuran Song and Hao Su and Jianxiong Xiao and Li Yi and Fisher Yu},
institution = {Stanford University --- Princeton University --- Toyota Technological Institute at Chicago},
number = {arXiv:1512.03012 [cs.GR]},
title = {ShapeNet: An Information-Rich 3D Model Repository},
doi = {10.48550/ARXIV.1512.03012},
year = {2015}
}
@misc{nvisii,
title={NViSII: A Scriptable Tool for Photorealistic Image Generation},
author={Nathan Morrical and Jonathan Tremblay and Yunzhi Lin and Stephen Tyree and Stan Birchfield and Valerio Pascucci and Ingo Wald},
year={2021},
eprint={2105.13962},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{denninger2019blenderproc,
title={BlenderProc},
author={Denninger, Maximilian and Sundermeyer, Martin and Winkelbauer, Dominik and Zidan, Youssef and Olefir, Dmitry and Elbadrawy, Mohamad and Lodhi, Ahsan and Katam, Harinandan},
journal={arXiv preprint arXiv:1911.01911},
year={2019},
doi = {10.48550/ARXIV.1911.01911},
}
@inproceedings{denninger2020blenderproc,
title={{BlenderProc}: Reducing the reality gap with photorealistic rendering},
author={Denninger, Maximilian and Sundermeyer, Martin and Winkelbauer, Dominik and Olefir, Dmitry and Hodan, Tomas and Zidan, Youssef and Elbadrawy, Mohamad and Knauer, Markus and Katam, Harinandan and Lodhi, Ahsan},
booktitle={International Conference on Robotics: Science and Systems, RSS 2020},
year={2020}
}
@inproceedings{kubric,
title = {Kubric: a scalable dataset generator},
author = {Klaus Greff and Francois Belletti and Lucas Beyer and Carl Doersch and
Yilun Du and Daniel Duckworth and David J Fleet and Dan Gnanapragasam and
Florian Golemo and Charles Herrmann and Thomas Kipf and Abhijit Kundu and
Dmitry Lagun and Issam Laradji and Hsueh-Ti (Derek) Liu and Henning Meyer and
Yishu Miao and Derek Nowrouzezahrai and Cengiz Oztireli and Etienne Pot and
Noha Radwan and Daniel Rebain and Sara Sabour and Mehdi S. M. Sajjadi and Matan Sela and
Vincent Sitzmann and Austin Stone and Deqing Sun and Suhani Vora and Ziyu Wang and
Tianhao Wu and Kwang Moo Yi and Fangcheng Zhong and Andrea Tagliasacchi},
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
doi = {10.48550/ARXIV.2203.03570},
year = {2022},
}