Uses 24 GB VRAM - what optimizations can be made? #104
Comments
I'm not a programmer, but I've experimented a bit with xformers and sequential CPU offload. Without xformers the demo didn't work at all on my card (RTX 4070). With xformers it works, but the pipeline allocates about 6 GB of shared memory, and generating an image at 1024 resolution takes about 60 sec. Sequential CPU offload gives me very small VRAM usage, but generating an image takes about 70 sec. https://huggingface.co/blog/simple_sdxl_optimizations
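For reference, a minimal sketch of the two options discussed above, using the standard diffusers switches (the InstantID demo wraps a diffusers SDXL pipeline, so these calls should carry over; the checkpoint name is a placeholder, and fp16 loading is shown since it also bears on the 16-bit question raised later in the thread):

```python
import torch
from diffusers import StableDiffusionXLPipeline

# fp16 weights halve memory versus fp32
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # placeholder; InstantID uses its own pipeline/checkpoint
    torch_dtype=torch.float16,
)

# Option 1: xformers memory-efficient attention; the model stays fully on the GPU
pipe.enable_xformers_memory_efficient_attention()
pipe.to("cuda")

# Option 2: sequential CPU offload; weights stay in system RAM and
# submodules are streamed to the GPU one at a time (lowest VRAM, slowest).
# Do not combine this with pipe.to("cuda").
# pipe.enable_sequential_cpu_offload()
```

Option 1 keeps everything resident on the GPU; option 2 trades speed for the very small VRAM footprint mentioned later in the thread.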
Sequential CPU offload is giving me this error; how did you fix it?
Without looking at your code, I can't tell you how to fix it. Just a note: I use Linux running on WSL 2, maybe that matters. I'll try to run it on Windows. I just checked; VRAM usage while generating an image is only about 1 GB with sequential CPU offload.
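For anyone who wants to reproduce such measurements, a small sketch using PyTorch's built-in peak-memory counter (`pipe` is assumed to be a pipeline configured as in the earlier sketch):

```python
import torch

# `pipe` is assumed to be set up as in the sketch above
torch.cuda.reset_peak_memory_stats()
image = pipe("a portrait photo").images[0]

# Peak CUDA memory held by tensors during the run
peak_gb = torch.cuda.max_memory_allocated() / 1024**3
print(f"peak VRAM: {peak_gb:.2f} GB")
```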
Yes, sequential CPU offload is amazing, but I am still getting the error.
Do you think that this can be added to https://github.com/ZHO-ZHO-ZHO/ComfyUI-InstantID/ so ComfyUI would run on a 4070 with InstantID? |
Sequential CPU offload moves the model out of VRAM, so for every generated image the weights have to be copied back to the GPU piece by piece. I don't know if this is a good solution. At the moment, the Gradio demo with xformers works best IMHO. I experimented with ZHO-ZHO-ZHO/ComfyUI-InstantID and xformers; it works on a 4070, but every second generation returns an OOM error.
It is not the same as running it on the CPU; the compute still happens on the GPU, the weights are just staged in from system RAM instead of being kept resident in VRAM.
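Note that diffusers offers two offload granularities, which matters for the reload-cost concern above; a sketch, again assuming a diffusers `pipe` as in the earlier examples:

```python
# Coarse offload: whole sub-models (text encoders, UNet, VAE) hop between
# system RAM and VRAM as each stage runs; nothing is reloaded from disk
# between images, and it is much faster than sequential offload.
pipe.enable_model_cpu_offload()

# Fine offload: individual submodules move one at a time; lowest VRAM
# (the ~1 GB reported above) but also the slowest.
# pipe.enable_sequential_cpu_offload()
```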
Is there some code we could add or change in their existing files that would allow us to use these changes in Comfy? It might not be the best solution, but I've got a 3060 12GB that can't run it, so anything is better than the current situation. |
This PR solves it for me: ZHO-ZHO-ZHO/ComfyUI-InstantID#87. Also, we should soon have the official support as well: cubiq/ComfyUI_IPAdapter_plus#242 (comment)
The official support has me very excited to try. As for the modifications you suggest in the other thread, exactly where were you making these changes and in which files? I'm not familiar at all with this code, but I'm not afraid to mess around in it given good directions. :) |
The modifications can be seen here per files per lines: https://github.com/ZHO-ZHO-ZHO/ComfyUI-InstantID/pull/87/files |
Well, we also fixed the CPU offload in a standalone Gradio app: a 1-click install that downloads the models, with on-the-fly model changing. VAE slicing and xformers are enabled too. Shared here so far: https://www.patreon.com/posts/1-click-very-gui-97769887
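The switches named here are one-liners in diffusers; a sketch, with `pipe` as in the earlier examples (`enable_vae_tiling` is a related option, not something the post claims to use):

```python
# Decode the latent batch one image at a time instead of all at once
pipe.enable_vae_slicing()

# Related option for very large resolutions: decode in tiles
# pipe.enable_vae_tiling()

# Memory-efficient attention, as in the sketches above
pipe.enable_xformers_memory_efficient_attention()
```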
Aww, behind a paywall :-( It looks neat. Well, at least with your GUI, are you able to use the normal XL safetensors from CivitAI? Or this odd format that Aitrepreneur has got us using: https://huggingface.co/stablediffusionapi
My GUI can use any CivitAI model; I made it support that. Put the model into the models folder and restart the app. Sorry for the late reply.
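The app itself is paywalled, but diffusers can load single-file CivitAI checkpoints directly, so the mechanism presumably looks something like this sketch (the path is hypothetical):

```python
import torch
from diffusers import StableDiffusionXLPipeline

# Any single-file SDXL .safetensors checkpoint downloaded from CivitAI
pipe = StableDiffusionXLPipeline.from_single_file(
    "models/my_civitai_checkpoint.safetensors",  # hypothetical path
    torch_dtype=torch.float16,
)
pipe.to("cuda")
```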
Try this: https://github.com/cubiq/ComfyUI_InstantID - IMHO the best implementation so far. It runs on a 12 GB VRAM card without any problem and does not use all the VRAM.
I managed to run the code in ~30 seconds on an RTX 3060 (12 GB VRAM). Approach: I used WSL 2.
I have installed as described in the Gradio demo.
It works, but it literally fills the entire 24 GB of VRAM.
What VRAM optimizations can be made, such as loading the models in perhaps 16-bit, or 8-bit?
I am making a 1-click auto installer and an advanced Gradio app.
Below is the pipeline: