Traceback (most recent call last):
File "/root/miniconda3/lib/python3.10/site-packages/xinference/api/restful_api.py", line 1566, in create_chat_completion
data = await model.chat(
File "/root/miniconda3/lib/python3.10/site-packages/xoscar/backends/context.py", line 227, in send
return self._process_result_message(result)
File "/root/miniconda3/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/root/miniconda3/lib/python3.10/site-packages/xoscar/backends/pool.py", line 659, in send
result = await self._run_coro(message.message_id, coro)
File "/root/miniconda3/lib/python3.10/site-packages/xoscar/backends/pool.py", line 370, in _run_coro
return await coro
File "/root/miniconda3/lib/python3.10/site-packages/xoscar/api.py", line 384, in on_receive
return await super().on_receive(message) # type: ignore
File "xoscar/core.pyx", line 558, in on_receive
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.on_receive
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.on_receive
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.on_receive
result = await result
File "/root/miniconda3/lib/python3.10/site-packages/xinference/core/utils.py", line 45, in wrapped
ret = await func(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/xinference/core/model.py", line 87, in wrapped_func
ret = await fn(self, *args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/xoscar/api.py", line 462, in _wrapper
r = await func(self, *args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/xinference/core/model.py", line 488, in chat
response = await self._call_wrapper(
File "/root/miniconda3/lib/python3.10/site-packages/xinference/core/model.py", line 111, in _async_wrapper
return await fn(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/xinference/core/model.py", line 371, in _call_wrapper
ret = await asyncio.to_thread(fn, *args, **kwargs)
File "/root/miniconda3/lib/python3.10/asyncio/threads.py", line 25, in to_thread
return await loop.run_in_executor(None, func_call)
File "/root/miniconda3/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/xinference/model/llm/pytorch/intern_vl.py", line 330, in chat
response, history = self._model.chat(
File "/root/.cache/huggingface/modules/transformers_modules/internvl-chat-pytorch-26b-Int8/modeling_internvl_chat.py", line 304, in chat
generation_output = self.generate(
File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/internvl-chat-pytorch-26b-Int8/modeling_internvl_chat.py", line 339, in generate
vit_embeds = self.extract_feature(pixel_values)
File "/root/.cache/huggingface/modules/transformers_modules/internvl-chat-pytorch-26b-Int8/modeling_internvl_chat.py", line 211, in extract_feature
vit_embeds = self.vision_model(
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/internvl-chat-pytorch-26b-Int8/modeling_intern_vit.py", line 411, in forward
encoder_outputs = self.encoder(
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/internvl-chat-pytorch-26b-Int8/modeling_intern_vit.py", line 347, in forward
layer_outputs = encoder_layer(
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/internvl-chat-pytorch-26b-Int8/modeling_intern_vit.py", line 289, in forward
hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/internvl-chat-pytorch-26b-Int8/modeling_intern_vit.py", line 246, in forward
x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
File "/root/.cache/huggingface/modules/transformers_modules/internvl-chat-pytorch-26b-Int8/modeling_intern_vit.py", line 238, in _flash_attn
context, _ = self.inner_attn(
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/internvl-chat-pytorch-26b-Int8/modeling_intern_vit.py", line 77, in forward
output = flash_attn_unpadded_qkvpacked_func(
File "/root/miniconda3/lib/python3.10/site-packages/flash_attn/flash_attn_interface.py", line 893, in flash_attn_varlen_qkvpacked_func
return FlashAttnVarlenQKVPackedFunc.apply(
File "/root/miniconda3/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/root/miniconda3/lib/python3.10/site-packages/flash_attn/flash_attn_interface.py", line 290, in forward
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward(
File "/root/miniconda3/lib/python3.10/site-packages/flash_attn/flash_attn_interface.py", line 86, in _flash_attn_varlen_forward
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.varlen_fwd(
RuntimeError: [address=0.0.0.0:33747, pid=4651] FlashAttention only supports Ampere GPUs or newer.
Traceback (most recent call last):
File "/root/miniconda3/lib/python3.10/site-packages/gradio/queueing.py", line 527, in process_events
response = await route_utils.call_process_api(
File "/root/miniconda3/lib/python3.10/site-packages/gradio/route_utils.py", line 261, in call_process_api
output = await app.get_blocks().process_api(
File "/root/miniconda3/lib/python3.10/site-packages/gradio/blocks.py", line 1786, in process_api
result = await self.call_function(
File "/root/miniconda3/lib/python3.10/site-packages/gradio/blocks.py", line 1350, in call_function
prediction = await utils.async_iteration(iterator)
File "/root/miniconda3/lib/python3.10/site-packages/gradio/utils.py", line 583, in async_iteration
return await iterator.anext()
File "/root/miniconda3/lib/python3.10/site-packages/gradio/utils.py", line 576, in anext
return await anyio.to_thread.run_sync(
File "/root/miniconda3/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
return await get_async_backend().run_sync_in_worker_thread(
File "/root/miniconda3/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2177, in run_sync_in_worker_thread
return await future
File "/root/miniconda3/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 859, in run
result = context.run(func, *args)
File "/root/miniconda3/lib/python3.10/site-packages/gradio/utils.py", line 559, in run_sync_iterator_async
return next(iterator)
File "/root/miniconda3/lib/python3.10/site-packages/gradio/utils.py", line 742, in gen_wrapper
response = next(iterator)
File "/root/miniconda3/lib/python3.10/site-packages/xinference/core/chat_interface.py", line 229, in predict
response = model.chat(
File "/root/miniconda3/lib/python3.10/site-packages/xinference/client/restful/restful_client.py", line 425, in chat
raise RuntimeError(
RuntimeError: Failed to generate chat completion, detail: [address=0.0.0.0:33747, pid=4651] FlashAttention only supports Ampere GPUs or newer.
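For reference, the root cause reported by flash-attn is the GPU generation: FlashAttention's CUDA kernels require compute capability 8.0 (Ampere) or newer. A minimal sketch using only public PyTorch APIs to check the host GPU:

```python
import torch

# FlashAttention's CUDA kernels require compute capability >= 8.0
# (Ampere, e.g. A100 / RTX 30xx) or newer.
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    name = torch.cuda.get_device_name(0)
    print(f"{name}: compute capability {major}.{minor}")
    if major < 8:
        print("This GPU predates Ampere, so flash-attn kernels will refuse to run.")
else:
    print("No CUDA device visible.")
```

On a pre-Ampere card the vision tower has to take the non-flash path: the traceback shows `self.use_flash_attn` in modeling_intern_vit.py selecting between `_naive_attn` and `_flash_attn`, so either disabling flash attention in the model configuration (if the checkpoint exposes such a switch) or running on an Ampere-or-newer GPU avoids this error.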
On another machine I hit a different error during installation:
Successfully built chatglm-cpp antlr4-python3-runtime controlnet-aux FlagEmbedding transformers-stream-generator encodec gast quantile-python cdifflib ffmpy oss2 wget aliyun-python-sdk-core crcmod
Failed to build llama-cpp-python
ERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (llama-cpp-python)
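The llama-cpp-python failure is a source-build failure; building its wheel typically needs CMake and a working C/C++ toolchain on the machine. A hedged sanity check for those prerequisites (an assumption about the likely cause, not confirmed by the summary above):

```python
import shutil

# llama-cpp-python compiles a native extension with CMake and a C/C++ compiler;
# missing build tools are a common reason the wheel build fails.
for tool in ("cmake", "gcc", "g++", "make"):
    path = shutil.which(tool)
    print(f"{tool}: {path or 'NOT FOUND'}")
```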