mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-03-06 21:53:50 +01:00
Remove obsolete DeepSpeed inference code (2023 relic)
This commit is contained in:
parent
942ff8fcb4
commit
387cf9d8df
|
|
@ -245,7 +245,7 @@ usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [-
|
|||
[--extra-flags EXTRA_FLAGS] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code]
|
||||
[--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE]
|
||||
[--enable-tp] [--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner]
|
||||
[--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB]
|
||||
[--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB]
|
||||
[--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH]
|
||||
[--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT]
|
||||
[--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
|
||||
|
|
@ -334,11 +334,6 @@ ExLlamaV2:
|
|||
TensorRT-LLM:
|
||||
--cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet.
|
||||
|
||||
DeepSpeed:
|
||||
--deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.
|
||||
--nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading.
|
||||
--local_rank LOCAL_RANK DeepSpeed: Optional argument for distributed setups.
|
||||
|
||||
RoPE:
|
||||
--alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.
|
||||
--rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).
|
||||
|
|
|
|||
|
|
@ -13,29 +13,6 @@ Source: https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/1126
|
|||
|
||||
This file will be automatically detected the next time you start the web UI.
|
||||
|
||||
## DeepSpeed
|
||||
|
||||
`DeepSpeed ZeRO-3` is an alternative offloading strategy for full-precision (16-bit) transformers models.
|
||||
|
||||
With this, I have been able to load a 6B-parameter model (GPT-J 6B) with less than 6GB of VRAM. Text generation speed is very decent and much better than what would be achieved with `--auto-devices --gpu-memory 6`.
|
||||
|
||||
As far as I know, DeepSpeed is only available for Linux at the moment.
|
||||
|
||||
### How to use it
|
||||
|
||||
1. Install DeepSpeed:
|
||||
|
||||
```
|
||||
conda install -c conda-forge mpi4py mpich
|
||||
pip install -U deepspeed
|
||||
```
|
||||
|
||||
2. Start the web UI replacing `python` with `deepspeed --num_gpus=1` and adding the `--deepspeed` flag. Example:
|
||||
|
||||
```
|
||||
deepspeed --num_gpus=1 server.py --deepspeed --chat --model gpt-j-6B
|
||||
```
|
||||
|
||||
## Miscellaneous info
|
||||
|
||||
### You can train LoRAs in CPU mode
|
||||
|
|
|
|||
|
|
@ -1,74 +0,0 @@
|
|||
def generate_ds_config(ds_bf16, train_batch_size, nvme_offload_dir):
    """Build the DeepSpeed ZeRO stage-3 configuration dictionary.

    Reference: https://huggingface.co/docs/transformers/main_classes/deepspeed

    Args:
        ds_bf16: Stored as-is under ``bf16.enabled``; its negation is stored
            under ``fp16.enabled``, so exactly one precision mode is enabled.
        train_batch_size: Stored under ``"train_batch_size"``.
        nvme_offload_dir: Directory used for NVMe parameter offloading. When
            falsy (``None`` or an empty string), parameters are offloaded to
            CPU memory instead and no ``"aio"`` section is emitted.

    Returns:
        A freshly built ``dict`` (no state is shared between calls).
    """
    # Settings shared by the CPU-offload and NVMe-offload variants. Keeping
    # them in one place prevents the two variants from drifting apart.
    zero_optimization = {
        "stage": 3,
        "offload_param": {
            "device": "cpu",
            "pin_memory": True,
        },
        "overlap_comm": True,
        "contiguous_gradients": True,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": "auto",
        "stage3_max_reuse_distance": "auto",
    }

    ds_config = {
        "fp16": {
            "enabled": not ds_bf16,
        },
        "bf16": {
            "enabled": ds_bf16,
        },
        "zero_optimization": zero_optimization,
        "steps_per_print": 2000,
        "train_batch_size": train_batch_size,
        "train_micro_batch_size_per_gpu": 1,
        "wall_clock_breakdown": False,
    }

    if nvme_offload_dir:
        # NVMe offloading replaces the CPU offload target and additionally
        # needs a sub-group size and an async I/O ("aio") section.
        zero_optimization["offload_param"] = {
            "device": "nvme",
            "nvme_path": nvme_offload_dir,
            "pin_memory": True,
            "buffer_count": 5,
            "buffer_size": 1e9,
            "max_in_cpu": 1e9,
        }
        zero_optimization["sub_group_size"] = 1e8
        ds_config["aio"] = {
            "block_size": 262144,
            "queue_depth": 32,
            "thread_count": 1,
            "single_submit": False,
            "overlap_events": True,
        }

    return ds_config
|
||||
|
|
@ -144,12 +144,6 @@ group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N',
|
|||
group = parser.add_argument_group('TensorRT-LLM')
|
||||
group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
|
||||
|
||||
# DeepSpeed
|
||||
group = parser.add_argument_group('DeepSpeed')
|
||||
group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
|
||||
group.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
|
||||
group.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')
|
||||
|
||||
# RoPE
|
||||
group = parser.add_argument_group('RoPE')
|
||||
group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')
|
||||
|
|
|
|||
|
|
@ -372,8 +372,6 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None,
|
|||
generate_params['negative_prompt_ids'] = encode(state['negative_prompt'])
|
||||
|
||||
generate_params.update({'use_cache': not shared.args.no_cache})
|
||||
if shared.args.deepspeed:
|
||||
generate_params.update({'synced_gpus': True})
|
||||
|
||||
# Encode the input
|
||||
input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))
|
||||
|
|
|
|||
|
|
@ -12,9 +12,6 @@ def get_device():
|
|||
return shared.model.device
|
||||
elif torch.cuda.is_available():
|
||||
return torch.device('cuda')
|
||||
elif shared.args.deepspeed:
|
||||
import deepspeed
|
||||
return deepspeed.get_accelerator().current_device_name()
|
||||
elif torch.backends.mps.is_available():
|
||||
return torch.device('mps')
|
||||
elif is_torch_xpu_available():
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
import os
|
||||
import pprint
|
||||
from pathlib import Path
|
||||
|
||||
|
|
@ -6,11 +5,7 @@ import torch
|
|||
import torch.nn.functional as F
|
||||
import transformers
|
||||
from accelerate import infer_auto_device_map, init_empty_weights
|
||||
from accelerate.utils import (
|
||||
is_ccl_available,
|
||||
is_npu_available,
|
||||
is_xpu_available
|
||||
)
|
||||
from accelerate.utils import is_xpu_available
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModel,
|
||||
|
|
@ -28,31 +23,6 @@ from modules.torch_utils import get_device
|
|||
|
||||
transformers.logging.set_verbosity_error()
|
||||
|
||||
local_rank = None
|
||||
if shared.args.deepspeed:
|
||||
import deepspeed
|
||||
from transformers.integrations.deepspeed import (
|
||||
HfDeepSpeedConfig,
|
||||
is_deepspeed_zero3_enabled
|
||||
)
|
||||
|
||||
from modules.deepspeed_parameters import generate_ds_config
|
||||
|
||||
# Distributed setup
|
||||
local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
|
||||
world_size = int(os.getenv("WORLD_SIZE", "1"))
|
||||
if is_xpu_available() and is_ccl_available():
|
||||
torch.xpu.set_device(local_rank)
|
||||
deepspeed.init_distributed(backend="ccl")
|
||||
elif is_npu_available():
|
||||
torch.npu.set_device(local_rank)
|
||||
deepspeed.init_distributed(dist_backend="hccl")
|
||||
else:
|
||||
torch.cuda.set_device(local_rank)
|
||||
deepspeed.init_distributed()
|
||||
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
|
||||
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration
|
||||
|
||||
|
||||
class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
|
||||
def __init__(self):
|
||||
|
|
@ -163,7 +133,6 @@ def load_model_HF(model_name):
|
|||
shared.args.load_in_8bit,
|
||||
shared.args.load_in_4bit,
|
||||
shared.args.disk,
|
||||
shared.args.deepspeed,
|
||||
shared.args.cpu_memory is not None,
|
||||
shared.args.compress_pos_emb > 1,
|
||||
shared.args.alpha_value > 1,
|
||||
|
|
@ -183,25 +152,6 @@ def load_model_HF(model_name):
|
|||
if device:
|
||||
model = model.to(device)
|
||||
|
||||
# DeepSpeed ZeRO-3
|
||||
elif shared.args.deepspeed:
|
||||
model = LoaderClass.from_pretrained(
|
||||
path_to_model,
|
||||
torch_dtype=params['torch_dtype'],
|
||||
trust_remote_code=params.get('trust_remote_code')
|
||||
)
|
||||
|
||||
model = deepspeed.initialize(
|
||||
model=model,
|
||||
config_params=ds_config,
|
||||
model_parameters=None,
|
||||
optimizer=None,
|
||||
lr_scheduler=None
|
||||
)[0]
|
||||
|
||||
model.module.eval() # Inference
|
||||
logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
|
||||
|
||||
# Load with quantization and/or offloading
|
||||
else:
|
||||
if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):
|
||||
|
|
|
|||
Loading…
Reference in a new issue