Remove obsolete DeepSpeed inference code (2023 relic)

This commit is contained in:
oobabooga 2026-03-04 17:20:34 -08:00
parent 942ff8fcb4
commit 387cf9d8df
7 changed files with 2 additions and 165 deletions

View file

@ -245,7 +245,7 @@ usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [-
[--extra-flags EXTRA_FLAGS] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code]
[--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE]
[--enable-tp] [--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner]
[--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB]
[--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB]
[--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH]
[--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT]
[--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
@ -334,11 +334,6 @@ ExLlamaV2:
TensorRT-LLM:
--cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet.
DeepSpeed:
--deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.
--nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading.
--local_rank LOCAL_RANK DeepSpeed: Optional argument for distributed setups.
RoPE:
--alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.
--rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).

View file

@ -13,29 +13,6 @@ Source: https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/1126
This file will be automatically detected the next time you start the web UI.
## DeepSpeed
`DeepSpeed ZeRO-3` is an alternative offloading strategy for full-precision (16-bit) transformers models.
With this, I have been able to load a 6b model (GPT-J 6B) with less than 6GB of VRAM. The speed of text generation is very decent and much better than what would be accomplished with `--auto-devices --gpu-memory 6`.
As far as I know, DeepSpeed is only available for Linux at the moment.
### How to use it
1. Install DeepSpeed:
```
conda install -c conda-forge mpi4py mpich
pip install -U deepspeed
```
2. Start the web UI replacing `python` with `deepspeed --num_gpus=1` and adding the `--deepspeed` flag. Example:
```
deepspeed --num_gpus=1 server.py --deepspeed --chat --model gpt-j-6B
```
## Miscellaneous info
### You can train LoRAs in CPU mode

View file

@ -1,74 +0,0 @@
def generate_ds_config(ds_bf16, train_batch_size, nvme_offload_dir):
'''
DeepSpeed configuration
https://huggingface.co/docs/transformers/main_classes/deepspeed
'''
if nvme_offload_dir:
ds_config = {
"fp16": {
"enabled": not ds_bf16,
},
"bf16": {
"enabled": ds_bf16,
},
"zero_optimization": {
"stage": 3,
"offload_param": {
"device": "nvme",
"nvme_path": nvme_offload_dir,
"pin_memory": True,
"buffer_count": 5,
"buffer_size": 1e9,
"max_in_cpu": 1e9
},
"overlap_comm": True,
"reduce_bucket_size": "auto",
"contiguous_gradients": True,
"sub_group_size": 1e8,
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": "auto",
"stage3_max_reuse_distance": "auto",
},
"aio": {
"block_size": 262144,
"queue_depth": 32,
"thread_count": 1,
"single_submit": False,
"overlap_events": True
},
"steps_per_print": 2000,
"train_batch_size": train_batch_size,
"train_micro_batch_size_per_gpu": 1,
"wall_clock_breakdown": False
}
else:
ds_config = {
"fp16": {
"enabled": not ds_bf16,
},
"bf16": {
"enabled": ds_bf16,
},
"zero_optimization": {
"stage": 3,
"offload_param": {
"device": "cpu",
"pin_memory": True
},
"overlap_comm": True,
"contiguous_gradients": True,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": "auto",
"stage3_max_reuse_distance": "auto",
},
"steps_per_print": 2000,
"train_batch_size": train_batch_size,
"train_micro_batch_size_per_gpu": 1,
"wall_clock_breakdown": False
}
return ds_config

View file

@ -144,12 +144,6 @@ group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N',
group = parser.add_argument_group('TensorRT-LLM')
group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
# DeepSpeed
group = parser.add_argument_group('DeepSpeed')
group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
group.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
group.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')
# RoPE
group = parser.add_argument_group('RoPE')
group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')

View file

@ -372,8 +372,6 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None,
generate_params['negative_prompt_ids'] = encode(state['negative_prompt'])
generate_params.update({'use_cache': not shared.args.no_cache})
if shared.args.deepspeed:
generate_params.update({'synced_gpus': True})
# Encode the input
input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))

View file

@ -12,9 +12,6 @@ def get_device():
return shared.model.device
elif torch.cuda.is_available():
return torch.device('cuda')
elif shared.args.deepspeed:
import deepspeed
return deepspeed.get_accelerator().current_device_name()
elif torch.backends.mps.is_available():
return torch.device('mps')
elif is_torch_xpu_available():

View file

@ -1,4 +1,3 @@
import os
import pprint
from pathlib import Path
@ -6,11 +5,7 @@ import torch
import torch.nn.functional as F
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
from accelerate.utils import (
is_ccl_available,
is_npu_available,
is_xpu_available
)
from accelerate.utils import is_xpu_available
from transformers import (
AutoConfig,
AutoModel,
@ -28,31 +23,6 @@ from modules.torch_utils import get_device
transformers.logging.set_verbosity_error()
local_rank = None
if shared.args.deepspeed:
import deepspeed
from transformers.integrations.deepspeed import (
HfDeepSpeedConfig,
is_deepspeed_zero3_enabled
)
from modules.deepspeed_parameters import generate_ds_config
# Distributed setup
local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
if is_xpu_available() and is_ccl_available():
torch.xpu.set_device(local_rank)
deepspeed.init_distributed(backend="ccl")
elif is_npu_available():
torch.npu.set_device(local_rank)
deepspeed.init_distributed(dist_backend="hccl")
else:
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration
class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
def __init__(self):
@ -163,7 +133,6 @@ def load_model_HF(model_name):
shared.args.load_in_8bit,
shared.args.load_in_4bit,
shared.args.disk,
shared.args.deepspeed,
shared.args.cpu_memory is not None,
shared.args.compress_pos_emb > 1,
shared.args.alpha_value > 1,
@ -183,25 +152,6 @@ def load_model_HF(model_name):
if device:
model = model.to(device)
# DeepSpeed ZeRO-3
elif shared.args.deepspeed:
model = LoaderClass.from_pretrained(
path_to_model,
torch_dtype=params['torch_dtype'],
trust_remote_code=params.get('trust_remote_code')
)
model = deepspeed.initialize(
model=model,
config_params=ds_config,
model_parameters=None,
optimizer=None,
lr_scheduler=None
)[0]
model.module.eval() # Inference
logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
# Load with quantization and/or offloading
else:
if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):