# Source code for mii.api

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
from typing import Optional, Any, Dict, Tuple

import mii
from mii.backend import MIIClient  # , MIIServer
from mii.batching import MIIPipeline, MIIAsyncPipeline
from mii.config import get_mii_config, ModelConfig, MIIConfig
from mii.constants import DeploymentType
from mii.errors import UnknownArgument
from mii.modeling.models import load_model
from mii.score import create_score_file
from mii.modeling.tokenizers import load_tokenizer
from mii.utils import import_score_file


def _parse_kwargs_to_model_config(
    model_name_or_path: str = "",
    model_config: Optional[Dict] = None,
    **kwargs,
) -> Tuple[ModelConfig,
           Dict[str,
                Any]]:
    """
    Builds a :class:`ModelConfig <mii.config.ModelConfig>` from a model name,
    an optional configuration dictionary, and keyword arguments.

    :param model_name_or_path: Model name or path. When non-empty it is stored
        under the `model_name_or_path` key; if that key is already present in
        `model_config`, both values must be equal.
    :param model_config: Dictionary of model configuration fields. The
        dictionary passed by the caller is not modified.
    :param kwargs: Keyword arguments. Those whose name is a field of
        :class:`ModelConfig <mii.config.ModelConfig>` are merged into the model
        configuration (and must equal any value already present for that key);
        all others are returned untouched.

    :raises AssertionError: Raised when `model_config` is not a dict, or when a
        value is supplied both in `model_config` and as an argument and the two
        values differ.

    :return: Tuple of the created `ModelConfig` object and a dictionary with the
        keyword arguments that are not `ModelConfig` fields.
    """
    if model_config is None:
        model_config = {}

    assert isinstance(model_config, dict), "model_config must be a dict"

    # Work on a shallow copy: writing into the caller's dict would leak values
    # from this call into later calls that reuse the same dict and would then
    # trip the "must match" checks below.
    model_config = dict(model_config)

    # If model_name_or_path is set in model config, make sure it matches the kwarg
    if model_name_or_path:
        if "model_name_or_path" in model_config:
            assert (
                model_config.get("model_name_or_path") == model_name_or_path
            ), "model_name_or_path in model_config must match model_name_or_path"
        model_config["model_name_or_path"] = model_name_or_path

    # Fill model_config dict with relevant kwargs, store remaining kwargs in a new dict
    remaining_kwargs = {}
    for key, val in kwargs.items():
        if key in ModelConfig.model_fields:
            if key in model_config:
                assert (
                    model_config.get(key) == val
                ), f"{key} in model_config must match {key}"
            model_config[key] = val
        else:
            remaining_kwargs[key] = val

    # Create the ModelConfig object and return it with remaining kwargs
    return ModelConfig(**model_config), remaining_kwargs


def _parse_kwargs_to_mii_config(
    model_name_or_path: str = "",
    model_config: Optional[Dict] = None,
    mii_config: Optional[Dict] = None,
    **kwargs,
) -> MIIConfig:
    """
    Builds a :class:`MIIConfig <mii.config.MIIConfig>` from a model name,
    optional configuration dictionaries, and keyword arguments.

    :param model_name_or_path: Model name or path, forwarded to
        `_parse_kwargs_to_model_config`.
    :param model_config: Dictionary of model configuration fields. When `None`,
        the `model_config` entry of `mii_config` is used (or an empty dict if
        there is none).
    :param mii_config: Dictionary of MII configuration fields. Keys are not
        added to or replaced in the dictionary passed by the caller.
    :param kwargs: Keyword arguments. Model configuration fields are consumed
        by `_parse_kwargs_to_model_config`; the rest must be fields of
        :class:`MIIConfig <mii.config.MIIConfig>`.

    :raises AssertionError: Raised when `mii_config` is not a dict, or when a
        value is supplied both in `mii_config` and as a keyword argument and
        the two values differ.
    :raises UnknownArgument: Raised when a keyword argument is neither a model
        configuration field nor a `MIIConfig` field.

    :return: The created `MIIConfig` object.
    """
    if mii_config is None:
        mii_config = {}

    # Validate before the first `.get()` so a wrong type produces this message
    # instead of an AttributeError.
    assert isinstance(mii_config, dict), "mii_config must be a dict"

    # Work on a shallow copy: the "model_config" entry is replaced by a
    # ModelConfig object below, which would make the caller's dict unusable for
    # a second call if it were modified in place.
    mii_config = dict(mii_config)

    if model_config is None:
        model_config = mii_config.get("model_config", {})

    # Parse all model_config kwargs
    model_config, remaining_kwargs = _parse_kwargs_to_model_config(
        model_name_or_path=model_name_or_path, model_config=model_config, **kwargs
    )

    mii_config["model_config"] = model_config

    # Fill mii_config dict with relevant kwargs, raise error on unknown kwargs
    for key, val in remaining_kwargs.items():
        if key not in MIIConfig.model_fields:
            raise UnknownArgument(f"Keyword argument {key} not recognized")
        if key in mii_config:
            assert (
                mii_config.get(key) == val
            ), f"{key} in mii_config must match {key}"
        mii_config[key] = val

    # Return the MIIConfig object
    return MIIConfig(**mii_config)


def client(model_or_deployment_name: str) -> MIIClient:
    """
    Creates a client object for interfacing with an existing persistent model deployment.

    :param model_or_deployment_name: Name of the HuggingFace model used for the
        persistent model deployment. If `deployment_name` was provided as input
        to :func:`mii.serve`, users should provide the `deployment_name` string
        instead.

    :return: Client object that can be used to interface with the deployed
        persistent model server, which uses ragged batching and dynamic splitfuse.
    """
    # Look up the configuration of the existing deployment by name.
    mii_config = get_mii_config(model_or_deployment_name)

    return MIIClient(mii_config)


def serve(
    model_name_or_path: str = "",
    model_config: Optional[Dict] = None,
    mii_config: Optional[Dict] = None,
    **kwargs,
) -> Optional[MIIClient]:
    """
    Creates a persistent MII model deployment from a locally stored model path
    or HuggingFace model name.

    :param model_name_or_path: HuggingFace model name or path to locally stored
        model. This must be provided here or in the `model_config` dictionary.
    :param model_config: Dictionary containing model configuration fields. See
        :class:`ModelConfig <mii.config.ModelConfig>` for a full list of options.
        Users can pass these options in a dictionary here or as keyword arguments to
        the function.
    :param mii_config: Dictionary containing DeepSpeed-MII configuration fields.
        See :class:`MIIConfig <mii.config.MIIConfig>` for a full list of options.
        Users can pass these options in a dictionary here or as keyword arguments to
        the function.

    :raises UnknownArgument: Raised when provided keyword argument does not
        match any field in :class:`ModelConfig <mii.config.ModelConfig>` or
        :class:`MIIConfig <mii.config.MIIConfig>`.

    :return: For a local deployment, a client object that can be used to
        interface with the deployed persistent model server, which uses ragged
        batching and dynamic splitfuse. For any other deployment type `None` is
        returned; for an AML deployment the deployment assets are generated and
        the user is asked to run `deploy.sh`.
    """
    mii_config = _parse_kwargs_to_mii_config(
        model_name_or_path=model_name_or_path,
        model_config=model_config,
        mii_config=mii_config,
        **kwargs,
    )

    # TODO: Creating a score file behavior should be changed as AML deployment
    # support no longer works. Given the integration of MII/FastGen within AML
    # deployment containers, we can remove that deployment type and the need to
    # create a score file. Instead, we should create a config file (perhaps a
    # pickled MIIConfig?) and save that where we would save the score file. This
    # config can then be loaded and used similar to how the score file was used.
    # Additionally, some code for standing up the deployment server will need to
    # move from the score file template file to the `MIIServer` class:
    # MIIServer(mii_config)
    create_score_file(mii_config)

    if mii_config.deployment_type == DeploymentType.LOCAL:
        # Imports the created score file and executes the init() function, then
        # returns a MIIClient object. With the changes suggested in the comment
        # above, importing the score file would not be necessary.

        # How grpc server is created:
        # 1. The score.py file init() function makes a call to mii.backend.server.MIIServer()
        # 2. MIIServer.__init__() starts load balancer, REST API, and inference
        #    model processes via the mii.launch.multi_gpu_server script.
        #    Load balancer -> mii.grpc_related.modelresponse_server.serve_load_balancing
        #    REST API -> mii.grpc_related.restful_gateway.RestfulGatewayThread
        #    Inference model -> mii.api.async_pipeline & mii.grpc_related.modelresponse_server.serve_inference
        # 3. Load balancer and inference model create grpc.server() processes
        #    (via mii.grpc_related.modelresponse_server._do_serve)
        # 4. An MIIClient() is created that uses a "stub" (via
        #    mii.grpc_related.proto.modelresponse_pb2_grpc.ModelResponseStub) that
        #    can send/receive messages to/from the load balancer process. The load
        #    balancer process then acts as a middle layer between the client(s) and
        #    the model inference server(s)
        import_score_file(mii_config.deployment_name, DeploymentType.LOCAL).init()
        return MIIClient(mii_config=mii_config)

    if mii_config.deployment_type == DeploymentType.AML:
        # Only the deployment scripts are generated here; bringing the
        # deployment online is left to the user.
        acr_name = mii.aml_related.utils.get_acr_name()
        mii.aml_related.utils.generate_aml_scripts(
            acr_name=acr_name,
            deployment_name=mii_config.deployment_name,
            model_name=mii_config.model_conf.model,
            task_name=mii_config.model_conf.task,
            replica_num=mii_config.model_conf.replica_num,
            instance_type=mii_config.instance_type,
            version=mii_config.version,
        )
        print(
            f"AML deployment assets at {mii.aml_related.utils.aml_output_path(mii_config.deployment_name)}"
        )
        print("Please run 'deploy.sh' to bring your deployment online")

    # No client is available unless a local deployment was started above.
    return None


def pipeline(
    model_name_or_path: str = "",
    model_config: Optional[Dict] = None,
    all_rank_output: bool = False,
    **kwargs,
) -> MIIPipeline:
    """
    Creates a non-persistent MII model pipeline from a locally stored model path
    or HuggingFace model name.

    :param model_name_or_path: HuggingFace model name or path to locally stored
        model. This must be provided here or in the `model_config` dictionary.
    :param model_config: Dictionary containing model configuration fields. See
        :class:`ModelConfig <mii.config.ModelConfig>` for a full list of options.
        Users can pass these options in a dictionary here or as keyword arguments to
        the function.
    :param all_rank_output: Whether to return generated text on all ranks
        (when using `tensor_parallel>1`). If `True`, all ranks will return the
        same output. If `False`, only rank 0 will return output and the rest
        will return `None`.

    :raises UnknownArgument: Raised when provided keyword argument does not
        match any field in :class:`ModelConfig <mii.config.ModelConfig>`.

    :return: Non-persistent model pipeline using ragged batching and dynamic splitfuse.
    """
    model_config, remaining_kwargs = _parse_kwargs_to_model_config(
        model_name_or_path=model_name_or_path, model_config=model_config, **kwargs
    )
    if remaining_kwargs:
        # Join the names explicitly; interpolating `.keys()` would print the
        # `dict_keys([...])` repr in the user-facing message.
        unknown = ", ".join(remaining_kwargs)
        raise UnknownArgument(f"Keyword argument(s) {unknown} not recognized")

    inference_engine = load_model(model_config)
    tokenizer = load_tokenizer(model_config)
    return MIIPipeline(
        inference_engine=inference_engine,
        tokenizer=tokenizer,
        model_config=model_config,
        all_rank_output=all_rank_output,
    )


def async_pipeline(model_config: ModelConfig) -> MIIAsyncPipeline:
    """
    Creates an asynchronous MII model pipeline for an already-built model
    configuration.

    :param model_config: Model configuration object passed to the model
        loader, the tokenizer loader, and the pipeline itself.

    :return: Asynchronous pipeline wrapping the loaded inference engine and
        tokenizer.
    """
    # The engine is loaded before the tokenizer, as keyword arguments are
    # evaluated left to right.
    engine = load_model(model_config)
    return MIIAsyncPipeline(
        inference_engine=engine,
        tokenizer=load_tokenizer(model_config),
        model_config=model_config,
    )