diff --git a/1-build/*.py/config.properties b/1-build/*.py/config.properties new file mode 100755 index 0000000..e69de29 diff --git a/1-build/Dockerfile-base-arm b/1-build/Dockerfile-base-arm deleted file mode 100644 index fe34d5d..0000000 --- a/1-build/Dockerfile-base-arm +++ /dev/null @@ -1,9 +0,0 @@ -FROM python:3.9 - -LABEL description="Base container for CPU models running on ARM architecture processors" - -RUN apt-get update && apt-get install -y htop dnsutils bc vim - -RUN pip install torch configparser transformers - -RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc diff --git a/1-build/Dockerfile-base-graviton b/1-build/Dockerfile-base-graviton deleted file mode 100644 index 76eca36..0000000 --- a/1-build/Dockerfile-base-graviton +++ /dev/null @@ -1,9 +0,0 @@ -FROM python:3.9 - -LABEL description="Base container for CPU models running on Graviton architecture processors" - -RUN apt-get update && apt-get install -y htop dnsutils bc vim - -RUN pip install torch configparser transformers - -RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc diff --git a/1-build/Dockerfile-base-inf b/1-build/Dockerfile-base-inf new file mode 100644 index 0000000..5919bd9 --- /dev/null +++ b/1-build/Dockerfile-base-inf @@ -0,0 +1,19 @@ +FROM amazonlinux:2 + +LABEL description="Base container for Inferentia1 models" +ENV PYTHONUNBUFFERED=TRUE +ENV PYTHONDONTWRITEBYTECODE=TRUE +ADD ./1-build/etc /etc +RUN echo -e '[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0\n' >> /etc/yum.repos.d/neuron.repo +RUN rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB +RUN yum update -y && \ + yum install -y python3 python3-devel gcc-c++ && \ + yum install -y tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \ + yum install -y aws-neuronx-tools-2.* +RUN pip3 install --upgrade --force-reinstall --no-cache-dir neuron-cc[tensorflow] torch-neuron transformers==4.2.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com +RUN pip3 install --no-cache-dir torchserve==0.3.0 torch-model-archiver==0.3.0 configparser +RUN alternatives --install /usr/bin/python python /usr/bin/python3 1; alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 +RUN echo "export PATH=/opt/aws/neuron/bin:$PATH" >> /root/.bashrc +RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc +ADD ./1-build/*.py /app/ + diff --git a/1-build/Dockerfile-base-inf2 b/1-build/Dockerfile-base-inf2 index 9523b07..959532a 100644 --- a/1-build/Dockerfile-base-inf2 +++ b/1-build/Dockerfile-base-inf2 @@ -1,42 +1,21 @@ FROM amazonlinux:2 - -LABEL description="Base container for Inferentia2 models" + +LABEL description="Base container for Inferentia1 models" ENV PYTHONUNBUFFERED=TRUE ENV PYTHONDONTWRITEBYTECODE=TRUE ADD ./1-build/etc /etc -# Neuron SDK components version numbers -ARG NEURONX_RUNTIME_LIB_VERSION=2.16.* -ARG NEURONX_COLLECTIVES_LIB_VERSION=2.16.* -ARG NEURONX_TOOLS_VERSION=2.13.* -ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.10.* -ARG NEURONX_TRANSFORMERS_VERSION=0.6.* -ARG NEURONX_CC_VERSION=2.9.* -ARG TORCHSERVE_VERSION=0.8.2 - RUN echo -e '[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0\n' >> /etc/yum.repos.d/neuron.repo RUN rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB -RUN amazon-linux-extras install -y python3.8 RUN yum update -y && \ - yum install -y git tar gzip ca-certificates 
procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \ - yum install -y gcc-c++ && \ - yum install -y jq java-11-amazon-corretto-headless # for torchserve -RUN yum install -y aws-neuronx-collectives-${NEURONX_COLLECTIVES_LIB_VERSION} && \ - yum install -y aws-neuronx-runtime-lib-${NEURONX_RUNTIME_LIB_VERSION} && \ - yum install -y aws-neuronx-tools-${NEURONX_TOOLS_VERSION} -ENV PATH="/opt/aws/neuron/bin:${PATH}" -RUN echo 'alias python=python3.8' >> ~/.bashrc -RUN echo 'alias pip=pip3.8' >> ~/.bashrc -RUN update-alternatives --install /usr/bin/pip pip /usr/bin/pip3.8 1 - -RUN pip3.8 install --extra-index-url https://pip.repos.neuron.amazonaws.com \ - neuronx-cc==$NEURONX_CC_VERSION \ - torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ - transformers-neuronx==$NEURONX_TRANSFORMERS_VERSION -RUN pip3.8 install "protobuf<4" \ - && pip3.8 install torchserve==${TORCHSERVE_VERSION} \ - && pip3.8 install torch-model-archiver==${TORCHSERVE_VERSION} \ - && pip3.8 install --no-deps --no-cache-dir -U torchvision==0.14.* captum==0.6.0 configparser - + yum install -y python3 python3-devel gcc-c++ && \ + yum install -y tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \ + yum install -y aws-neuronx-tools-2.* +RUN yum install -y aws-neuronx-collectives-2.* && \ + yum install -y aws-neuronx-runtime-lib-2.* +RUN pip3 install --upgrade --force-reinstall --no-cache-dir neuronx-cc[tensorflow] torch-neuronx transformers==4.2.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com +RUN pip3 install --no-cache-dir torchserve==0.3.0 torch-model-archiver==0.3.0 configparser +RUN alternatives --install /usr/bin/python python /usr/bin/python3 1; alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 +RUN echo "export PATH=/opt/aws/neuron/bin:$PATH" >> /root/.bashrc RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc ADD ./1-build/*.py /app/ diff --git a/1-build/etc/hostname b/1-build/etc/hostname new file mode 100755 index 0000000..e69de29 diff --git a/1-build/etc/hosts b/1-build/etc/hosts new file mode 100755 index 0000000..e69de29 diff --git a/1-build/etc/resolv.conf b/1-build/etc/resolv.conf new file mode 100755 index 0000000..e69de29 diff --git a/2-trace/model-tracer.py b/2-trace/model-tracer.py index b350d9a..50f588c 100644 --- a/2-trace/model-tracer.py +++ b/2-trace/model-tracer.py @@ -1,127 +1,30 @@ -###################################################################### -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # -# SPDX-License-Identifier: MIT-0 # -###################################################################### - -import platform -import torch +import os import importlib +import torch from configparser import ConfigParser - -machine=platform.uname().machine -device_type='cpu' -if machine == 'aarch64': - device_type='arm' - -try: - import torch_neuron - device_type='inf1' -except ImportError: - print('[WARN] Torch Neuron not Found') - pass -try: - import torch_neuronx - device_type='inf2' -except ImportError: - print('[WARN] Torch Neuronx not Found') - pass - -import os - -# 1. 
READ config.properties -print("\nParsing configuration ...") -path_prefix = os.getcwd() -with open(path_prefix + '/../config.properties') as f: - config_lines = '[global]\n' + f.read() - f.close() -config = ConfigParser() -config.read_string(config_lines) - -model_name = config['global']['huggingface_model_name'] -tokenizer_class_name = config['global']['huggingface_tokenizer_class'] -model_class_name = config['global']['huggingface_model_class'] -sequence_length=int(config['global']['sequence_length']) -processor=config['global']['processor'] -pipeline_cores=config['global']['pipeline_cores'] -batch_size=int(config['global']['batch_size']) -test=config['global']['test'] - -question = "What does the little engine say?" - -context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. - Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story - about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is - pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could.""" - - -# 2. LOAD PRE-TRAINED MODEL -print(f'\nLoading pre-trained model: {model_name}') -transformers = importlib.import_module("transformers") -tokenizer_class = getattr(transformers, tokenizer_class_name) -model_class = getattr(transformers, model_class_name) -tokenizer = tokenizer_class.from_pretrained(model_name) -model = model_class.from_pretrained(model_name, return_dict=False) - -# 3. TOKENIZE THE INPUT -print('\nTokenizing input sample ...') -inputs = tokenizer.encode_plus(question, - context, - return_tensors="pt", - max_length=sequence_length, - padding='max_length', - truncation=True) -if device_type not in ['inf1', 'inf2']: - if torch.cuda.is_available(): - device = torch.device("cuda") - device_type = "gpu" - model.to(device) - inputs.to(device) - else: - device = torch.device("cpu") - -if device_type == processor: - print(f" ... Using device: {device_type}") -else: - print(f"[WARN] detected device_type ({device_type}) does not match the configured processor ({processor})") - -# 2. COMPILE THE MODEL -print('\nTracing model ...') -example_inputs = ( - torch.cat([inputs['input_ids']] * batch_size,0), - torch.cat([inputs['attention_mask']] * batch_size,0) -) -os.makedirs(f'traced-{model_name}', exist_ok=True) -torch.set_num_threads(6) -if 'inf' == processor: - model_traced = torch.neuron.trace(model, - example_inputs, - verbose=1, - compiler_workdir=f'./traced-{model_name}/compile_wd_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}', - compiler_args = ['--neuroncore-pipeline-cores', str(pipeline_cores)]) -elif 'inf2' == processor: - model_traced = torch_neuronx.trace(model, - example_inputs) -else: - model_traced = torch.jit.trace(model, example_inputs) - -# 3. 
TEST THE COMPILED MODEL (Optional) -if test.lower() == 'true': - print("\nTesting traced model ...") - print(f"Question: {question}") - # Testing the traced model - answer_logits = model_traced(*example_inputs) - answer_start = answer_logits[0].argmax().item() - answer_end = answer_logits[1].argmax().item()+1 - answer_txt = "" - if answer_end > answer_start: - answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])) - else: - answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:])) - print(f'Model Answer: {answer_txt}') - -# 4. SAVE THE COMPILED MODEL -print('\nSaving traced model ...') -model_path=f'./traced-{model_name}/{model_name}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}_{processor}.pt' -model_traced.save(model_path) - -print(f'Done. Model saved as: {model_path}') +from transformers_neuronx.llama.model import LlamaForSampling +from transformers import AutoModelForCausalLM +from transformers_neuronx.module import save_pretrained_split +tp_degree = 2 +batch_size = 1 +sequence_length = 256 +amp_type = 'bf16' +os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference" +os.environ['NEURON_RT_NUM_CORES'] = str(tp_degree) +os.environ["NEURONX_CACHE"]= "on" +os.environ["NEURONX_DUMP_TO"] = f"./neuron_cache/tp{tp_degree}_bs{batch_size}_seqlen{sequence_length}" +# create a directory for model +model_dir = "/app/llama_model" # hugging face format +os.makedirs(model_dir, exist_ok=True) +# initialize the model +model = AutoModelForCausalLM.from_pretrained(model_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16) +# serialize the model +serialized_model_dir = os.path.join(model_dir, 'serialized') +os.makedirs(serialized_model_dir, exist_ok=True) +save_pretrained_split(model, serialized_model_dir) +# create neuron model +#transformers_neuronx = importlib.import_module("transformers_neuronx") +#neuron_model_class = getattr(transformers_neuronx, neuron_model_class_name) +neuron_model = LlamaForSampling.from_pretrained(serialized_model_dir, tp_degree=tp_degree, batch_size=batch_size, amp=amp_type) +# compile model for neuron +neuron_model.to_neuron() diff --git a/2-trace/old_model-tracer.py b/2-trace/old_model-tracer.py new file mode 100644 index 0000000..4fb6dde --- /dev/null +++ b/2-trace/old_model-tracer.py @@ -0,0 +1,124 @@ +###################################################################### +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # +# SPDX-License-Identifier: MIT-0 # +###################################################################### + +import torch +import importlib +from configparser import ConfigParser + +device_type='cpu' + +try: + import torch_neuron + device_type='inf1' +except ImportError: + print('[WARN] Torch Neuron not Found') + pass +try: + import torch_neuronx + device_type='inf2' +except ImportError: + print('[WARN] Torch Neuronx not Found') + pass + +import os + +# 1. 
READ config.properties +print("\nParsing configuration ...") +path_prefix = os.getcwd() +with open(path_prefix + '/../config.properties') as f: + config_lines = '[global]\n' + f.read() + f.close() +config = ConfigParser() +config.read_string(config_lines) + +model_name = config['global']['huggingface_model_name'] +tokenizer_class_name = config['global']['huggingface_tokenizer_class'] +model_class_name = config['global']['huggingface_model_class'] +sequence_length=int(config['global']['sequence_length']) +processor=config['global']['processor'] +pipeline_cores=config['global']['pipeline_cores'] +batch_size=int(config['global']['batch_size']) +test=config['global']['test'] + +question = "What does the little engine say?" + +context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. + Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story + about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is + pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could.""" + + +# 2. LOAD PRE-TRAINED MODEL +print(f'\nLoading pre-trained model: {model_name}') +transformers = importlib.import_module("transformers") +tokenizer_class = getattr(transformers, tokenizer_class_name) +model_class = getattr(transformers, model_class_name) +tokenizer = tokenizer_class.from_pretrained(model_name) +model = model_class.from_pretrained(model_name, return_dict=False) + +# 3. TOKENIZE THE INPUT +print('\nTokenizing input sample ...') +inputs = tokenizer.encode_plus(question, + context, + return_tensors="pt", + max_length=sequence_length, + padding='max_length', + truncation=True) +if device_type not in ['inf1', 'inf2']: + if torch.cuda.is_available(): + device = torch.device("cuda") + device_type = "gpu" + model.to(device) + inputs.to(device) + else: + device = torch.device("cpu") + device_type = 'cpu' + +if device_type == processor: + print(f" ... Using device: {device_type}") +else: + print(f"[WARN] detected device_type ({device_type}) does not match the configured processor ({processor})") + +# 2. COMPILE THE MODEL +print('\nTracing model ...') +example_inputs = ( + torch.cat([inputs['input_ids']] * batch_size,0), + torch.cat([inputs['attention_mask']] * batch_size,0) +) +os.makedirs(f'traced-{model_name}', exist_ok=True) +torch.set_num_threads(6) +if 'inf' in processor: + model_traced = torch.neuron.trace(model, + example_inputs, + verbose=1, + compiler_workdir=f'./traced-{model_name}/compile_wd_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}', + compiler_args = ['--neuroncore-pipeline-cores', str(pipeline_cores)]) +elif 'inf2' in processor: + model_traced = torch_neuronx.trace(model, + example_inputs) +else: + model_traced = torch.jit.trace(model, example_inputs) + +# 3. 
TEST THE COMPILED MODEL (Optional) +if test.lower() == 'true': + print("\nTesting traced model ...") + print(f"Question: {question}") + # Testing the traced model + answer_logits = model_traced(*example_inputs) + answer_start = answer_logits[0].argmax().item() + answer_end = answer_logits[1].argmax().item()+1 + answer_txt = "" + if answer_end > answer_start: + answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])) + else: + answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:])) + print(f'Model Answer: {answer_txt}') + +# 4. SAVE THE COMPILED MODEL +print('\nSaving traced model ...') +model_path=f'./traced-{model_name}/{model_name}_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}.pt' +model_traced.save(model_path) + +print(f'Done. Model saved as: {model_path}') diff --git a/3-pack/Dockerfile b/3-pack/Dockerfile index f505e40..a90c0ce 100644 --- a/3-pack/Dockerfile +++ b/3-pack/Dockerfile @@ -18,12 +18,22 @@ COPY ./3-pack/run.sh /app/server/run.sh COPY ./3-pack/requirements.txt /app/server/requirements.txt -COPY ./2-trace/traced-${MODEL_NAME}/${MODEL_FILE_NAME} /app/server/models +COPY ./llama_model/serialized /app/server/models/serialized + +#COPY ./$HOME/llamav2_13b_converted/serialized /app/server/models/serialized + +COPY ./2-trace/neuron_cache/tp2_bs1_seqlen256 /app/server/models/tp2_bs1_seqlen256 + +COPY ./llama_model/tokenizer* /app/server/models/ + +#COPY ./$HOME/llamav2_13b_converted/tokenizer* /app/server/models/ RUN pip install -r /app/server/requirements.txt +RUN pip install python-multipart + WORKDIR /app/server EXPOSE 8080 -CMD ["./run.sh"] \ No newline at end of file +CMD ["./run.sh"] diff --git a/3-pack/fastapi-server.py b/3-pack/fastapi-server.py index 2e4108e..63d7eb3 100644 --- a/3-pack/fastapi-server.py +++ b/3-pack/fastapi-server.py @@ -9,6 +9,8 @@ import torch, os, logging import importlib import platform +from transformers import AutoTokenizer +from transformers_neuronx.llama.model import LlamaForSampling global device global processor @@ -19,6 +21,7 @@ global postprocess global default_question, default_context + logger = logging.getLogger() # Read static configuration from config.properties @@ -30,22 +33,18 @@ config = ConfigParser() config.read_string(config_lines) model_name = config['global']['huggingface_model_name'] -tokenizer_class_name = config['global']['huggingface_tokenizer_class'] +tokenizer_class_name = config['global']['huggingface_tokenizer_class'] model_class_name = config['global']['huggingface_model_class'] -sequence_length=config['global']['sequence_length'] +neuron_model_class_name = config['global']['neuron_model_class'] +sequence_length=int(config['global']['sequence_length']) processor=config['global']['processor'] -pipeline_cores=config['global']['pipeline_cores'] -batch_size=config['global']['batch_size'] -default_question = "What does the little engine say" -default_context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. - Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story - about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is - pulling the heavy load all the way to the top of the mountain. 
On the way down it says: I thought I could.""" +pipeline_cores=int(config['global']['pipeline_cores']) +batch_size=int(config['global']['batch_size']) +default_prompts = ["My name is Mike and"]*batch_size +tp_degree=int(config['global']['tp_degree']) +amp_type=config['global']['amp_type'] # Read runtime configuration from environment -postprocess=True -if (os.getenv("POSTPROCESS",'True').lower() in ['false','0']): - postprocess=False quiet=False if (os.getenv("QUIET","False").lower() in ['true','1']): quiet=True @@ -56,7 +55,7 @@ logger.warning(f"Failed to parse environment variable NUM_MODELS={os.getenv('NUM_MODELS')}") logger.warning("Please ensure if set NUM_MODELS is a numeric value. Assuming value of 1") -# Detect runtime device type inf1, inf2, gpu, cpu, or arm +# Detect runtime device type inf2, gpu, cpu, or arm device_type="" try: @@ -101,54 +100,67 @@ async def read_root(): # Model inference API endpoint @app.get("/predictions/{model_id}") -async def infer(model_id, seq_0: Optional[str] = default_question, seq_1: Optional[str] = default_context): - question=seq_0 - context=seq_1 +async def infer(model_id, seqs: Optional[list] = default_prompts): + prompts=seqs status=200 if model_id in models.keys(): if not quiet: - logger.warning(f"\nQuestion: {question}\n") - tokenizer=tokenizers[model_id] - encoded_input = tokenizer.encode_plus(question, context, return_tensors='pt', max_length=128, padding='max_length', truncation=True) - if processor=='gpu': - encoded_input.to(device) - model=models[model_id] - model_input = (encoded_input['input_ids'], encoded_input['attention_mask']) - output=model(*model_input) # This is specific to Inferentia - answer_text = str(output[0]) - if postprocess: - answer_start = torch.argmax(output[0]) - answer_end = torch.argmax(output[1])+1 - if (answer_end > answer_start): - answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:answer_end])) - else: - answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:])) + logger.warning(f"\nQuestion: {prompts}\n") + + tokenizer = tokenizers[model_id] + tokens = tokenizer(prompts, return_tensors="pt") + neuron_model=models[model_id] + generated_sequences = neuron_model.sample(tokens.input_ids, sequence_length=sequence_length, top_k=50) + generated_sequences = [tokenizer.decode(seq) for seq in generated_sequences] + if not quiet: logger.warning("\nAnswer: ") - logger.warning(answer_text) + logger.warning(generated_sequences) else: status=404 - answer_text = f"Model {model_id} does not exist. Try a model name up to model{num_models-1}" + generated_sequences = f"Model {model_id} does not exist. 
Try a model name up to model{num_models-1}" if not quiet: - logger.warning(answer_text) - return responses.JSONResponse(status_code=status, content={"detail": answer_text}) + logger.warning(generated_sequences) + return responses.JSONResponse(status_code=status, content={"detail": generated_sequences}) # Load models in memory and onto accelerator as needed -model_suffix = "_bs"+batch_size+"_seq"+sequence_length+"_pc"+pipeline_cores+"_"+processor -model_path=os.path.join(path_prefix,'models',model_name + model_suffix + ".pt") -logger.warning(f"Loading {num_models} instances of pre-trained model {model_name} from path {model_path} ...") +#model_suffix = "_bs"+batch_size+"_seq"+sequence_length+"_pc"+pipeline_cores+"_"+processor +#model_path=os.path.join(path_prefix,'models',model_name + model_suffix + ".pt") +#logger.warning(f"Loading {num_models} instances of pre-trained model {model_name} from path {model_path} ...") + +# set neuron environment variable +os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference" +os.environ['NEURON_RT_NUM_CORES'] = str(tp_degree) +os.environ["NEURONX_CACHE"]= "on" +os.environ["NEURONX_DUMP_TO"] = f"/app/server/models/tp{tp_degree}_bs{batch_size}_seqlen{sequence_length}" + +model_dir = "/app/server/models" # [TODO], hard-coded, to add to config.properties +tokenizer_dir = "/app/server/models" # tokenizer in the same directory as model + +serialized_model_dir = os.path.join(model_dir, 'serialized') +os.makedirs(serialized_model_dir, exist_ok=True) + tokenizers={} models={} transformers = importlib.import_module("transformers") tokenizer_class = getattr(transformers, tokenizer_class_name) +transformers_neuronx = importlib.import_module("transformers_neuronx") +#neuron_model_class = getattr(transformers_neuronx, neuron_model_class_name) + for i in range(num_models): model_id = 'model' + str(i) logger.warning(f" {model_id} ...") - tokenizers[model_id]=tokenizer_class.from_pretrained(model_name) - models[model_id] = torch.jit.load(model_path) - if device_type=='gpu': - model=models[model_id] - model.to(device) - elif device_type in ['inf1', 'inf2']: - infer(model_id, default_question, default_context) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir) + tokenizers[model_id]=tokenizer + if device_type in ['inf2']: + #models[model_id] = neuron_model_class.from_pretrained(serialized_model_dir, tp_degree=tp_degree, batch_size=batch_size, amp=amp_type) + models[model_id] = LlamaForSampling.from_pretrained(serialized_model_dir, tp_degree=tp_degree, batch_size=batch_size, amp=amp_type) + neuron_model = models[model_id] + neuron_model.to_neuron() # compile model and load weights into device memory + infer(model_id, default_prompts) logger.warning(" ... warmup completed") + else: + logger.warning(" ... inference other than inf2 needs to be added") + + + diff --git a/3-pack/old_fastapi-server.py b/3-pack/old_fastapi-server.py new file mode 100644 index 0000000..2aee0a2 --- /dev/null +++ b/3-pack/old_fastapi-server.py @@ -0,0 +1,150 @@ +###################################################################### +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# +# SPDX-License-Identifier: MIT-0 # +###################################################################### + +from typing import Optional +from fastapi import FastAPI,logger,responses +from configparser import ConfigParser +import torch, os, logging +import importlib + +global device +global processor +global device_type +global model +global tokenizer +global logger +global postprocess +global default_question, default_context + +logger = logging.getLogger() + +# Read static configuration from config.properties +logger.warning("\nParsing configuration ...") +path_prefix = os.path.dirname(__file__) +with open(path_prefix + '/../config.properties') as f: + config_lines = '[global]\n' + f.read() + f.close() +config = ConfigParser() +config.read_string(config_lines) +model_name = config['global']['huggingface_model_name'] +tokenizer_class_name = config['global']['huggingface_tokenizer_class'] +model_class_name = config['global']['huggingface_model_class'] +sequence_length=config['global']['sequence_length'] +processor=config['global']['processor'] +pipeline_cores=config['global']['pipeline_cores'] +batch_size=config['global']['batch_size'] +default_question = "What does the little engine say" +default_context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. + Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story + about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is + pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could.""" + +# Read runtime configuration from environment +postprocess=True +if (os.getenv("POSTPROCESS",'True').lower() in ['false','0']): + postprocess=False +quiet=False +if (os.getenv("QUIET","False").lower() in ['true','1']): + quiet=True +num_models=1 +try: + num_models=int(os.getenv("NUM_MODELS", '1')) +except ValueError: + logger.warning(f"Failed to parse environment variable NUM_MODELS={os.getenv('NUM_MODELS')}") + logger.warning("Please ensure if set NUM_MODELS is a numeric value. 
Assuming value of 1") + +# Detect runtime device type inf1, inf2, gpu, or cpu +device_type="" + +try: + import torch_neuron + device_type="inf1" +except ImportError: + logger.warning("Inf1 chip not detected") + pass +try: + import torch_neuronx + device_type = 'inf2' +except ImportError: + print('[WARN] Inf2 device not found') + pass + + +if device_type in ['inf1', 'inf2']: + pass +elif torch.cuda.is_available(): + device_type="gpu" + device = torch.device("cuda") + logger.warning(torch.cuda.get_device_name(0)) +else: + device_type="cpu" + device = torch.device(device_type) + +if processor != device_type: + logger.warning(f"Configured target processor {processor} differs from actual processor {device_type}") +logger.warning(f"Running models on processor: {device_type}") + + +# FastAPI server +app = FastAPI() + +# Server healthcheck +@app.get("/") +async def read_root(): + return {"Status": "Healthy"} + +# Model inference API endpoint +@app.get("/predictions/{model_id}") +async def infer(model_id, seq_0: Optional[str] = default_question, seq_1: Optional[str] = default_context): + question=seq_0 + context=seq_1 + status=200 + if model_id in models.keys(): + if not quiet: + logger.warning(f"\nQuestion: {question}\n") + tokenizer=tokenizers[model_id] + encoded_input = tokenizer.encode_plus(question, context, return_tensors='pt', max_length=128, padding='max_length', truncation=True) + if processor=='gpu': + encoded_input.to(device) + model=models[model_id] + model_input = (encoded_input['input_ids'], encoded_input['attention_mask']) + output=model(*model_input) # This is specific to Inferentia + answer_text = str(output[0]) + if postprocess: + answer_start = torch.argmax(output[0]) + answer_end = torch.argmax(output[1])+1 + if (answer_end > answer_start): + answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:answer_end])) + else: + answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:])) + if not quiet: + logger.warning("\nAnswer: ") + logger.warning(answer_text) + else: + status=404 + answer_text = f"Model {model_id} does not exist. Try a model name up to model{num_models-1}" + if not quiet: + logger.warning(answer_text) + return responses.JSONResponse(status_code=status, content={"detail": answer_text}) + +# Load models in memory and onto accelerator as needed +model_suffix = "bs"+batch_size+"_seq"+sequence_length+"_pc"+pipeline_cores +model_path=os.path.join(path_prefix,'models',model_name + "_" + processor + "_" + model_suffix + ".pt") +logger.warning(f"Loading {num_models} instances of pre-trained model {model_name} from path {model_path} ...") +tokenizers={} +models={} +transformers = importlib.import_module("transformers") +tokenizer_class = getattr(transformers, tokenizer_class_name) +for i in range(num_models): + model_id = 'model' + str(i) + logger.warning(f" {model_id} ...") + tokenizers[model_id]=tokenizer_class.from_pretrained(model_name) + models[model_id] = torch.jit.load(model_path) + if device_type=='gpu': + model=models[model_id] + model.to(device) + elif device_type in ['inf1', 'inf2']: + infer(model_id, default_question, default_context) + logger.warning(" ... 
warmup completed") diff --git a/4-deploy/cpu-yaml.template b/4-deploy/cpu-yaml.template index 380a97c..a392642 100644 --- a/4-deploy/cpu-yaml.template +++ b/4-deploy/cpu-yaml.template @@ -49,10 +49,6 @@ spec: - name: pod-port containerPort: 8080 resources: - # Use 'memory' setting in limits and requests to ensure that model pods get scheduled to nodes evenly limits: cpu: 1 - #memory: "27000Mi" - #requests: - #memory: "27000Mi" diff --git a/4-deploy/graviton-yaml.template b/4-deploy/graviton-yaml.template deleted file mode 100644 index 73a8b0a..0000000 --- a/4-deploy/graviton-yaml.template +++ /dev/null @@ -1,67 +0,0 @@ ---- -kind: Service -apiVersion: v1 -metadata: - name: ${instance_name} - namespace: ${namespace} - labels: - app: ${instance_name} -spec: - ports: - - name: preds - port: ${service_port} - targetPort: pod-port - type: ClusterIP - selector: - app: ${instance_name} ---- -kind: Deployment -apiVersion: apps/v1 -metadata: - name: ${instance_name} - namespace: ${namespace} - labels: - app: ${instance_name} -spec: - replicas: 1 - selector: - matchLabels: - app: ${instance_name} - template: - metadata: - labels: - app: ${instance_name} - spec: - nodeSelector: - node.kubernetes.io/instance-type: "${instance_type}" - topologySpreadConstraints: - - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: DoNotSchedule - #nodeAffinityPolicy: Honor - labelSelector: - matchLabels: - app: ${instance_name} - containers: - - name: main - image: "${registry}${model_image_name}${model_image_tag}" - imagePullPolicy: Always - env: - - name: NUM_MODELS - value: "${num_models}" - - name: POSTPROCESS - value: "${postprocess}" - - name: QUIET - value: "${quiet}" - ports: - - name: pod-port - containerPort: 8080 - resources: - #use limits and requests to ensure that certain number of model pods get scheduled per node - limits: - #set value based on total available node memory and intended num pods/node - memory: "27000Mi" - requests: - #set value based on total available node memory and intended num pods/node - memory: "27000Mi" - diff --git a/4-deploy/inf2-yaml.template b/4-deploy/inf2-yaml.template index 1137375..c4380ba 100644 --- a/4-deploy/inf2-yaml.template +++ b/4-deploy/inf2-yaml.template @@ -57,7 +57,6 @@ spec: add: - IPC_LOCK resources: - #use limits and requests to ensure that certain number of model pods get scheduled per node limits: #hugepages-2Mi: 256Mi # configure to 256 * desired number of Inferentia devices. aws.amazon.com/neuron: 1 # desired number of Inferentia devices. 
diff --git a/5-test/Dockerfile b/5-test/Dockerfile index c2ec0cf..0464e17 100644 --- a/5-test/Dockerfile +++ b/5-test/Dockerfile @@ -8,4 +8,8 @@ COPY config.properties /app ADD ./5-test/tests /app/tests +RUN apt-get update + +RUN echo "Y" | apt-get install dnsutils + CMD ["bash","-c","while true; do date; sleep 10; done"] diff --git a/5-test/deployment-yaml.template b/5-test/deployment-yaml.template index 992b4e6..564a420 100644 --- a/5-test/deployment-yaml.template +++ b/5-test/deployment-yaml.template @@ -17,27 +17,12 @@ spec: app: ${instance_name} spec: nodeSelector: - node.kubernetes.io/instance-type: "${test_instance_type}" + beta.kubernetes.io/instance-type: "${test_instance_type}" containers: - name: main image: "${registry}${test_image_name}${test_image_tag}" command: ["bash","-c","${cmd_pod}"] imagePullPolicy: Always - env: - - name: runtime - value: "$runtime" - - name: num_servers - value: "$num_servers" - - name: num_models - value: "$num_models" - - name: app_name - value: "$app_name" - - name: namespace - value: "$namespace" - - name: num_requests - value: "$num_requests" - - name: request_frequency - value: "$request_frequency" resources: limits: cpu: 1 diff --git a/5-test/job-yaml.template b/5-test/job-yaml.template index 36b231c..c655e98 100644 --- a/5-test/job-yaml.template +++ b/5-test/job-yaml.template @@ -14,28 +14,13 @@ spec: app: ${instance_name} spec: nodeSelector: - node.kubernetes.io/instance-type: "${test_instance_type}" + beta.kubernetes.io/instance-type: "${test_instance_type}" restartPolicy: Never containers: - name: main image: "${registry}${test_image_name}${test_image_tag}" command: ["bash","-c","${cmd_pod}"] imagePullPolicy: Always - env: - - name: runtime - value: "$runtime" - - name: num_servers - value: "$num_servers" - - name: num_models - value: "$num_models" - - name: app_name - value: "$app_name" - - name: namespace - value: "$namespace" - - name: num_requests - value: "$num_requests" - - name: request_frequency - value: "$request_frequency" resources: - requests: + limits: cpu: 1 diff --git a/5-test/run.sh b/5-test/run.sh index a93f539..a7280c5 100755 --- a/5-test/run.sh +++ b/5-test/run.sh @@ -58,7 +58,7 @@ if [ "$runtime" == "docker" ]; then elif [ "$runtime" == "kubernetes" ]; then pushd ./5-test > /dev/null if [ "$1" == "bma" ]; then - CMD="kubectl -n ${test_namespace} get pods | grep ${test_image_name}- | cut -d ' ' -f 1 | xargs -L 1 kubectl -n ${test_namespace} logs | grep { | grep -v 0.0, | tee ./bmk-all.log" + CMD="kubectl -n ${test_namespace} get pods | grep ${test_image_name}- | cut -d ' ' -f 1 | xargs -L 1 kubectl logs | grep { | grep -v 0.0, | tee ./bmk-all.log" command -v bc > /dev/null if [ "$?" == "1" ]; then echo "bc not found" @@ -91,4 +91,4 @@ elif [ "$runtime" == "kubernetes" ]; then popd > /dev/null else echo "Runtime $runtime not recognized" -fi +fi \ No newline at end of file diff --git a/5-test/tests/benchmark.sh b/5-test/tests/benchmark.sh index 9ec1a0d..682a050 100755 --- a/5-test/tests/benchmark.sh +++ b/5-test/tests/benchmark.sh @@ -5,21 +5,14 @@ # SPDX-License-Identifier: MIT-0 # ###################################################################### -if [ "$num_servers" == "" ]; then - - echo "Configuring number of model servers from config.properties ..." - - if [ -f ../config.properties ]; then - source ../config.properties - elif [ -f ../../config.properties ]; then - source ../../config.properties - elif [ -f ./config.properties ]; then - source ./config.properties - else - echo "config.properties not found!" 
- fi +if [ -f ../config.properties ]; then + source ../config.properties +elif [ -f ../../config.properties ]; then + source ../../config.properties +elif [ -f ./config.properties ]; then + source ./config.properties else - echo "Number of model servers ($num_servers) configured from environment ..." + echo "config.properties not found!" fi if [ "$runtime" == "docker" ]; then @@ -28,4 +21,4 @@ elif [ "$runtime" == "kubernetes" ]; then python benchmark_client.py --num_thread 2 --url http://${app_name}-[INSTANCE_IDX].${namespace}.svc.cluster.local:8080/predictions/model[MODEL_IDX] --is_multi_instance --n_instance ${num_servers} --is_multi_model_per_instance --n_model_per_instance ${num_models} --latency_window_size 1000 --cache_dns else echo "Runtime $runtime not recognized" -fi +fi \ No newline at end of file diff --git a/5-test/tests/curl-rnd-ip.sh b/5-test/tests/curl-rnd-ip.sh index 8e0cdcf..c65c679 100755 --- a/5-test/tests/curl-rnd-ip.sh +++ b/5-test/tests/curl-rnd-ip.sh @@ -4,19 +4,15 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # # SPDX-License-Identifier: MIT-0 # ###################################################################### -if [ "$num_servers" == "" ]; then - echo "Configuring number of model servers from config.properties ..." - if [ -f ../config.properties ]; then - source ../config.properties - elif [ -f ../../config.properties ]; then - source ../../config.properties - elif [ -f ./config.properties ]; then - source ./config.properties - else - echo "config.properties not found!" - fi + +if [ -f ../config.properties ]; then + source ../config.properties +elif [ -f ../../config.properties ]; then + source ../../config.properties +elif [ -f ./config.properties ]; then + source ./config.properties else - echo "Configured number of model servers ($num_servers) from environment" + echo "config.properties not found!" fi server=0 @@ -26,7 +22,6 @@ models=$num_models # get instance ip addresses rm -f ./endpoint_ip.conf -echo "runtime=$runtime" while [ $server -lt $servers ] do if [ "$runtime" == "docker" ]; then @@ -57,4 +52,4 @@ do request=$((request+1)) done -rm -f ./endpoint_ip.conf +rm -f ./endpoint_ip.conf \ No newline at end of file diff --git a/5-test/tests/curl-seq-ip.sh b/5-test/tests/curl-seq-ip.sh index a9d7c73..44833e1 100755 --- a/5-test/tests/curl-seq-ip.sh +++ b/5-test/tests/curl-seq-ip.sh @@ -5,19 +5,14 @@ # SPDX-License-Identifier: MIT-0 # ###################################################################### -if [ "$num_servers" == "" ]; then - echo "Configuring number of model servers from config.properties ..." - if [ -f ../config.properties ]; then - source ../config.properties - elif [ -f ../../config.properties ]; then - source ../../config.properties - elif [ -f ./config.properties ]; then - source ./config.properties - else - echo "config.properties not found!" - fi +if [ -f ../config.properties ]; then + source ../config.properties +elif [ -f ../../config.properties ]; then + source ../../config.properties +elif [ -f ./config.properties ]; then + source ./config.properties else - echo "Configured number of model servers ($num_servers) from environment" + echo "config.properties not found!" 
fi server=0 @@ -27,15 +22,12 @@ models=$num_models # get server ip addresses rm -f ./endpoint_ip.conf -echo "runtime=$runtime" while [ $server -lt $servers ] do if [ "$runtime" == "docker" ]; then instance_ip=$(cat /etc/hosts | grep ${app_name}-${server} | awk '{print $1}') elif [ "$runtime" == "kubernetes" ]; then - #echo "host=${app_name}-${server}.${namespace}.svc.cluster.local" instance_ip=$(host ${app_name}-${server}.${namespace}.svc.cluster.local | grep "has address" | cut -d ' ' -f 4) - #echo "instance_ip=$instance_ip" fi echo $instance_ip >> endpoint_ip.conf server=$((server+1)) @@ -60,4 +52,4 @@ do server=$((server+1)) done -rm -f ./endpoint_ip.conf +rm -f ./endpoint_ip.conf \ No newline at end of file diff --git a/README.md b/README.md index 6094a46..5e20617 100644 --- a/README.md +++ b/README.md @@ -6,34 +6,14 @@ enables hybrid deployments where the best processor/accelerator is used to serve In this sample repository, we use a [bert-base](https://huggingface.co/distilbert-base-multilingual-cased) NLP model from [huggingface.co](https://huggingface.co/), however the project structure and workflow is generic and can be adapted for use with other models.
- - +
-Fig. 1 - Sample Amazon EKS cluster infrastructure for deploying, running and testing ML Inference workloads +Fig. 1 - Sample EKS infrastructure for inference workloads

-The ML inference workloads in this sample project are deployed on the CPU, GPU, or Inferentia nodes as shown on Fig. 1. The control scripts run in any location that has access to the cluster API. To eliminate latency concern related to the cluster ingress, load tests run in a pod within the cluster and send requests to the models directly through the cluster pod network. -
-1. The Amazon EKS cluster has several node groups, with one EC2 instance family per node group. Each node group can support different instance types, such as CPU (c5,c6i, c7g), GPU (g4dn), AWS Inferentia (Inf2) -and can pack multiple models per EKS node to maximize the number of served ML models that are running in a node group. -Model bin packing is used to maximize compute and memory utilization of the compute node EC2 instances in the cluster node groups. -
-2. The natural language processing (NLP) open-source PyTorch model from [huggingface.co](https://huggingface.co/) serving application and ML framework dependencies are built by Users as container images -using Automation framework uploaded to Amazon Elastic Container Registry - [Amazon ECR](https://aws.amazon.com/ecr/). -
-3. Using project Automation framework, Model container images are obtained from ECR and deployed to [Amazon EKS cluster](https://aws.amazon.com/eks/) using generated Deployment and Service manifests via Kubernetes API -exposed via Elastic Load Balancer (ELB). Model deployments are customized for each target EKS compute node instance type via settings in the central configuration file. -
-4. Following best practices of separation of Model data from containers that run it, ML model microservice design allows to scale out to a large number of models. In the project, model containers are pulling data from -Amazon Simple Storage Service ([Amazon S3](https://aws.amazon.com)) and other public model data sources each time they are initialized. -
-5. Using project Automation framework, Test container images are obtained from ECR and deployed to Amazon EKS cluster using generated Deployment and Service manifests via Kubernetes API. -Test deployments are customized for each deployment target EKS compute node architecture via settings in the central configuration file. Load/scale testing is performed via sending simultaneous requests -to the Model service pool. Performance Test results metrics are obtained, recorded and aggregated. -
-
-

+The inference workloads in this sample project are deployed on the CPU, GPU, or Inferentia nodes as shown in Fig. 1. The control scripts run from any location that has access to the cluster API. To eliminate latency concerns related to cluster ingress, load tests run in a pod within the cluster and send requests to the models directly over the cluster pod network. +
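[Editor's note, illustration only — not part of the patch] A request to one of the deployed model servers over the cluster pod network might look like the sketch below. The hostname is an assumption assembled from the config.properties defaults in this diff (app_name=${huggingface_model_name}-${processor}, namespace=mpi, service_port=8080) and the ${app_name}-${server}.${namespace}.svc.cluster.local naming used by the 5-test curl scripts; adjust it to your deployment.

```python
# Hypothetical in-cluster client for the FastAPI /predictions endpoint.
# Service name, namespace, and port are assumptions taken from the
# config.properties defaults in this diff; change them to match your cluster.
import requests

url = "http://llama-2-13b-hf-inf2-0.mpi.svc.cluster.local:8080/predictions/model0"
resp = requests.get(url, timeout=300)  # with no query params the server falls back to its default prompts
print(resp.status_code)
print(resp.json()["detail"])           # generated sequences, or an error message for an unknown model id
```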

diff --git a/build.sh b/build.sh index c79b11a..db9b917 100755 --- a/build.sh +++ b/build.sh @@ -28,15 +28,29 @@ if [ "$action" == "" ]; then echo "Building base container ..." echo "" - dockerfile=./1-build/Dockerfile-base-${processor} - if [ -f $dockerfile ]; then - echo " ... base-${processor} ..." - docker build -t ${registry}${base_image_name}${base_image_tag} -f $dockerfile . - else - echo "Dockerfile $dockerfile was not found." - echo "Please ensure that processor is configured with a supported value in config.properties" - exit 1 - fi + case "$processor" in + "cpu") + echo " ... base-cpu ..." + docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-cpu . + ;; + "gpu") + echo " ... base-gpu ..." + docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-gpu . + ;; + "inf1") + echo " ... base-inf1 ..." + docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-inf1 . + ;; + "inf2") + echo " ... base-inf2 ..." + docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-inf2 . + ;; + *) + echo "Please ensure cpu, gpu, inf1 or inf2 is configure as processor in config.properties" + exit 1 + ;; + esac + elif [ "$action" == "push" ]; then ./1-build/push.sh elif [ "$action" == "pull" ]; then diff --git a/config.properties b/config.properties index f47a489..cba5da6 100644 --- a/config.properties +++ b/config.properties @@ -8,15 +8,21 @@ ###################################################################### # Model settings -huggingface_model_name=bert-base-multilingual-cased -huggingface_tokenizer_class=BertTokenizer -huggingface_model_class=BertForQuestionAnswering +huggingface_model_name=llama-2-13b-hf +#huggingface_model_name=llamav2_7b_converted +huggingface_tokenizer_class=AutoTokenizer +huggingface_model_class=AutoModelForCausalLM + +# Neuron setting +neuron_model_class=LlamaForSampling +tp_degree=2 +amp_type=bf16 # Compiler settings -# processor = cpu|gpu|inf1|inf2|graviton -processor=graviton +# processor = cpu|gpu|inf1|inf2|arm +processor=inf2 pipeline_cores=1 -sequence_length=128 +sequence_length=256 batch_size=1 test=True @@ -24,7 +30,7 @@ test=True account=$(aws sts get-caller-identity --query Account --output text) # region is used to login if the registry is ecr -region=us-east-1 +region=us-west-2 # Container settings # Default is the private ECR registry in the current AWS account. 
@@ -33,15 +39,13 @@ region=us-east-1 registry=${account}.dkr.ecr.${region}.amazonaws.com/ # registry_type=ecr registry_type=ecr -base_image_name=aws-do-inference-base -base_image_tag=:v10-${processor} -model_image_name=${huggingface_model_name} -model_image_tag=:v10-${processor} - -# if using pre-built public registry image (may require authentication) use the following settings -#registry=public.ecr.aws/a2u7h5w3 -#model_image_name=bert-base-workshop -#model_image_tag=:v10-${processor} +#base_image_name=aws-do-inference-base +#base_image_name=llama2container +base_image_name=base-${processor} +#base_image_tag=:v9-${processor} +base_image_tag=:v1 +model_image_name=${huggingface_model_name}-${processor} +model_image_tag=:v1 # Trace settings # trace_opts_$processor is a processor-specific setting used by the docker run command in the trace.sh script @@ -50,8 +54,7 @@ trace_opts_cpu="" trace_opts_gpu="--gpus 0" trace_opts_inf1="-e AWS_NEURON_VISIBLE_DEVICES=ALL --privileged" trace_opts_inf2="-e AWS_NEURON_VISIBLE_DEVICES=ALL --privileged" -trace_opts_graviton="" - +trace_opts_arm="" # Deployment settings # some of these settings apply only when the runtime is kubernetes # runtime = docker | kubernetes @@ -65,11 +68,11 @@ postprocess=True # service_port=8080 - port on which model service will be exposed service_port=8080 # Kubernetes-specific deployment settings -# instance_type = c5.xxx | g4dn.xlarge | g4dn.12xlarge | inf1.xlarge | inf2.8xlarge | c7g.4xlarge... +# instance_type = c5.xxx | g4dn.xlarge | g4dn.12xlarge | inf1.xlarge | inf1.6xlarge | ... # A node group with the specified instance_type must exist in the cluster # The instance type must have the processor configured above -# Example: processor=graviton, instance_type=c7g.4xlarge -instance_type=c7g.4xlarge +# Example: processor=arm, instance_type=c7g.4xlarge +instance_type=c5.4xlarge # num_servers - number of model servers to deploy # note that more than one model server can run on a node with multiple cpu/gpu/inferentia chips. # example: 4 model servers fit on one inf1.6xlarge instance as it has 4 inferentia chips. 
@@ -80,24 +83,18 @@ namespace=mpi app_name=${huggingface_model_name}-${processor} app_dir=app-${app_name}-${instance_type} -# Test image settings +# Test settings test_image_name=test-${huggingface_model_name} -test_image_tag=:v10-cpu - -#when using pre-built test image available in public ECR registry (may require authentication): -#registry=public.ecr.aws/a2u7h5w3/ -#test_image_name=bert-base-workshop -#test_image_tag=:test-v10-cpu - +test_image_tag=:v9-cpu # request_frequency - time to sleep between two consecutive requests in curl tests request_frequency=0.01 # Stop random request test after num_requests number of requests num_requests=30 # Number of test containers to launch (default=1), use > 1 for scale testing num_test_containers=1 -# test_instance_type - when runtime is kubernetes, node instance type on which test pods will run -test_instance_type=c5.4xlarge +# test_instance_type - when runtime is kubernetes, instance type on which test pods will run +test_instance_type=c5.xlarge # test_namespace - when runtime is kubernetes, namespace where test pods will be created test_namespace=mpi -# test_dir - when runtime is kubernetes, directory where test job/pod manifests are stored +# test_dir - when runtime is kubernetes, directory where test pod manifests are stored test_dir=app-${test_image_name}-${instance_type} diff --git a/k8s-neuron-device-plugin-rbac.yml b/k8s-neuron-device-plugin-rbac.yml new file mode 100644 index 0000000..ae30e52 --- /dev/null +++ b/k8s-neuron-device-plugin-rbac.yml @@ -0,0 +1,59 @@ +# rbac.yaml +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: neuron-device-plugin +rules: +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +- apiGroups: + - "" + resources: + - pods + verbs: + - update + - patch + - get + - list + - watch +- apiGroups: + - "" + resources: + - nodes/status + verbs: + - patch + - update +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: neuron-device-plugin + namespace: kube-system +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: neuron-device-plugin + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: neuron-device-plugin +subjects: +- kind: ServiceAccount + name: neuron-device-plugin + namespace: kube-system diff --git a/k8s-neuron-device-plugin.yml b/k8s-neuron-device-plugin.yml new file mode 100644 index 0000000..25b43ad --- /dev/null +++ b/k8s-neuron-device-plugin.yml @@ -0,0 +1,98 @@ +# https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: neuron-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: neuron-device-plugin-ds + updateStrategy: + type: RollingUpdate + template: + metadata: + annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + labels: + name: neuron-device-plugin-ds + spec: + serviceAccount: neuron-device-plugin + tolerations: + - key: CriticalAddonsOnly + operator: Exists + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. 
+ # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "beta.kubernetes.io/instance-type" + operator: In + values: + - inf1.xlarge + - inf1.2xlarge + - inf1.6xlarge + - inf1.24xlarge + - inf2.xlarge + - inf2.4xlarge + - inf2.8xlarge + - inf2.24xlarge + - inf2.48xlarge + - trn1.2xlarge + - trn1.32xlarge + - trn1n.32xlarge + - matchExpressions: + - key: "node.kubernetes.io/instance-type" + operator: In + values: + - inf1.xlarge + - inf1.2xlarge + - inf1.6xlarge + - inf1.24xlarge + - inf2.xlarge + - inf2.4xlarge + - inf2.8xlarge + - inf2.24xlarge + - inf2.48xlarge + - trn1.2xlarge + - trn1.32xlarge + - trn1n.32xlarge + containers: + #Device Plugin containers are available both in us-east and us-west ecr + #repos + - image: public.ecr.aws/neuron/neuron-device-plugin:2.16.18.0 + imagePullPolicy: Always + name: neuron-device-plugin + env: + - name: KUBECONFIG + value: /etc/kubernetes/kubelet.conf + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: infa-map + mountPath: /run + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: infa-map + hostPath: + path: /run + + diff --git a/low-latency-high-throughput-inference-on-amazon-eks.png b/low-latency-high-throughput-inference-on-amazon-eks.png deleted file mode 100644 index a22e366..0000000 Binary files a/low-latency-high-throughput-inference-on-amazon-eks.png and /dev/null differ diff --git a/pack.sh b/pack.sh index c82a103..118bd15 100755 --- a/pack.sh +++ b/pack.sh @@ -25,7 +25,7 @@ source ./config.properties action=$1 if [ "$action" == "" ]; then - model_file_name=${huggingface_model_name}_bs${batch_size}_seq${sequence_length}_pc${pipeline_cores}_${processor}.pt + model_file_name=${huggingface_model_name}_${processor}_bs${batch_size}_seq${sequence_length}_pc${pipeline_cores}.pt docker build -t ${registry}${model_image_name}${model_image_tag} --build-arg BASE_IMAGE=${registry}${base_image_name}${base_image_tag} \ --build-arg MODEL_NAME=${huggingface_model_name} --build-arg MODEL_FILE_NAME=${model_file_name} --build-arg PROCESSOR=${processor} \ diff --git a/trace.sh b/trace.sh index 4decc5a..b75ebae 100755 --- a/trace.sh +++ b/trace.sh @@ -19,18 +19,27 @@ print_help() { if [ "$1" == "" ]; then source ./config.properties echo "" - echo "Tracing model: $huggingface_model_name ..." - - dockerfile=./1-build/Dockerfile-base-${processor} + echo "Tracing model $huggingface_model_name ..." + echo "" - if [ -f $dockerfile ]; then - echo " ... for processor: $processor ..." - trace_opts=trace_opts_${processor} - docker run ${!trace_opts} -it --rm -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py" - else - echo "Processor $processor is not supported. Please ensure the processor setting in config.properties is configured properly" - exit 1 - fi + case "$processor" in + "cpu") + echo " ... for cpu ..." 
+ docker run -it --rm -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py" + ;; + "gpu") + echo " ... for gpu ..." + docker run --gpus 0 -it --rm -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py" + ;; + "inf") + echo " ... for inf ..." + docker run -it --rm -e AWS_NEURON_VISIBLE_DEVICES=ALL --privileged -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py" + ;; + *) + echo "Please ensure cpu, gpu, or inf is configure as processor in config.properties" + exit 1 + ;; + esac else print_help fi
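[Editor's note, illustration only — not part of the patch] The new tracer and server follow the same transformers-neuronx flow: split checkpoint, LlamaForSampling.from_pretrained, to_neuron(), then sample(). Below is a minimal end-to-end sketch, assuming a LLaMA-2 checkpoint already serialized with save_pretrained_split() and the tp_degree/batch_size/sequence_length/amp values from config.properties; paths are placeholders.

```python
# Minimal sketch of the compile-and-generate path used by the new
# 2-trace/model-tracer.py and 3-pack/fastapi-server.py; directory names are placeholders.
import os
from transformers import AutoTokenizer
from transformers_neuronx.llama.model import LlamaForSampling

tp_degree, batch_size, sequence_length, amp_type = 2, 1, 256, "bf16"  # values from config.properties
os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference"
os.environ["NEURON_RT_NUM_CORES"] = str(tp_degree)

tokenizer = AutoTokenizer.from_pretrained("./llama_model")             # tokenizer files sit next to the model
neuron_model = LlamaForSampling.from_pretrained("./llama_model/serialized",
                                                tp_degree=tp_degree,
                                                batch_size=batch_size,
                                                amp=amp_type)
neuron_model.to_neuron()                                               # compile and load weights onto NeuronCores

tokens = tokenizer(["My name is Mike and"] * batch_size, return_tensors="pt")
generated = neuron_model.sample(tokens.input_ids, sequence_length=sequence_length, top_k=50)
print([tokenizer.decode(seq) for seq in generated])
```

Parameterizing the hard-coded /app/llama_model and /app/server/models paths through config.properties, as the [TODO] comment in fastapi-server.py suggests, would keep the tracer and server consistent.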