diff --git a/1-build/*.py/config.properties b/1-build/*.py/config.properties new file mode 100755 index 0000000..e69de29 diff --git a/1-build/Dockerfile-base-arm b/1-build/Dockerfile-base-arm deleted file mode 100644 index fe34d5d..0000000 --- a/1-build/Dockerfile-base-arm +++ /dev/null @@ -1,9 +0,0 @@ -FROM python:3.9 - -LABEL description="Base container for CPU models running on ARM architecture processors" - -RUN apt-get update && apt-get install -y htop dnsutils bc vim - -RUN pip install torch configparser transformers - -RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc diff --git a/1-build/Dockerfile-base-graviton b/1-build/Dockerfile-base-graviton deleted file mode 100644 index 76eca36..0000000 --- a/1-build/Dockerfile-base-graviton +++ /dev/null @@ -1,9 +0,0 @@ -FROM python:3.9 - -LABEL description="Base container for CPU models running on Graviton architecture processors" - -RUN apt-get update && apt-get install -y htop dnsutils bc vim - -RUN pip install torch configparser transformers - -RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc diff --git a/1-build/Dockerfile-base-inf b/1-build/Dockerfile-base-inf new file mode 100644 index 0000000..5919bd9 --- /dev/null +++ b/1-build/Dockerfile-base-inf @@ -0,0 +1,19 @@ +FROM amazonlinux:2 + +LABEL description="Base container for Inferentia1 models" +ENV PYTHONUNBUFFERED=TRUE +ENV PYTHONDONTWRITEBYTECODE=TRUE +ADD ./1-build/etc /etc +RUN echo -e '[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0\n' >> /etc/yum.repos.d/neuron.repo +RUN rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB +RUN yum update -y && \ + yum install -y python3 python3-devel gcc-c++ && \ + yum install -y tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \ + yum install -y aws-neuronx-tools-2.* +RUN pip3 install --upgrade --force-reinstall --no-cache-dir neuron-cc[tensorflow] torch-neuron transformers==4.2.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com +RUN pip3 install --no-cache-dir torchserve==0.3.0 torch-model-archiver==0.3.0 configparser +RUN alternatives --install /usr/bin/python python /usr/bin/python3 1; alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 +RUN echo "export PATH=/opt/aws/neuron/bin:$PATH" >> /root/.bashrc +RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc +ADD ./1-build/*.py /app/ + diff --git a/1-build/Dockerfile-base-inf2 b/1-build/Dockerfile-base-inf2 index 9523b07..959532a 100644 --- a/1-build/Dockerfile-base-inf2 +++ b/1-build/Dockerfile-base-inf2 @@ -1,42 +1,21 @@ FROM amazonlinux:2 - -LABEL description="Base container for Inferentia2 models" + +LABEL description="Base container for Inferentia1 models" ENV PYTHONUNBUFFERED=TRUE ENV PYTHONDONTWRITEBYTECODE=TRUE ADD ./1-build/etc /etc -# Neuron SDK components version numbers -ARG NEURONX_RUNTIME_LIB_VERSION=2.16.* -ARG NEURONX_COLLECTIVES_LIB_VERSION=2.16.* -ARG NEURONX_TOOLS_VERSION=2.13.* -ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.10.* -ARG NEURONX_TRANSFORMERS_VERSION=0.6.* -ARG NEURONX_CC_VERSION=2.9.* -ARG TORCHSERVE_VERSION=0.8.2 - RUN echo -e '[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0\n' >> /etc/yum.repos.d/neuron.repo RUN rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB -RUN amazon-linux-extras install -y python3.8 RUN yum update -y && \ - yum install -y git tar gzip ca-certificates 
procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \ - yum install -y gcc-c++ && \ - yum install -y jq java-11-amazon-corretto-headless # for torchserve -RUN yum install -y aws-neuronx-collectives-${NEURONX_COLLECTIVES_LIB_VERSION} && \ - yum install -y aws-neuronx-runtime-lib-${NEURONX_RUNTIME_LIB_VERSION} && \ - yum install -y aws-neuronx-tools-${NEURONX_TOOLS_VERSION} -ENV PATH="/opt/aws/neuron/bin:${PATH}" -RUN echo 'alias python=python3.8' >> ~/.bashrc -RUN echo 'alias pip=pip3.8' >> ~/.bashrc -RUN update-alternatives --install /usr/bin/pip pip /usr/bin/pip3.8 1 - -RUN pip3.8 install --extra-index-url https://pip.repos.neuron.amazonaws.com \ - neuronx-cc==$NEURONX_CC_VERSION \ - torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ - transformers-neuronx==$NEURONX_TRANSFORMERS_VERSION -RUN pip3.8 install "protobuf<4" \ - && pip3.8 install torchserve==${TORCHSERVE_VERSION} \ - && pip3.8 install torch-model-archiver==${TORCHSERVE_VERSION} \ - && pip3.8 install --no-deps --no-cache-dir -U torchvision==0.14.* captum==0.6.0 configparser - + yum install -y python3 python3-devel gcc-c++ && \ + yum install -y tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \ + yum install -y aws-neuronx-tools-2.* +RUN yum install -y aws-neuronx-collectives-2.* && \ + yum install -y aws-neuronx-runtime-lib-2.* +RUN pip3 install --upgrade --force-reinstall --no-cache-dir neuronx-cc[tensorflow] torch-neuronx transformers==4.2.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com +RUN pip3 install --no-cache-dir torchserve==0.3.0 torch-model-archiver==0.3.0 configparser +RUN alternatives --install /usr/bin/python python /usr/bin/python3 1; alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 +RUN echo "export PATH=/opt/aws/neuron/bin:$PATH" >> /root/.bashrc RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc ADD ./1-build/*.py /app/ diff --git a/1-build/etc/hostname b/1-build/etc/hostname new file mode 100755 index 0000000..e69de29 diff --git a/1-build/etc/hosts b/1-build/etc/hosts new file mode 100755 index 0000000..e69de29 diff --git a/1-build/etc/resolv.conf b/1-build/etc/resolv.conf new file mode 100755 index 0000000..e69de29 diff --git a/2-trace/model-tracer.py b/2-trace/model-tracer.py index b350d9a..50f588c 100644 --- a/2-trace/model-tracer.py +++ b/2-trace/model-tracer.py @@ -1,127 +1,30 @@ -###################################################################### -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # -# SPDX-License-Identifier: MIT-0 # -###################################################################### - -import platform -import torch +import os import importlib +import torch from configparser import ConfigParser - -machine=platform.uname().machine -device_type='cpu' -if machine == 'aarch64': - device_type='arm' - -try: - import torch_neuron - device_type='inf1' -except ImportError: - print('[WARN] Torch Neuron not Found') - pass -try: - import torch_neuronx - device_type='inf2' -except ImportError: - print('[WARN] Torch Neuronx not Found') - pass - -import os - -# 1. 
READ config.properties -print("\nParsing configuration ...") -path_prefix = os.getcwd() -with open(path_prefix + '/../config.properties') as f: - config_lines = '[global]\n' + f.read() - f.close() -config = ConfigParser() -config.read_string(config_lines) - -model_name = config['global']['huggingface_model_name'] -tokenizer_class_name = config['global']['huggingface_tokenizer_class'] -model_class_name = config['global']['huggingface_model_class'] -sequence_length=int(config['global']['sequence_length']) -processor=config['global']['processor'] -pipeline_cores=config['global']['pipeline_cores'] -batch_size=int(config['global']['batch_size']) -test=config['global']['test'] - -question = "What does the little engine say?" - -context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. - Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story - about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is - pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could.""" - - -# 2. LOAD PRE-TRAINED MODEL -print(f'\nLoading pre-trained model: {model_name}') -transformers = importlib.import_module("transformers") -tokenizer_class = getattr(transformers, tokenizer_class_name) -model_class = getattr(transformers, model_class_name) -tokenizer = tokenizer_class.from_pretrained(model_name) -model = model_class.from_pretrained(model_name, return_dict=False) - -# 3. TOKENIZE THE INPUT -print('\nTokenizing input sample ...') -inputs = tokenizer.encode_plus(question, - context, - return_tensors="pt", - max_length=sequence_length, - padding='max_length', - truncation=True) -if device_type not in ['inf1', 'inf2']: - if torch.cuda.is_available(): - device = torch.device("cuda") - device_type = "gpu" - model.to(device) - inputs.to(device) - else: - device = torch.device("cpu") - -if device_type == processor: - print(f" ... Using device: {device_type}") -else: - print(f"[WARN] detected device_type ({device_type}) does not match the configured processor ({processor})") - -# 2. COMPILE THE MODEL -print('\nTracing model ...') -example_inputs = ( - torch.cat([inputs['input_ids']] * batch_size,0), - torch.cat([inputs['attention_mask']] * batch_size,0) -) -os.makedirs(f'traced-{model_name}', exist_ok=True) -torch.set_num_threads(6) -if 'inf' == processor: - model_traced = torch.neuron.trace(model, - example_inputs, - verbose=1, - compiler_workdir=f'./traced-{model_name}/compile_wd_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}', - compiler_args = ['--neuroncore-pipeline-cores', str(pipeline_cores)]) -elif 'inf2' == processor: - model_traced = torch_neuronx.trace(model, - example_inputs) -else: - model_traced = torch.jit.trace(model, example_inputs) - -# 3. 
TEST THE COMPILED MODEL (Optional) -if test.lower() == 'true': - print("\nTesting traced model ...") - print(f"Question: {question}") - # Testing the traced model - answer_logits = model_traced(*example_inputs) - answer_start = answer_logits[0].argmax().item() - answer_end = answer_logits[1].argmax().item()+1 - answer_txt = "" - if answer_end > answer_start: - answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])) - else: - answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:])) - print(f'Model Answer: {answer_txt}') - -# 4. SAVE THE COMPILED MODEL -print('\nSaving traced model ...') -model_path=f'./traced-{model_name}/{model_name}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}_{processor}.pt' -model_traced.save(model_path) - -print(f'Done. Model saved as: {model_path}') +from transformers_neuronx.llama.model import LlamaForSampling +from transformers import AutoModelForCausalLM +from transformers_neuronx.module import save_pretrained_split +tp_degree = 2 +batch_size = 1 +sequence_length = 256 +amp_type = 'bf16' +os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference" +os.environ['NEURON_RT_NUM_CORES'] = str(tp_degree) +os.environ["NEURONX_CACHE"]= "on" +os.environ["NEURONX_DUMP_TO"] = f"./neuron_cache/tp{tp_degree}_bs{batch_size}_seqlen{sequence_length}" +# create a directory for model +model_dir = "/app/llama_model" # hugging face format +os.makedirs(model_dir, exist_ok=True) +# initialize the model +model = AutoModelForCausalLM.from_pretrained(model_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16) +# serialize the model +serialized_model_dir = os.path.join(model_dir, 'serialized') +os.makedirs(serialized_model_dir, exist_ok=True) +save_pretrained_split(model, serialized_model_dir) +# create neuron model +#transformers_neuronx = importlib.import_module("transformers_neuronx") +#neuron_model_class = getattr(transformers_neuronx, neuron_model_class_name) +neuron_model = LlamaForSampling.from_pretrained(serialized_model_dir, tp_degree=tp_degree, batch_size=batch_size, amp=amp_type) +# compile model for neuron +neuron_model.to_neuron() diff --git a/2-trace/old_model-tracer.py b/2-trace/old_model-tracer.py new file mode 100644 index 0000000..4fb6dde --- /dev/null +++ b/2-trace/old_model-tracer.py @@ -0,0 +1,124 @@ +###################################################################### +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # +# SPDX-License-Identifier: MIT-0 # +###################################################################### + +import torch +import importlib +from configparser import ConfigParser + +device_type='cpu' + +try: + import torch_neuron + device_type='inf1' +except ImportError: + print('[WARN] Torch Neuron not Found') + pass +try: + import torch_neuronx + device_type='inf2' +except ImportError: + print('[WARN] Torch Neuronx not Found') + pass + +import os + +# 1. 
READ config.properties +print("\nParsing configuration ...") +path_prefix = os.getcwd() +with open(path_prefix + '/../config.properties') as f: + config_lines = '[global]\n' + f.read() + f.close() +config = ConfigParser() +config.read_string(config_lines) + +model_name = config['global']['huggingface_model_name'] +tokenizer_class_name = config['global']['huggingface_tokenizer_class'] +model_class_name = config['global']['huggingface_model_class'] +sequence_length=int(config['global']['sequence_length']) +processor=config['global']['processor'] +pipeline_cores=config['global']['pipeline_cores'] +batch_size=int(config['global']['batch_size']) +test=config['global']['test'] + +question = "What does the little engine say?" + +context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. + Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story + about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is + pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could.""" + + +# 2. LOAD PRE-TRAINED MODEL +print(f'\nLoading pre-trained model: {model_name}') +transformers = importlib.import_module("transformers") +tokenizer_class = getattr(transformers, tokenizer_class_name) +model_class = getattr(transformers, model_class_name) +tokenizer = tokenizer_class.from_pretrained(model_name) +model = model_class.from_pretrained(model_name, return_dict=False) + +# 3. TOKENIZE THE INPUT +print('\nTokenizing input sample ...') +inputs = tokenizer.encode_plus(question, + context, + return_tensors="pt", + max_length=sequence_length, + padding='max_length', + truncation=True) +if device_type not in ['inf1', 'inf2']: + if torch.cuda.is_available(): + device = torch.device("cuda") + device_type = "gpu" + model.to(device) + inputs.to(device) + else: + device = torch.device("cpu") + device_type = 'cpu' + +if device_type == processor: + print(f" ... Using device: {device_type}") +else: + print(f"[WARN] detected device_type ({device_type}) does not match the configured processor ({processor})") + +# 2. COMPILE THE MODEL +print('\nTracing model ...') +example_inputs = ( + torch.cat([inputs['input_ids']] * batch_size,0), + torch.cat([inputs['attention_mask']] * batch_size,0) +) +os.makedirs(f'traced-{model_name}', exist_ok=True) +torch.set_num_threads(6) +if 'inf' in processor: + model_traced = torch.neuron.trace(model, + example_inputs, + verbose=1, + compiler_workdir=f'./traced-{model_name}/compile_wd_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}', + compiler_args = ['--neuroncore-pipeline-cores', str(pipeline_cores)]) +elif 'inf2' in processor: + model_traced = torch_neuronx.trace(model, + example_inputs) +else: + model_traced = torch.jit.trace(model, example_inputs) + +# 3. 
TEST THE COMPILED MODEL (Optional) +if test.lower() == 'true': + print("\nTesting traced model ...") + print(f"Question: {question}") + # Testing the traced model + answer_logits = model_traced(*example_inputs) + answer_start = answer_logits[0].argmax().item() + answer_end = answer_logits[1].argmax().item()+1 + answer_txt = "" + if answer_end > answer_start: + answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])) + else: + answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:])) + print(f'Model Answer: {answer_txt}') + +# 4. SAVE THE COMPILED MODEL +print('\nSaving traced model ...') +model_path=f'./traced-{model_name}/{model_name}_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}.pt' +model_traced.save(model_path) + +print(f'Done. Model saved as: {model_path}') diff --git a/3-pack/Dockerfile b/3-pack/Dockerfile index f505e40..a90c0ce 100644 --- a/3-pack/Dockerfile +++ b/3-pack/Dockerfile @@ -18,12 +18,22 @@ COPY ./3-pack/run.sh /app/server/run.sh COPY ./3-pack/requirements.txt /app/server/requirements.txt -COPY ./2-trace/traced-${MODEL_NAME}/${MODEL_FILE_NAME} /app/server/models +COPY ./llama_model/serialized /app/server/models/serialized + +#COPY ./$HOME/llamav2_13b_converted/serialized /app/server/models/serialized + +COPY ./2-trace/neuron_cache/tp2_bs1_seqlen256 /app/server/models/tp2_bs1_seqlen256 + +COPY ./llama_model/tokenizer* /app/server/models/ + +#COPY ./$HOME/llamav2_13b_converted/tokenizer* /app/server/models/ RUN pip install -r /app/server/requirements.txt +RUN pip install python-multipart + WORKDIR /app/server EXPOSE 8080 -CMD ["./run.sh"] \ No newline at end of file +CMD ["./run.sh"] diff --git a/3-pack/fastapi-server.py b/3-pack/fastapi-server.py index 2e4108e..63d7eb3 100644 --- a/3-pack/fastapi-server.py +++ b/3-pack/fastapi-server.py @@ -9,6 +9,8 @@ import torch, os, logging import importlib import platform +from transformers import AutoTokenizer +from transformers_neuronx.llama.model import LlamaForSampling global device global processor @@ -19,6 +21,7 @@ global postprocess global default_question, default_context + logger = logging.getLogger() # Read static configuration from config.properties @@ -30,22 +33,18 @@ config = ConfigParser() config.read_string(config_lines) model_name = config['global']['huggingface_model_name'] -tokenizer_class_name = config['global']['huggingface_tokenizer_class'] +tokenizer_class_name = config['global']['huggingface_tokenizer_class'] model_class_name = config['global']['huggingface_model_class'] -sequence_length=config['global']['sequence_length'] +neuron_model_class_name = config['global']['neuron_model_class'] +sequence_length=int(config['global']['sequence_length']) processor=config['global']['processor'] -pipeline_cores=config['global']['pipeline_cores'] -batch_size=config['global']['batch_size'] -default_question = "What does the little engine say" -default_context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. - Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story - about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is - pulling the heavy load all the way to the top of the mountain. 
On the way down it says: I thought I could.""" +pipeline_cores=int(config['global']['pipeline_cores']) +batch_size=int(config['global']['batch_size']) +default_prompts = ["My name is Mike and"]*batch_size +tp_degree=int(config['global']['tp_degree']) +amp_type=config['global']['amp_type'] # Read runtime configuration from environment -postprocess=True -if (os.getenv("POSTPROCESS",'True').lower() in ['false','0']): - postprocess=False quiet=False if (os.getenv("QUIET","False").lower() in ['true','1']): quiet=True @@ -56,7 +55,7 @@ logger.warning(f"Failed to parse environment variable NUM_MODELS={os.getenv('NUM_MODELS')}") logger.warning("Please ensure if set NUM_MODELS is a numeric value. Assuming value of 1") -# Detect runtime device type inf1, inf2, gpu, cpu, or arm +# Detect runtime device type inf2, gpu, cpu, or arm device_type="" try: @@ -101,54 +100,67 @@ async def read_root(): # Model inference API endpoint @app.get("/predictions/{model_id}") -async def infer(model_id, seq_0: Optional[str] = default_question, seq_1: Optional[str] = default_context): - question=seq_0 - context=seq_1 +async def infer(model_id, seqs: Optional[list] = default_prompts): + prompts=seqs status=200 if model_id in models.keys(): if not quiet: - logger.warning(f"\nQuestion: {question}\n") - tokenizer=tokenizers[model_id] - encoded_input = tokenizer.encode_plus(question, context, return_tensors='pt', max_length=128, padding='max_length', truncation=True) - if processor=='gpu': - encoded_input.to(device) - model=models[model_id] - model_input = (encoded_input['input_ids'], encoded_input['attention_mask']) - output=model(*model_input) # This is specific to Inferentia - answer_text = str(output[0]) - if postprocess: - answer_start = torch.argmax(output[0]) - answer_end = torch.argmax(output[1])+1 - if (answer_end > answer_start): - answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:answer_end])) - else: - answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:])) + logger.warning(f"\nQuestion: {prompts}\n") + + tokenizer = tokenizers[model_id] + tokens = tokenizer(prompts, return_tensors="pt") + neuron_model=models[model_id] + generated_sequences = neuron_model.sample(tokens.input_ids, sequence_length=sequence_length, top_k=50) + generated_sequences = [tokenizer.decode(seq) for seq in generated_sequences] + if not quiet: logger.warning("\nAnswer: ") - logger.warning(answer_text) + logger.warning(generated_sequences) else: status=404 - answer_text = f"Model {model_id} does not exist. Try a model name up to model{num_models-1}" + generated_sequences = f"Model {model_id} does not exist. 
Try a model name up to model{num_models-1}" if not quiet: - logger.warning(answer_text) - return responses.JSONResponse(status_code=status, content={"detail": answer_text}) + logger.warning(generated_sequences) + return responses.JSONResponse(status_code=status, content={"detail": generated_sequences}) # Load models in memory and onto accelerator as needed -model_suffix = "_bs"+batch_size+"_seq"+sequence_length+"_pc"+pipeline_cores+"_"+processor -model_path=os.path.join(path_prefix,'models',model_name + model_suffix + ".pt") -logger.warning(f"Loading {num_models} instances of pre-trained model {model_name} from path {model_path} ...") +#model_suffix = "_bs"+batch_size+"_seq"+sequence_length+"_pc"+pipeline_cores+"_"+processor +#model_path=os.path.join(path_prefix,'models',model_name + model_suffix + ".pt") +#logger.warning(f"Loading {num_models} instances of pre-trained model {model_name} from path {model_path} ...") + +# set neuron environment variable +os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference" +os.environ['NEURON_RT_NUM_CORES'] = str(tp_degree) +os.environ["NEURONX_CACHE"]= "on" +os.environ["NEURONX_DUMP_TO"] = f"/app/server/models/tp{tp_degree}_bs{batch_size}_seqlen{sequence_length}" + +model_dir = "/app/server/models" # [TODO], hard-coded, to add to config.properties +tokenizer_dir = "/app/server/models" # tokenizer in the same directory as model + +serialized_model_dir = os.path.join(model_dir, 'serialized') +os.makedirs(serialized_model_dir, exist_ok=True) + tokenizers={} models={} transformers = importlib.import_module("transformers") tokenizer_class = getattr(transformers, tokenizer_class_name) +transformers_neuronx = importlib.import_module("transformers_neuronx") +#neuron_model_class = getattr(transformers_neuronx, neuron_model_class_name) + for i in range(num_models): model_id = 'model' + str(i) logger.warning(f" {model_id} ...") - tokenizers[model_id]=tokenizer_class.from_pretrained(model_name) - models[model_id] = torch.jit.load(model_path) - if device_type=='gpu': - model=models[model_id] - model.to(device) - elif device_type in ['inf1', 'inf2']: - infer(model_id, default_question, default_context) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir) + tokenizers[model_id]=tokenizer + if device_type in ['inf2']: + #models[model_id] = neuron_model_class.from_pretrained(serialized_model_dir, tp_degree=tp_degree, batch_size=batch_size, amp=amp_type) + models[model_id] = LlamaForSampling.from_pretrained(serialized_model_dir, tp_degree=tp_degree, batch_size=batch_size, amp=amp_type) + neuron_model = models[model_id] + neuron_model.to_neuron() # compile model and load weights into device memory + infer(model_id, default_prompts) logger.warning(" ... warmup completed") + else: + logger.warning(" ... inference other than inf2 needs to be added") + + + diff --git a/3-pack/old_fastapi-server.py b/3-pack/old_fastapi-server.py new file mode 100644 index 0000000..2aee0a2 --- /dev/null +++ b/3-pack/old_fastapi-server.py @@ -0,0 +1,150 @@ +###################################################################### +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# +# SPDX-License-Identifier: MIT-0 # +###################################################################### + +from typing import Optional +from fastapi import FastAPI,logger,responses +from configparser import ConfigParser +import torch, os, logging +import importlib + +global device +global processor +global device_type +global model +global tokenizer +global logger +global postprocess +global default_question, default_context + +logger = logging.getLogger() + +# Read static configuration from config.properties +logger.warning("\nParsing configuration ...") +path_prefix = os.path.dirname(__file__) +with open(path_prefix + '/../config.properties') as f: + config_lines = '[global]\n' + f.read() + f.close() +config = ConfigParser() +config.read_string(config_lines) +model_name = config['global']['huggingface_model_name'] +tokenizer_class_name = config['global']['huggingface_tokenizer_class'] +model_class_name = config['global']['huggingface_model_class'] +sequence_length=config['global']['sequence_length'] +processor=config['global']['processor'] +pipeline_cores=config['global']['pipeline_cores'] +batch_size=config['global']['batch_size'] +default_question = "What does the little engine say" +default_context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. + Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story + about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is + pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could.""" + +# Read runtime configuration from environment +postprocess=True +if (os.getenv("POSTPROCESS",'True').lower() in ['false','0']): + postprocess=False +quiet=False +if (os.getenv("QUIET","False").lower() in ['true','1']): + quiet=True +num_models=1 +try: + num_models=int(os.getenv("NUM_MODELS", '1')) +except ValueError: + logger.warning(f"Failed to parse environment variable NUM_MODELS={os.getenv('NUM_MODELS')}") + logger.warning("Please ensure if set NUM_MODELS is a numeric value. 
Assuming value of 1") + +# Detect runtime device type inf1, inf2, gpu, or cpu +device_type="" + +try: + import torch_neuron + device_type="inf1" +except ImportError: + logger.warning("Inf1 chip not detected") + pass +try: + import torch_neuronx + device_type = 'inf2' +except ImportError: + print('[WARN] Inf2 device not found') + pass + + +if device_type in ['inf1', 'inf2']: + pass +elif torch.cuda.is_available(): + device_type="gpu" + device = torch.device("cuda") + logger.warning(torch.cuda.get_device_name(0)) +else: + device_type="cpu" + device = torch.device(device_type) + +if processor != device_type: + logger.warning(f"Configured target processor {processor} differs from actual processor {device_type}") +logger.warning(f"Running models on processor: {device_type}") + + +# FastAPI server +app = FastAPI() + +# Server healthcheck +@app.get("/") +async def read_root(): + return {"Status": "Healthy"} + +# Model inference API endpoint +@app.get("/predictions/{model_id}") +async def infer(model_id, seq_0: Optional[str] = default_question, seq_1: Optional[str] = default_context): + question=seq_0 + context=seq_1 + status=200 + if model_id in models.keys(): + if not quiet: + logger.warning(f"\nQuestion: {question}\n") + tokenizer=tokenizers[model_id] + encoded_input = tokenizer.encode_plus(question, context, return_tensors='pt', max_length=128, padding='max_length', truncation=True) + if processor=='gpu': + encoded_input.to(device) + model=models[model_id] + model_input = (encoded_input['input_ids'], encoded_input['attention_mask']) + output=model(*model_input) # This is specific to Inferentia + answer_text = str(output[0]) + if postprocess: + answer_start = torch.argmax(output[0]) + answer_end = torch.argmax(output[1])+1 + if (answer_end > answer_start): + answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:answer_end])) + else: + answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:])) + if not quiet: + logger.warning("\nAnswer: ") + logger.warning(answer_text) + else: + status=404 + answer_text = f"Model {model_id} does not exist. Try a model name up to model{num_models-1}" + if not quiet: + logger.warning(answer_text) + return responses.JSONResponse(status_code=status, content={"detail": answer_text}) + +# Load models in memory and onto accelerator as needed +model_suffix = "bs"+batch_size+"_seq"+sequence_length+"_pc"+pipeline_cores +model_path=os.path.join(path_prefix,'models',model_name + "_" + processor + "_" + model_suffix + ".pt") +logger.warning(f"Loading {num_models} instances of pre-trained model {model_name} from path {model_path} ...") +tokenizers={} +models={} +transformers = importlib.import_module("transformers") +tokenizer_class = getattr(transformers, tokenizer_class_name) +for i in range(num_models): + model_id = 'model' + str(i) + logger.warning(f" {model_id} ...") + tokenizers[model_id]=tokenizer_class.from_pretrained(model_name) + models[model_id] = torch.jit.load(model_path) + if device_type=='gpu': + model=models[model_id] + model.to(device) + elif device_type in ['inf1', 'inf2']: + infer(model_id, default_question, default_context) + logger.warning(" ... 
warmup completed") diff --git a/4-deploy/cpu-yaml.template b/4-deploy/cpu-yaml.template index 380a97c..a392642 100644 --- a/4-deploy/cpu-yaml.template +++ b/4-deploy/cpu-yaml.template @@ -49,10 +49,6 @@ spec: - name: pod-port containerPort: 8080 resources: - # Use 'memory' setting in limits and requests to ensure that model pods get scheduled to nodes evenly limits: cpu: 1 - #memory: "27000Mi" - #requests: - #memory: "27000Mi" diff --git a/4-deploy/graviton-yaml.template b/4-deploy/graviton-yaml.template deleted file mode 100644 index 73a8b0a..0000000 --- a/4-deploy/graviton-yaml.template +++ /dev/null @@ -1,67 +0,0 @@ ---- -kind: Service -apiVersion: v1 -metadata: - name: ${instance_name} - namespace: ${namespace} - labels: - app: ${instance_name} -spec: - ports: - - name: preds - port: ${service_port} - targetPort: pod-port - type: ClusterIP - selector: - app: ${instance_name} ---- -kind: Deployment -apiVersion: apps/v1 -metadata: - name: ${instance_name} - namespace: ${namespace} - labels: - app: ${instance_name} -spec: - replicas: 1 - selector: - matchLabels: - app: ${instance_name} - template: - metadata: - labels: - app: ${instance_name} - spec: - nodeSelector: - node.kubernetes.io/instance-type: "${instance_type}" - topologySpreadConstraints: - - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: DoNotSchedule - #nodeAffinityPolicy: Honor - labelSelector: - matchLabels: - app: ${instance_name} - containers: - - name: main - image: "${registry}${model_image_name}${model_image_tag}" - imagePullPolicy: Always - env: - - name: NUM_MODELS - value: "${num_models}" - - name: POSTPROCESS - value: "${postprocess}" - - name: QUIET - value: "${quiet}" - ports: - - name: pod-port - containerPort: 8080 - resources: - #use limits and requests to ensure that certain number of model pods get scheduled per node - limits: - #set value based on total available node memory and intended num pods/node - memory: "27000Mi" - requests: - #set value based on total available node memory and intended num pods/node - memory: "27000Mi" - diff --git a/4-deploy/inf2-yaml.template b/4-deploy/inf2-yaml.template index 1137375..c4380ba 100644 --- a/4-deploy/inf2-yaml.template +++ b/4-deploy/inf2-yaml.template @@ -57,7 +57,6 @@ spec: add: - IPC_LOCK resources: - #use limits and requests to ensure that certain number of model pods get scheduled per node limits: #hugepages-2Mi: 256Mi # configure to 256 * desired number of Inferentia devices. aws.amazon.com/neuron: 1 # desired number of Inferentia devices. 
diff --git a/5-test/Dockerfile b/5-test/Dockerfile index c2ec0cf..0464e17 100644 --- a/5-test/Dockerfile +++ b/5-test/Dockerfile @@ -8,4 +8,8 @@ COPY config.properties /app ADD ./5-test/tests /app/tests +RUN apt-get update + +RUN echo "Y" | apt-get install dnsutils + CMD ["bash","-c","while true; do date; sleep 10; done"] diff --git a/5-test/deployment-yaml.template b/5-test/deployment-yaml.template index 992b4e6..564a420 100644 --- a/5-test/deployment-yaml.template +++ b/5-test/deployment-yaml.template @@ -17,27 +17,12 @@ spec: app: ${instance_name} spec: nodeSelector: - node.kubernetes.io/instance-type: "${test_instance_type}" + beta.kubernetes.io/instance-type: "${test_instance_type}" containers: - name: main image: "${registry}${test_image_name}${test_image_tag}" command: ["bash","-c","${cmd_pod}"] imagePullPolicy: Always - env: - - name: runtime - value: "$runtime" - - name: num_servers - value: "$num_servers" - - name: num_models - value: "$num_models" - - name: app_name - value: "$app_name" - - name: namespace - value: "$namespace" - - name: num_requests - value: "$num_requests" - - name: request_frequency - value: "$request_frequency" resources: limits: cpu: 1 diff --git a/5-test/job-yaml.template b/5-test/job-yaml.template index 36b231c..c655e98 100644 --- a/5-test/job-yaml.template +++ b/5-test/job-yaml.template @@ -14,28 +14,13 @@ spec: app: ${instance_name} spec: nodeSelector: - node.kubernetes.io/instance-type: "${test_instance_type}" + beta.kubernetes.io/instance-type: "${test_instance_type}" restartPolicy: Never containers: - name: main image: "${registry}${test_image_name}${test_image_tag}" command: ["bash","-c","${cmd_pod}"] imagePullPolicy: Always - env: - - name: runtime - value: "$runtime" - - name: num_servers - value: "$num_servers" - - name: num_models - value: "$num_models" - - name: app_name - value: "$app_name" - - name: namespace - value: "$namespace" - - name: num_requests - value: "$num_requests" - - name: request_frequency - value: "$request_frequency" resources: - requests: + limits: cpu: 1 diff --git a/5-test/run.sh b/5-test/run.sh index a93f539..a7280c5 100755 --- a/5-test/run.sh +++ b/5-test/run.sh @@ -58,7 +58,7 @@ if [ "$runtime" == "docker" ]; then elif [ "$runtime" == "kubernetes" ]; then pushd ./5-test > /dev/null if [ "$1" == "bma" ]; then - CMD="kubectl -n ${test_namespace} get pods | grep ${test_image_name}- | cut -d ' ' -f 1 | xargs -L 1 kubectl -n ${test_namespace} logs | grep { | grep -v 0.0, | tee ./bmk-all.log" + CMD="kubectl -n ${test_namespace} get pods | grep ${test_image_name}- | cut -d ' ' -f 1 | xargs -L 1 kubectl logs | grep { | grep -v 0.0, | tee ./bmk-all.log" command -v bc > /dev/null if [ "$?" == "1" ]; then echo "bc not found" @@ -91,4 +91,4 @@ elif [ "$runtime" == "kubernetes" ]; then popd > /dev/null else echo "Runtime $runtime not recognized" -fi +fi \ No newline at end of file diff --git a/5-test/tests/benchmark.sh b/5-test/tests/benchmark.sh index 9ec1a0d..682a050 100755 --- a/5-test/tests/benchmark.sh +++ b/5-test/tests/benchmark.sh @@ -5,21 +5,14 @@ # SPDX-License-Identifier: MIT-0 # ###################################################################### -if [ "$num_servers" == "" ]; then - - echo "Configuring number of model servers from config.properties ..." - - if [ -f ../config.properties ]; then - source ../config.properties - elif [ -f ../../config.properties ]; then - source ../../config.properties - elif [ -f ./config.properties ]; then - source ./config.properties - else - echo "config.properties not found!" 
- fi +if [ -f ../config.properties ]; then + source ../config.properties +elif [ -f ../../config.properties ]; then + source ../../config.properties +elif [ -f ./config.properties ]; then + source ./config.properties else - echo "Number of model servers ($num_servers) configured from environment ..." + echo "config.properties not found!" fi if [ "$runtime" == "docker" ]; then @@ -28,4 +21,4 @@ elif [ "$runtime" == "kubernetes" ]; then python benchmark_client.py --num_thread 2 --url http://${app_name}-[INSTANCE_IDX].${namespace}.svc.cluster.local:8080/predictions/model[MODEL_IDX] --is_multi_instance --n_instance ${num_servers} --is_multi_model_per_instance --n_model_per_instance ${num_models} --latency_window_size 1000 --cache_dns else echo "Runtime $runtime not recognized" -fi +fi \ No newline at end of file diff --git a/5-test/tests/curl-rnd-ip.sh b/5-test/tests/curl-rnd-ip.sh index 8e0cdcf..c65c679 100755 --- a/5-test/tests/curl-rnd-ip.sh +++ b/5-test/tests/curl-rnd-ip.sh @@ -4,19 +4,15 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # # SPDX-License-Identifier: MIT-0 # ###################################################################### -if [ "$num_servers" == "" ]; then - echo "Configuring number of model servers from config.properties ..." - if [ -f ../config.properties ]; then - source ../config.properties - elif [ -f ../../config.properties ]; then - source ../../config.properties - elif [ -f ./config.properties ]; then - source ./config.properties - else - echo "config.properties not found!" - fi + +if [ -f ../config.properties ]; then + source ../config.properties +elif [ -f ../../config.properties ]; then + source ../../config.properties +elif [ -f ./config.properties ]; then + source ./config.properties else - echo "Configured number of model servers ($num_servers) from environment" + echo "config.properties not found!" fi server=0 @@ -26,7 +22,6 @@ models=$num_models # get instance ip addresses rm -f ./endpoint_ip.conf -echo "runtime=$runtime" while [ $server -lt $servers ] do if [ "$runtime" == "docker" ]; then @@ -57,4 +52,4 @@ do request=$((request+1)) done -rm -f ./endpoint_ip.conf +rm -f ./endpoint_ip.conf \ No newline at end of file diff --git a/5-test/tests/curl-seq-ip.sh b/5-test/tests/curl-seq-ip.sh index a9d7c73..44833e1 100755 --- a/5-test/tests/curl-seq-ip.sh +++ b/5-test/tests/curl-seq-ip.sh @@ -5,19 +5,14 @@ # SPDX-License-Identifier: MIT-0 # ###################################################################### -if [ "$num_servers" == "" ]; then - echo "Configuring number of model servers from config.properties ..." - if [ -f ../config.properties ]; then - source ../config.properties - elif [ -f ../../config.properties ]; then - source ../../config.properties - elif [ -f ./config.properties ]; then - source ./config.properties - else - echo "config.properties not found!" - fi +if [ -f ../config.properties ]; then + source ../config.properties +elif [ -f ../../config.properties ]; then + source ../../config.properties +elif [ -f ./config.properties ]; then + source ./config.properties else - echo "Configured number of model servers ($num_servers) from environment" + echo "config.properties not found!" 
fi server=0 @@ -27,15 +22,12 @@ models=$num_models # get server ip addresses rm -f ./endpoint_ip.conf -echo "runtime=$runtime" while [ $server -lt $servers ] do if [ "$runtime" == "docker" ]; then instance_ip=$(cat /etc/hosts | grep ${app_name}-${server} | awk '{print $1}') elif [ "$runtime" == "kubernetes" ]; then - #echo "host=${app_name}-${server}.${namespace}.svc.cluster.local" instance_ip=$(host ${app_name}-${server}.${namespace}.svc.cluster.local | grep "has address" | cut -d ' ' -f 4) - #echo "instance_ip=$instance_ip" fi echo $instance_ip >> endpoint_ip.conf server=$((server+1)) @@ -60,4 +52,4 @@ do server=$((server+1)) done -rm -f ./endpoint_ip.conf +rm -f ./endpoint_ip.conf \ No newline at end of file diff --git a/README.md b/README.md index 6094a46..5e20617 100644 --- a/README.md +++ b/README.md @@ -6,34 +6,14 @@ enables hybrid deployments where the best processor/accelerator is used to serve In this sample repository, we use a [bert-base](https://huggingface.co/distilbert-base-multilingual-cased) NLP model from [huggingface.co](https://huggingface.co/), however the project structure and workflow is generic and can be adapted for use with other models.
+
diff --git a/build.sh b/build.sh
index c79b11a..db9b917 100755
--- a/build.sh
+++ b/build.sh
@@ -28,15 +28,29 @@ if [ "$action" == "" ]; then
echo "Building base container ..."
echo ""
- dockerfile=./1-build/Dockerfile-base-${processor}
- if [ -f $dockerfile ]; then
- echo " ... base-${processor} ..."
- docker build -t ${registry}${base_image_name}${base_image_tag} -f $dockerfile .
- else
- echo "Dockerfile $dockerfile was not found."
- echo "Please ensure that processor is configured with a supported value in config.properties"
- exit 1
- fi
+ case "$processor" in
+ "cpu")
+ echo " ... base-cpu ..."
+ docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-cpu .
+ ;;
+ "gpu")
+ echo " ... base-gpu ..."
+ docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-gpu .
+ ;;
+ "inf1")
+ echo " ... base-inf1 ..."
+ docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-inf1 .
+ ;;
+ "inf2")
+ echo " ... base-inf2 ..."
+ docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-inf2 .
+ ;;
+ *)
+ echo "Please ensure cpu, gpu, inf1 or inf2 is configure as processor in config.properties"
+ exit 1
+ ;;
+ esac
+
elif [ "$action" == "push" ]; then
./1-build/push.sh
elif [ "$action" == "pull" ]; then
diff --git a/config.properties b/config.properties
index f47a489..cba5da6 100644
--- a/config.properties
+++ b/config.properties
@@ -8,15 +8,21 @@
######################################################################
# Model settings
-huggingface_model_name=bert-base-multilingual-cased
-huggingface_tokenizer_class=BertTokenizer
-huggingface_model_class=BertForQuestionAnswering
+huggingface_model_name=llama-2-13b-hf
+#huggingface_model_name=llamav2_7b_converted
+huggingface_tokenizer_class=AutoTokenizer
+huggingface_model_class=AutoModelForCausalLM
+
+# Neuron setting
+neuron_model_class=LlamaForSampling
+tp_degree=2
+amp_type=bf16
# Compiler settings
-# processor = cpu|gpu|inf1|inf2|graviton
-processor=graviton
+# processor = cpu|gpu|inf1|inf2|arm
+processor=inf2
pipeline_cores=1
-sequence_length=128
+sequence_length=256
batch_size=1
test=True
@@ -24,7 +30,7 @@ test=True
account=$(aws sts get-caller-identity --query Account --output text)
# region is used to login if the registry is ecr
-region=us-east-1
+region=us-west-2
# Container settings
# Default is the private ECR registry in the current AWS account.
@@ -33,15 +39,13 @@ region=us-east-1
registry=${account}.dkr.ecr.${region}.amazonaws.com/
# registry_type=ecr
registry_type=ecr
-base_image_name=aws-do-inference-base
-base_image_tag=:v10-${processor}
-model_image_name=${huggingface_model_name}
-model_image_tag=:v10-${processor}
-
-# if using pre-built public registry image (may require authentication) use the following settings
-#registry=public.ecr.aws/a2u7h5w3
-#model_image_name=bert-base-workshop
-#model_image_tag=:v10-${processor}
+#base_image_name=aws-do-inference-base
+#base_image_name=llama2container
+base_image_name=base-${processor}
+#base_image_tag=:v9-${processor}
+base_image_tag=:v1
+model_image_name=${huggingface_model_name}-${processor}
+model_image_tag=:v1
# Trace settings
# trace_opts_$processor is a processor-specific setting used by the docker run command in the trace.sh script
@@ -50,8 +54,7 @@ trace_opts_cpu=""
trace_opts_gpu="--gpus 0"
trace_opts_inf1="-e AWS_NEURON_VISIBLE_DEVICES=ALL --privileged"
trace_opts_inf2="-e AWS_NEURON_VISIBLE_DEVICES=ALL --privileged"
-trace_opts_graviton=""
-
+trace_opts_arm=""
# Deployment settings
# some of these settings apply only when the runtime is kubernetes
# runtime = docker | kubernetes
@@ -65,11 +68,11 @@ postprocess=True
# service_port=8080 - port on which model service will be exposed
service_port=8080
# Kubernetes-specific deployment settings
-# instance_type = c5.xxx | g4dn.xlarge | g4dn.12xlarge | inf1.xlarge | inf2.8xlarge | c7g.4xlarge...
+# instance_type = c5.xxx | g4dn.xlarge | g4dn.12xlarge | inf1.xlarge | inf1.6xlarge | inf2.8xlarge | ...
# A node group with the specified instance_type must exist in the cluster
# The instance type must have the processor configured above
-# Example: processor=graviton, instance_type=c7g.4xlarge
-instance_type=c7g.4xlarge
+# Example: processor=arm, instance_type=c7g.4xlarge
+instance_type=c5.4xlarge
# num_servers - number of model servers to deploy
# note that more than one model server can run on a node with multiple cpu/gpu/inferentia chips.
# example: 4 model servers fit on one inf1.6xlarge instance as it has 4 inferentia chips.
@@ -80,24 +83,18 @@ namespace=mpi
app_name=${huggingface_model_name}-${processor}
app_dir=app-${app_name}-${instance_type}
-# Test image settings
+# Test settings
test_image_name=test-${huggingface_model_name}
-test_image_tag=:v10-cpu
-
-#when using pre-built test image available in public ECR registry (may require authentication):
-#registry=public.ecr.aws/a2u7h5w3/
-#test_image_name=bert-base-workshop
-#test_image_tag=:test-v10-cpu
-
+test_image_tag=:v9-cpu
# request_frequency - time to sleep between two consecutive requests in curl tests
request_frequency=0.01
# Stop random request test after num_requests number of requests
num_requests=30
# Number of test containers to launch (default=1), use > 1 for scale testing
num_test_containers=1
-# test_instance_type - when runtime is kubernetes, node instance type on which test pods will run
-test_instance_type=c5.4xlarge
+# test_instance_type - when runtime is kubernetes, instance type on which test pods will run
+test_instance_type=c5.xlarge
# test_namespace - when runtime is kubernetes, namespace where test pods will be created
test_namespace=mpi
-# test_dir - when runtime is kubernetes, directory where test job/pod manifests are stored
+# test_dir - when runtime is kubernetes, directory where test pod manifests are stored
test_dir=app-${test_image_name}-${instance_type}
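Note: config.properties has no INI section headers, so the Python entry points in 2-trace and 3-pack prepend a synthetic [global] section before handing the file to ConfigParser, while the shell scripts simply load it with the shell builtin source. A minimal sketch of that pattern using the settings added above (the repo-root-relative path is an assumption):

    # Read config.properties the way model-tracer.py and fastapi-server.py do:
    # prepend a synthetic [global] header so ConfigParser accepts the file.
    from configparser import ConfigParser

    with open('config.properties') as f:
        config_lines = '[global]\n' + f.read()

    config = ConfigParser()
    config.read_string(config_lines)

    processor       = config['global']['processor']            # inf2
    tp_degree       = int(config['global']['tp_degree'])       # new Neuron setting
    amp_type        = config['global']['amp_type']             # bf16
    sequence_length = int(config['global']['sequence_length']) # 256
    batch_size      = int(config['global']['batch_size'])      # 1
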
diff --git a/k8s-neuron-device-plugin-rbac.yml b/k8s-neuron-device-plugin-rbac.yml
new file mode 100644
index 0000000..ae30e52
--- /dev/null
+++ b/k8s-neuron-device-plugin-rbac.yml
@@ -0,0 +1,59 @@
+# rbac.yaml
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+ name: neuron-device-plugin
+rules:
+- apiGroups:
+ - ""
+ resources:
+ - nodes
+ verbs:
+ - get
+ - list
+ - watch
+- apiGroups:
+ - ""
+ resources:
+ - events
+ verbs:
+ - create
+ - patch
+- apiGroups:
+ - ""
+ resources:
+ - pods
+ verbs:
+ - update
+ - patch
+ - get
+ - list
+ - watch
+- apiGroups:
+ - ""
+ resources:
+ - nodes/status
+ verbs:
+ - patch
+ - update
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: neuron-device-plugin
+ namespace: kube-system
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+ name: neuron-device-plugin
+ namespace: kube-system
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: neuron-device-plugin
+subjects:
+- kind: ServiceAccount
+ name: neuron-device-plugin
+ namespace: kube-system
diff --git a/k8s-neuron-device-plugin.yml b/k8s-neuron-device-plugin.yml
new file mode 100644
index 0000000..25b43ad
--- /dev/null
+++ b/k8s-neuron-device-plugin.yml
@@ -0,0 +1,98 @@
+# https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: neuron-device-plugin-daemonset
+ namespace: kube-system
+spec:
+ selector:
+ matchLabels:
+ name: neuron-device-plugin-ds
+ updateStrategy:
+ type: RollingUpdate
+ template:
+ metadata:
+ annotations:
+ scheduler.alpha.kubernetes.io/critical-pod: ""
+ labels:
+ name: neuron-device-plugin-ds
+ spec:
+ serviceAccount: neuron-device-plugin
+ tolerations:
+ - key: CriticalAddonsOnly
+ operator: Exists
+ - key: aws.amazon.com/neuron
+ operator: Exists
+ effect: NoSchedule
+ # Mark this pod as a critical add-on; when enabled, the critical add-on
+ # scheduler reserves resources for critical add-on pods so that they can
+ # be rescheduled after a failure.
+ # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+ priorityClassName: "system-node-critical"
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: "beta.kubernetes.io/instance-type"
+ operator: In
+ values:
+ - inf1.xlarge
+ - inf1.2xlarge
+ - inf1.6xlarge
+ - inf1.24xlarge
+ - inf2.xlarge
+ - inf2.4xlarge
+ - inf2.8xlarge
+ - inf2.24xlarge
+ - inf2.48xlarge
+ - trn1.2xlarge
+ - trn1.32xlarge
+ - trn1n.32xlarge
+ - matchExpressions:
+ - key: "node.kubernetes.io/instance-type"
+ operator: In
+ values:
+ - inf1.xlarge
+ - inf1.2xlarge
+ - inf1.6xlarge
+ - inf1.24xlarge
+ - inf2.xlarge
+ - inf2.4xlarge
+ - inf2.8xlarge
+ - inf2.24xlarge
+ - inf2.48xlarge
+ - trn1.2xlarge
+ - trn1.32xlarge
+ - trn1n.32xlarge
+ containers:
+ #Device Plugin containers are available both in us-east and us-west ecr
+ #repos
+ - image: public.ecr.aws/neuron/neuron-device-plugin:2.16.18.0
+ imagePullPolicy: Always
+ name: neuron-device-plugin
+ env:
+ - name: KUBECONFIG
+ value: /etc/kubernetes/kubelet.conf
+ - name: NODE_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: spec.nodeName
+ securityContext:
+ allowPrivilegeEscalation: false
+ capabilities:
+ drop: ["ALL"]
+ volumeMounts:
+ - name: device-plugin
+ mountPath: /var/lib/kubelet/device-plugins
+ - name: infa-map
+ mountPath: /run
+ volumes:
+ - name: device-plugin
+ hostPath:
+ path: /var/lib/kubelet/device-plugins
+ - name: infa-map
+ hostPath:
+ path: /run
+
+
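Note: the device plugin only advertises aws.amazon.com/neuron resources to the scheduler; the model pod still has to request them (see 4-deploy/inf2-yaml.template), and the Neuron SDK inside the container does the rest. A minimal in-pod sanity check, mirroring the runtime detection already used in 3-pack/fastapi-server.py and assuming the inf2 base image:

    # Verify from inside a scheduled pod that the Neuron runtime is visible.
    # Mirrors the try/except device detection in fastapi-server.py.
    import os

    try:
        import torch_neuronx  # present only when the Neuron SDK is installed
        device_type = 'inf2'
    except ImportError:
        device_type = 'cpu'

    print(f"device_type={device_type}, "
          f"NEURON_RT_NUM_CORES={os.environ.get('NEURON_RT_NUM_CORES', 'unset')}")
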
diff --git a/low-latency-high-throughput-inference-on-amazon-eks.png b/low-latency-high-throughput-inference-on-amazon-eks.png
deleted file mode 100644
index a22e366..0000000
Binary files a/low-latency-high-throughput-inference-on-amazon-eks.png and /dev/null differ
diff --git a/pack.sh b/pack.sh
index c82a103..118bd15 100755
--- a/pack.sh
+++ b/pack.sh
@@ -25,7 +25,7 @@ source ./config.properties
action=$1
if [ "$action" == "" ]; then
- model_file_name=${huggingface_model_name}_bs${batch_size}_seq${sequence_length}_pc${pipeline_cores}_${processor}.pt
+ model_file_name=${huggingface_model_name}_${processor}_bs${batch_size}_seq${sequence_length}_pc${pipeline_cores}.pt
docker build -t ${registry}${model_image_name}${model_image_tag} --build-arg BASE_IMAGE=${registry}${base_image_name}${base_image_tag} \
--build-arg MODEL_NAME=${huggingface_model_name} --build-arg MODEL_FILE_NAME=${model_file_name} --build-arg PROCESSOR=${processor} \
diff --git a/trace.sh b/trace.sh
index 4decc5a..b75ebae 100755
--- a/trace.sh
+++ b/trace.sh
@@ -19,18 +19,27 @@ print_help() {
if [ "$1" == "" ]; then
source ./config.properties
echo ""
- echo "Tracing model: $huggingface_model_name ..."
-
- dockerfile=./1-build/Dockerfile-base-${processor}
+ echo "Tracing model $huggingface_model_name ..."
+
echo ""
- if [ -f $dockerfile ]; then
- echo " ... for processor: $processor ..."
- trace_opts=trace_opts_${processor}
- docker run ${!trace_opts} -it --rm -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py"
- else
- echo "Processor $processor is not supported. Please ensure the processor setting in config.properties is configured properly"
- exit 1
- fi
+ case "$processor" in
+ "cpu")
+ echo " ... for cpu ..."
+ docker run -it --rm -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py"
+ ;;
+ "gpu")
+ echo " ... for gpu ..."
+ docker run --gpus 0 -it --rm -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py"
+ ;;
+ "inf")
+ echo " ... for inf ..."
+ docker run -it --rm -e AWS_NEURON_VISIBLE_DEVICES=ALL --privileged -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py"
+ ;;
+ *)
+ echo "Please ensure cpu, gpu, or inf is configure as processor in config.properties"
+ exit 1
+ ;;
+ esac
else
print_help
fi
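
Note: with processor=inf2 the trace step no longer emits a TorchScript .pt file; the rewritten 2-trace/model-tracer.py splits the Hugging Face checkpoint and compiles it with transformers-neuronx, and 3-pack/fastapi-server.py repeats the compile at load time. A condensed sketch of that flow, using the values hard-coded in the new tracer (the tokenizer location follows the COPY ./llama_model/tokenizer* line in 3-pack/Dockerfile and is an assumption here):

    # Condensed from the new 2-trace/model-tracer.py and 3-pack/fastapi-server.py.
    import os
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers_neuronx.llama.model import LlamaForSampling
    from transformers_neuronx.module import save_pretrained_split

    tp_degree, batch_size, sequence_length, amp_type = 2, 1, 256, 'bf16'
    os.environ['NEURON_CC_FLAGS'] = '--model-type=transformer-inference'
    os.environ['NEURON_RT_NUM_CORES'] = str(tp_degree)

    model_dir = '/app/llama_model'                        # Hugging Face format checkpoint
    serialized_model_dir = os.path.join(model_dir, 'serialized')
    os.makedirs(serialized_model_dir, exist_ok=True)

    # Split the checkpoint into the per-layer format transformers-neuronx expects.
    model = AutoModelForCausalLM.from_pretrained(model_dir, low_cpu_mem_usage=True,
                                                 torch_dtype=torch.float16)
    save_pretrained_split(model, serialized_model_dir)

    # Compile for NeuronCores and run a sample generation, as the server's warmup does.
    neuron_model = LlamaForSampling.from_pretrained(serialized_model_dir, tp_degree=tp_degree,
                                                    batch_size=batch_size, amp=amp_type)
    neuron_model.to_neuron()

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    tokens = tokenizer(['My name is Mike and'], return_tensors='pt')
    generated = neuron_model.sample(tokens.input_ids, sequence_length=sequence_length, top_k=50)
    print([tokenizer.decode(seq) for seq in generated])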