diff --git a/1-build/*.py/config.properties b/1-build/*.py/config.properties new file mode 100755 index 0000000..e69de29 diff --git a/1-build/Dockerfile-base-arm b/1-build/Dockerfile-base-arm deleted file mode 100644 index fe34d5d..0000000 --- a/1-build/Dockerfile-base-arm +++ /dev/null @@ -1,9 +0,0 @@ -FROM python:3.9 - -LABEL description="Base container for CPU models running on ARM architecture processors" - -RUN apt-get update && apt-get install -y htop dnsutils bc vim - -RUN pip install torch configparser transformers - -RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc diff --git a/1-build/Dockerfile-base-graviton b/1-build/Dockerfile-base-graviton deleted file mode 100644 index 76eca36..0000000 --- a/1-build/Dockerfile-base-graviton +++ /dev/null @@ -1,9 +0,0 @@ -FROM python:3.9 - -LABEL description="Base container for CPU models running on Graviton architecture processors" - -RUN apt-get update && apt-get install -y htop dnsutils bc vim - -RUN pip install torch configparser transformers - -RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc diff --git a/1-build/Dockerfile-base-inf b/1-build/Dockerfile-base-inf new file mode 100644 index 0000000..5919bd9 --- /dev/null +++ b/1-build/Dockerfile-base-inf @@ -0,0 +1,19 @@ +FROM amazonlinux:2 + +LABEL description="Base container for Inferentia1 models" +ENV PYTHONUNBUFFERED=TRUE +ENV PYTHONDONTWRITEBYTECODE=TRUE +ADD ./1-build/etc /etc +RUN echo -e '[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0\n' >> /etc/yum.repos.d/neuron.repo +RUN rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB +RUN yum update -y && \ + yum install -y python3 python3-devel gcc-c++ && \ + yum install -y tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \ + yum install -y aws-neuronx-tools-2.* +RUN pip3 install --upgrade --force-reinstall --no-cache-dir neuron-cc[tensorflow] torch-neuron transformers==4.2.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com +RUN pip3 install --no-cache-dir torchserve==0.3.0 torch-model-archiver==0.3.0 configparser +RUN alternatives --install /usr/bin/python python /usr/bin/python3 1; alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 +RUN echo "export PATH=/opt/aws/neuron/bin:$PATH" >> /root/.bashrc +RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc +ADD ./1-build/*.py /app/ + diff --git a/1-build/Dockerfile-base-inf2 b/1-build/Dockerfile-base-inf2 index 9523b07..959532a 100644 --- a/1-build/Dockerfile-base-inf2 +++ b/1-build/Dockerfile-base-inf2 @@ -1,42 +1,21 @@ FROM amazonlinux:2 - -LABEL description="Base container for Inferentia2 models" + +LABEL description="Base container for Inferentia1 models" ENV PYTHONUNBUFFERED=TRUE ENV PYTHONDONTWRITEBYTECODE=TRUE ADD ./1-build/etc /etc -# Neuron SDK components version numbers -ARG NEURONX_RUNTIME_LIB_VERSION=2.16.* -ARG NEURONX_COLLECTIVES_LIB_VERSION=2.16.* -ARG NEURONX_TOOLS_VERSION=2.13.* -ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.10.* -ARG NEURONX_TRANSFORMERS_VERSION=0.6.* -ARG NEURONX_CC_VERSION=2.9.* -ARG TORCHSERVE_VERSION=0.8.2 - RUN echo -e '[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0\n' >> /etc/yum.repos.d/neuron.repo RUN rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB -RUN amazon-linux-extras install -y python3.8 RUN yum update -y && \ - yum install -y git tar gzip ca-certificates 
procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \ - yum install -y gcc-c++ && \ - yum install -y jq java-11-amazon-corretto-headless # for torchserve -RUN yum install -y aws-neuronx-collectives-${NEURONX_COLLECTIVES_LIB_VERSION} && \ - yum install -y aws-neuronx-runtime-lib-${NEURONX_RUNTIME_LIB_VERSION} && \ - yum install -y aws-neuronx-tools-${NEURONX_TOOLS_VERSION} -ENV PATH="/opt/aws/neuron/bin:${PATH}" -RUN echo 'alias python=python3.8' >> ~/.bashrc -RUN echo 'alias pip=pip3.8' >> ~/.bashrc -RUN update-alternatives --install /usr/bin/pip pip /usr/bin/pip3.8 1 - -RUN pip3.8 install --extra-index-url https://pip.repos.neuron.amazonaws.com \ - neuronx-cc==$NEURONX_CC_VERSION \ - torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ - transformers-neuronx==$NEURONX_TRANSFORMERS_VERSION -RUN pip3.8 install "protobuf<4" \ - && pip3.8 install torchserve==${TORCHSERVE_VERSION} \ - && pip3.8 install torch-model-archiver==${TORCHSERVE_VERSION} \ - && pip3.8 install --no-deps --no-cache-dir -U torchvision==0.14.* captum==0.6.0 configparser - + yum install -y python3 python3-devel gcc-c++ && \ + yum install -y tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \ + yum install -y aws-neuronx-tools-2.* +RUN yum install -y aws-neuronx-collectives-2.* && \ + yum install -y aws-neuronx-runtime-lib-2.* +RUN pip3 install --upgrade --force-reinstall --no-cache-dir neuronx-cc[tensorflow] torch-neuronx transformers==4.2.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com +RUN pip3 install --no-cache-dir torchserve==0.3.0 torch-model-archiver==0.3.0 configparser +RUN alternatives --install /usr/bin/python python /usr/bin/python3 1; alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 +RUN echo "export PATH=/opt/aws/neuron/bin:$PATH" >> /root/.bashrc RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc ADD ./1-build/*.py /app/ diff --git a/1-build/etc/hostname b/1-build/etc/hostname new file mode 100755 index 0000000..e69de29 diff --git a/1-build/etc/hosts b/1-build/etc/hosts new file mode 100755 index 0000000..e69de29 diff --git a/1-build/etc/resolv.conf b/1-build/etc/resolv.conf new file mode 100755 index 0000000..e69de29 diff --git a/2-trace/model-tracer.py b/2-trace/model-tracer.py index b350d9a..50f588c 100644 --- a/2-trace/model-tracer.py +++ b/2-trace/model-tracer.py @@ -1,127 +1,30 @@ -###################################################################### -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # -# SPDX-License-Identifier: MIT-0 # -###################################################################### - -import platform -import torch +import os import importlib +import torch from configparser import ConfigParser - -machine=platform.uname().machine -device_type='cpu' -if machine == 'aarch64': - device_type='arm' - -try: - import torch_neuron - device_type='inf1' -except ImportError: - print('[WARN] Torch Neuron not Found') - pass -try: - import torch_neuronx - device_type='inf2' -except ImportError: - print('[WARN] Torch Neuronx not Found') - pass - -import os - -# 1. 
READ config.properties -print("\nParsing configuration ...") -path_prefix = os.getcwd() -with open(path_prefix + '/../config.properties') as f: - config_lines = '[global]\n' + f.read() - f.close() -config = ConfigParser() -config.read_string(config_lines) - -model_name = config['global']['huggingface_model_name'] -tokenizer_class_name = config['global']['huggingface_tokenizer_class'] -model_class_name = config['global']['huggingface_model_class'] -sequence_length=int(config['global']['sequence_length']) -processor=config['global']['processor'] -pipeline_cores=config['global']['pipeline_cores'] -batch_size=int(config['global']['batch_size']) -test=config['global']['test'] - -question = "What does the little engine say?" - -context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. - Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story - about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is - pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could.""" - - -# 2. LOAD PRE-TRAINED MODEL -print(f'\nLoading pre-trained model: {model_name}') -transformers = importlib.import_module("transformers") -tokenizer_class = getattr(transformers, tokenizer_class_name) -model_class = getattr(transformers, model_class_name) -tokenizer = tokenizer_class.from_pretrained(model_name) -model = model_class.from_pretrained(model_name, return_dict=False) - -# 3. TOKENIZE THE INPUT -print('\nTokenizing input sample ...') -inputs = tokenizer.encode_plus(question, - context, - return_tensors="pt", - max_length=sequence_length, - padding='max_length', - truncation=True) -if device_type not in ['inf1', 'inf2']: - if torch.cuda.is_available(): - device = torch.device("cuda") - device_type = "gpu" - model.to(device) - inputs.to(device) - else: - device = torch.device("cpu") - -if device_type == processor: - print(f" ... Using device: {device_type}") -else: - print(f"[WARN] detected device_type ({device_type}) does not match the configured processor ({processor})") - -# 2. COMPILE THE MODEL -print('\nTracing model ...') -example_inputs = ( - torch.cat([inputs['input_ids']] * batch_size,0), - torch.cat([inputs['attention_mask']] * batch_size,0) -) -os.makedirs(f'traced-{model_name}', exist_ok=True) -torch.set_num_threads(6) -if 'inf' == processor: - model_traced = torch.neuron.trace(model, - example_inputs, - verbose=1, - compiler_workdir=f'./traced-{model_name}/compile_wd_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}', - compiler_args = ['--neuroncore-pipeline-cores', str(pipeline_cores)]) -elif 'inf2' == processor: - model_traced = torch_neuronx.trace(model, - example_inputs) -else: - model_traced = torch.jit.trace(model, example_inputs) - -# 3. 
TEST THE COMPILED MODEL (Optional) -if test.lower() == 'true': - print("\nTesting traced model ...") - print(f"Question: {question}") - # Testing the traced model - answer_logits = model_traced(*example_inputs) - answer_start = answer_logits[0].argmax().item() - answer_end = answer_logits[1].argmax().item()+1 - answer_txt = "" - if answer_end > answer_start: - answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])) - else: - answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:])) - print(f'Model Answer: {answer_txt}') - -# 4. SAVE THE COMPILED MODEL -print('\nSaving traced model ...') -model_path=f'./traced-{model_name}/{model_name}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}_{processor}.pt' -model_traced.save(model_path) - -print(f'Done. Model saved as: {model_path}') +from transformers_neuronx.llama.model import LlamaForSampling +from transformers import AutoModelForCausalLM +from transformers_neuronx.module import save_pretrained_split +tp_degree = 2 +batch_size = 1 +sequence_length = 256 +amp_type = 'bf16' +os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference" +os.environ['NEURON_RT_NUM_CORES'] = str(tp_degree) +os.environ["NEURONX_CACHE"]= "on" +os.environ["NEURONX_DUMP_TO"] = f"./neuron_cache/tp{tp_degree}_bs{batch_size}_seqlen{sequence_length}" +# create a directory for model +model_dir = "/app/llama_model" # hugging face format +os.makedirs(model_dir, exist_ok=True) +# initialize the model +model = AutoModelForCausalLM.from_pretrained(model_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16) +# serialize the model +serialized_model_dir = os.path.join(model_dir, 'serialized') +os.makedirs(serialized_model_dir, exist_ok=True) +save_pretrained_split(model, serialized_model_dir) +# create neuron model +#transformers_neuronx = importlib.import_module("transformers_neuronx") +#neuron_model_class = getattr(transformers_neuronx, neuron_model_class_name) +neuron_model = LlamaForSampling.from_pretrained(serialized_model_dir, tp_degree=tp_degree, batch_size=batch_size, amp=amp_type) +# compile model for neuron +neuron_model.to_neuron() diff --git a/2-trace/old_model-tracer.py b/2-trace/old_model-tracer.py new file mode 100644 index 0000000..4fb6dde --- /dev/null +++ b/2-trace/old_model-tracer.py @@ -0,0 +1,124 @@ +###################################################################### +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # +# SPDX-License-Identifier: MIT-0 # +###################################################################### + +import torch +import importlib +from configparser import ConfigParser + +device_type='cpu' + +try: + import torch_neuron + device_type='inf1' +except ImportError: + print('[WARN] Torch Neuron not Found') + pass +try: + import torch_neuronx + device_type='inf2' +except ImportError: + print('[WARN] Torch Neuronx not Found') + pass + +import os + +# 1. 
READ config.properties +print("\nParsing configuration ...") +path_prefix = os.getcwd() +with open(path_prefix + '/../config.properties') as f: + config_lines = '[global]\n' + f.read() + f.close() +config = ConfigParser() +config.read_string(config_lines) + +model_name = config['global']['huggingface_model_name'] +tokenizer_class_name = config['global']['huggingface_tokenizer_class'] +model_class_name = config['global']['huggingface_model_class'] +sequence_length=int(config['global']['sequence_length']) +processor=config['global']['processor'] +pipeline_cores=config['global']['pipeline_cores'] +batch_size=int(config['global']['batch_size']) +test=config['global']['test'] + +question = "What does the little engine say?" + +context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. + Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story + about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is + pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could.""" + + +# 2. LOAD PRE-TRAINED MODEL +print(f'\nLoading pre-trained model: {model_name}') +transformers = importlib.import_module("transformers") +tokenizer_class = getattr(transformers, tokenizer_class_name) +model_class = getattr(transformers, model_class_name) +tokenizer = tokenizer_class.from_pretrained(model_name) +model = model_class.from_pretrained(model_name, return_dict=False) + +# 3. TOKENIZE THE INPUT +print('\nTokenizing input sample ...') +inputs = tokenizer.encode_plus(question, + context, + return_tensors="pt", + max_length=sequence_length, + padding='max_length', + truncation=True) +if device_type not in ['inf1', 'inf2']: + if torch.cuda.is_available(): + device = torch.device("cuda") + device_type = "gpu" + model.to(device) + inputs.to(device) + else: + device = torch.device("cpu") + device_type = 'cpu' + +if device_type == processor: + print(f" ... Using device: {device_type}") +else: + print(f"[WARN] detected device_type ({device_type}) does not match the configured processor ({processor})") + +# 2. COMPILE THE MODEL +print('\nTracing model ...') +example_inputs = ( + torch.cat([inputs['input_ids']] * batch_size,0), + torch.cat([inputs['attention_mask']] * batch_size,0) +) +os.makedirs(f'traced-{model_name}', exist_ok=True) +torch.set_num_threads(6) +if 'inf' in processor: + model_traced = torch.neuron.trace(model, + example_inputs, + verbose=1, + compiler_workdir=f'./traced-{model_name}/compile_wd_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}', + compiler_args = ['--neuroncore-pipeline-cores', str(pipeline_cores)]) +elif 'inf2' in processor: + model_traced = torch_neuronx.trace(model, + example_inputs) +else: + model_traced = torch.jit.trace(model, example_inputs) + +# 3. 
TEST THE COMPILED MODEL (Optional) +if test.lower() == 'true': + print("\nTesting traced model ...") + print(f"Question: {question}") + # Testing the traced model + answer_logits = model_traced(*example_inputs) + answer_start = answer_logits[0].argmax().item() + answer_end = answer_logits[1].argmax().item()+1 + answer_txt = "" + if answer_end > answer_start: + answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])) + else: + answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:])) + print(f'Model Answer: {answer_txt}') + +# 4. SAVE THE COMPILED MODEL +print('\nSaving traced model ...') +model_path=f'./traced-{model_name}/{model_name}_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}.pt' +model_traced.save(model_path) + +print(f'Done. Model saved as: {model_path}') diff --git a/3-pack/Dockerfile b/3-pack/Dockerfile index f505e40..a90c0ce 100644 --- a/3-pack/Dockerfile +++ b/3-pack/Dockerfile @@ -18,12 +18,22 @@ COPY ./3-pack/run.sh /app/server/run.sh COPY ./3-pack/requirements.txt /app/server/requirements.txt -COPY ./2-trace/traced-${MODEL_NAME}/${MODEL_FILE_NAME} /app/server/models +COPY ./llama_model/serialized /app/server/models/serialized + +#COPY ./$HOME/llamav2_13b_converted/serialized /app/server/models/serialized + +COPY ./2-trace/neuron_cache/tp2_bs1_seqlen256 /app/server/models/tp2_bs1_seqlen256 + +COPY ./llama_model/tokenizer* /app/server/models/ + +#COPY ./$HOME/llamav2_13b_converted/tokenizer* /app/server/models/ RUN pip install -r /app/server/requirements.txt +RUN pip install python-multipart + WORKDIR /app/server EXPOSE 8080 -CMD ["./run.sh"] \ No newline at end of file +CMD ["./run.sh"] diff --git a/3-pack/fastapi-server.py b/3-pack/fastapi-server.py index 2e4108e..63d7eb3 100644 --- a/3-pack/fastapi-server.py +++ b/3-pack/fastapi-server.py @@ -9,6 +9,8 @@ import torch, os, logging import importlib import platform +from transformers import AutoTokenizer +from transformers_neuronx.llama.model import LlamaForSampling global device global processor @@ -19,6 +21,7 @@ global postprocess global default_question, default_context + logger = logging.getLogger() # Read static configuration from config.properties @@ -30,22 +33,18 @@ config = ConfigParser() config.read_string(config_lines) model_name = config['global']['huggingface_model_name'] -tokenizer_class_name = config['global']['huggingface_tokenizer_class'] +tokenizer_class_name = config['global']['huggingface_tokenizer_class'] model_class_name = config['global']['huggingface_model_class'] -sequence_length=config['global']['sequence_length'] +neuron_model_class_name = config['global']['neuron_model_class'] +sequence_length=int(config['global']['sequence_length']) processor=config['global']['processor'] -pipeline_cores=config['global']['pipeline_cores'] -batch_size=config['global']['batch_size'] -default_question = "What does the little engine say" -default_context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. - Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story - about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is - pulling the heavy load all the way to the top of the mountain. 
On the way down it says: I thought I could.""" +pipeline_cores=int(config['global']['pipeline_cores']) +batch_size=int(config['global']['batch_size']) +default_prompts = ["My name is Mike and"]*batch_size +tp_degree=int(config['global']['tp_degree']) +amp_type=config['global']['amp_type'] # Read runtime configuration from environment -postprocess=True -if (os.getenv("POSTPROCESS",'True').lower() in ['false','0']): - postprocess=False quiet=False if (os.getenv("QUIET","False").lower() in ['true','1']): quiet=True @@ -56,7 +55,7 @@ logger.warning(f"Failed to parse environment variable NUM_MODELS={os.getenv('NUM_MODELS')}") logger.warning("Please ensure if set NUM_MODELS is a numeric value. Assuming value of 1") -# Detect runtime device type inf1, inf2, gpu, cpu, or arm +# Detect runtime device type inf2, gpu, cpu, or arm device_type="" try: @@ -101,54 +100,67 @@ async def read_root(): # Model inference API endpoint @app.get("/predictions/{model_id}") -async def infer(model_id, seq_0: Optional[str] = default_question, seq_1: Optional[str] = default_context): - question=seq_0 - context=seq_1 +async def infer(model_id, seqs: Optional[list] = default_prompts): + prompts=seqs status=200 if model_id in models.keys(): if not quiet: - logger.warning(f"\nQuestion: {question}\n") - tokenizer=tokenizers[model_id] - encoded_input = tokenizer.encode_plus(question, context, return_tensors='pt', max_length=128, padding='max_length', truncation=True) - if processor=='gpu': - encoded_input.to(device) - model=models[model_id] - model_input = (encoded_input['input_ids'], encoded_input['attention_mask']) - output=model(*model_input) # This is specific to Inferentia - answer_text = str(output[0]) - if postprocess: - answer_start = torch.argmax(output[0]) - answer_end = torch.argmax(output[1])+1 - if (answer_end > answer_start): - answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:answer_end])) - else: - answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:])) + logger.warning(f"\nQuestion: {prompts}\n") + + tokenizer = tokenizers[model_id] + tokens = tokenizer(prompts, return_tensors="pt") + neuron_model=models[model_id] + generated_sequences = neuron_model.sample(tokens.input_ids, sequence_length=sequence_length, top_k=50) + generated_sequences = [tokenizer.decode(seq) for seq in generated_sequences] + if not quiet: logger.warning("\nAnswer: ") - logger.warning(answer_text) + logger.warning(generated_sequences) else: status=404 - answer_text = f"Model {model_id} does not exist. Try a model name up to model{num_models-1}" + generated_sequences = f"Model {model_id} does not exist. 
Try a model name up to model{num_models-1}" if not quiet: - logger.warning(answer_text) - return responses.JSONResponse(status_code=status, content={"detail": answer_text}) + logger.warning(generated_sequences) + return responses.JSONResponse(status_code=status, content={"detail": generated_sequences}) # Load models in memory and onto accelerator as needed -model_suffix = "_bs"+batch_size+"_seq"+sequence_length+"_pc"+pipeline_cores+"_"+processor -model_path=os.path.join(path_prefix,'models',model_name + model_suffix + ".pt") -logger.warning(f"Loading {num_models} instances of pre-trained model {model_name} from path {model_path} ...") +#model_suffix = "_bs"+batch_size+"_seq"+sequence_length+"_pc"+pipeline_cores+"_"+processor +#model_path=os.path.join(path_prefix,'models',model_name + model_suffix + ".pt") +#logger.warning(f"Loading {num_models} instances of pre-trained model {model_name} from path {model_path} ...") + +# set neuron environment variable +os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference" +os.environ['NEURON_RT_NUM_CORES'] = str(tp_degree) +os.environ["NEURONX_CACHE"]= "on" +os.environ["NEURONX_DUMP_TO"] = f"/app/server/models/tp{tp_degree}_bs{batch_size}_seqlen{sequence_length}" + +model_dir = "/app/server/models" # [TODO], hard-coded, to add to config.properties +tokenizer_dir = "/app/server/models" # tokenizer in the same directory as model + +serialized_model_dir = os.path.join(model_dir, 'serialized') +os.makedirs(serialized_model_dir, exist_ok=True) + tokenizers={} models={} transformers = importlib.import_module("transformers") tokenizer_class = getattr(transformers, tokenizer_class_name) +transformers_neuronx = importlib.import_module("transformers_neuronx") +#neuron_model_class = getattr(transformers_neuronx, neuron_model_class_name) + for i in range(num_models): model_id = 'model' + str(i) logger.warning(f" {model_id} ...") - tokenizers[model_id]=tokenizer_class.from_pretrained(model_name) - models[model_id] = torch.jit.load(model_path) - if device_type=='gpu': - model=models[model_id] - model.to(device) - elif device_type in ['inf1', 'inf2']: - infer(model_id, default_question, default_context) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir) + tokenizers[model_id]=tokenizer + if device_type in ['inf2']: + #models[model_id] = neuron_model_class.from_pretrained(serialized_model_dir, tp_degree=tp_degree, batch_size=batch_size, amp=amp_type) + models[model_id] = LlamaForSampling.from_pretrained(serialized_model_dir, tp_degree=tp_degree, batch_size=batch_size, amp=amp_type) + neuron_model = models[model_id] + neuron_model.to_neuron() # compile model and load weights into device memory + infer(model_id, default_prompts) logger.warning(" ... warmup completed") + else: + logger.warning(" ... inference other than inf2 needs to be added") + + + diff --git a/3-pack/old_fastapi-server.py b/3-pack/old_fastapi-server.py new file mode 100644 index 0000000..2aee0a2 --- /dev/null +++ b/3-pack/old_fastapi-server.py @@ -0,0 +1,150 @@ +###################################################################### +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# +# SPDX-License-Identifier: MIT-0 # +###################################################################### + +from typing import Optional +from fastapi import FastAPI,logger,responses +from configparser import ConfigParser +import torch, os, logging +import importlib + +global device +global processor +global device_type +global model +global tokenizer +global logger +global postprocess +global default_question, default_context + +logger = logging.getLogger() + +# Read static configuration from config.properties +logger.warning("\nParsing configuration ...") +path_prefix = os.path.dirname(__file__) +with open(path_prefix + '/../config.properties') as f: + config_lines = '[global]\n' + f.read() + f.close() +config = ConfigParser() +config.read_string(config_lines) +model_name = config['global']['huggingface_model_name'] +tokenizer_class_name = config['global']['huggingface_tokenizer_class'] +model_class_name = config['global']['huggingface_model_class'] +sequence_length=config['global']['sequence_length'] +processor=config['global']['processor'] +pipeline_cores=config['global']['pipeline_cores'] +batch_size=config['global']['batch_size'] +default_question = "What does the little engine say" +default_context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain. + Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story + about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is + pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could.""" + +# Read runtime configuration from environment +postprocess=True +if (os.getenv("POSTPROCESS",'True').lower() in ['false','0']): + postprocess=False +quiet=False +if (os.getenv("QUIET","False").lower() in ['true','1']): + quiet=True +num_models=1 +try: + num_models=int(os.getenv("NUM_MODELS", '1')) +except ValueError: + logger.warning(f"Failed to parse environment variable NUM_MODELS={os.getenv('NUM_MODELS')}") + logger.warning("Please ensure if set NUM_MODELS is a numeric value. 
Assuming value of 1") + +# Detect runtime device type inf1, inf2, gpu, or cpu +device_type="" + +try: + import torch_neuron + device_type="inf1" +except ImportError: + logger.warning("Inf1 chip not detected") + pass +try: + import torch_neuronx + device_type = 'inf2' +except ImportError: + print('[WARN] Inf2 device not found') + pass + + +if device_type in ['inf1', 'inf2']: + pass +elif torch.cuda.is_available(): + device_type="gpu" + device = torch.device("cuda") + logger.warning(torch.cuda.get_device_name(0)) +else: + device_type="cpu" + device = torch.device(device_type) + +if processor != device_type: + logger.warning(f"Configured target processor {processor} differs from actual processor {device_type}") +logger.warning(f"Running models on processor: {device_type}") + + +# FastAPI server +app = FastAPI() + +# Server healthcheck +@app.get("/") +async def read_root(): + return {"Status": "Healthy"} + +# Model inference API endpoint +@app.get("/predictions/{model_id}") +async def infer(model_id, seq_0: Optional[str] = default_question, seq_1: Optional[str] = default_context): + question=seq_0 + context=seq_1 + status=200 + if model_id in models.keys(): + if not quiet: + logger.warning(f"\nQuestion: {question}\n") + tokenizer=tokenizers[model_id] + encoded_input = tokenizer.encode_plus(question, context, return_tensors='pt', max_length=128, padding='max_length', truncation=True) + if processor=='gpu': + encoded_input.to(device) + model=models[model_id] + model_input = (encoded_input['input_ids'], encoded_input['attention_mask']) + output=model(*model_input) # This is specific to Inferentia + answer_text = str(output[0]) + if postprocess: + answer_start = torch.argmax(output[0]) + answer_end = torch.argmax(output[1])+1 + if (answer_end > answer_start): + answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:answer_end])) + else: + answer_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0][answer_start:])) + if not quiet: + logger.warning("\nAnswer: ") + logger.warning(answer_text) + else: + status=404 + answer_text = f"Model {model_id} does not exist. Try a model name up to model{num_models-1}" + if not quiet: + logger.warning(answer_text) + return responses.JSONResponse(status_code=status, content={"detail": answer_text}) + +# Load models in memory and onto accelerator as needed +model_suffix = "bs"+batch_size+"_seq"+sequence_length+"_pc"+pipeline_cores +model_path=os.path.join(path_prefix,'models',model_name + "_" + processor + "_" + model_suffix + ".pt") +logger.warning(f"Loading {num_models} instances of pre-trained model {model_name} from path {model_path} ...") +tokenizers={} +models={} +transformers = importlib.import_module("transformers") +tokenizer_class = getattr(transformers, tokenizer_class_name) +for i in range(num_models): + model_id = 'model' + str(i) + logger.warning(f" {model_id} ...") + tokenizers[model_id]=tokenizer_class.from_pretrained(model_name) + models[model_id] = torch.jit.load(model_path) + if device_type=='gpu': + model=models[model_id] + model.to(device) + elif device_type in ['inf1', 'inf2']: + infer(model_id, default_question, default_context) + logger.warning(" ... 
warmup completed") diff --git a/4-deploy/cpu-yaml.template b/4-deploy/cpu-yaml.template index 380a97c..a392642 100644 --- a/4-deploy/cpu-yaml.template +++ b/4-deploy/cpu-yaml.template @@ -49,10 +49,6 @@ spec: - name: pod-port containerPort: 8080 resources: - # Use 'memory' setting in limits and requests to ensure that model pods get scheduled to nodes evenly limits: cpu: 1 - #memory: "27000Mi" - #requests: - #memory: "27000Mi" diff --git a/4-deploy/graviton-yaml.template b/4-deploy/graviton-yaml.template deleted file mode 100644 index 73a8b0a..0000000 --- a/4-deploy/graviton-yaml.template +++ /dev/null @@ -1,67 +0,0 @@ ---- -kind: Service -apiVersion: v1 -metadata: - name: ${instance_name} - namespace: ${namespace} - labels: - app: ${instance_name} -spec: - ports: - - name: preds - port: ${service_port} - targetPort: pod-port - type: ClusterIP - selector: - app: ${instance_name} ---- -kind: Deployment -apiVersion: apps/v1 -metadata: - name: ${instance_name} - namespace: ${namespace} - labels: - app: ${instance_name} -spec: - replicas: 1 - selector: - matchLabels: - app: ${instance_name} - template: - metadata: - labels: - app: ${instance_name} - spec: - nodeSelector: - node.kubernetes.io/instance-type: "${instance_type}" - topologySpreadConstraints: - - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: DoNotSchedule - #nodeAffinityPolicy: Honor - labelSelector: - matchLabels: - app: ${instance_name} - containers: - - name: main - image: "${registry}${model_image_name}${model_image_tag}" - imagePullPolicy: Always - env: - - name: NUM_MODELS - value: "${num_models}" - - name: POSTPROCESS - value: "${postprocess}" - - name: QUIET - value: "${quiet}" - ports: - - name: pod-port - containerPort: 8080 - resources: - #use limits and requests to ensure that certain number of model pods get scheduled per node - limits: - #set value based on total available node memory and intended num pods/node - memory: "27000Mi" - requests: - #set value based on total available node memory and intended num pods/node - memory: "27000Mi" - diff --git a/4-deploy/inf2-yaml.template b/4-deploy/inf2-yaml.template index 1137375..c4380ba 100644 --- a/4-deploy/inf2-yaml.template +++ b/4-deploy/inf2-yaml.template @@ -57,7 +57,6 @@ spec: add: - IPC_LOCK resources: - #use limits and requests to ensure that certain number of model pods get scheduled per node limits: #hugepages-2Mi: 256Mi # configure to 256 * desired number of Inferentia devices. aws.amazon.com/neuron: 1 # desired number of Inferentia devices. 
diff --git a/5-test/Dockerfile b/5-test/Dockerfile index c2ec0cf..0464e17 100644 --- a/5-test/Dockerfile +++ b/5-test/Dockerfile @@ -8,4 +8,8 @@ COPY config.properties /app ADD ./5-test/tests /app/tests +RUN apt-get update + +RUN echo "Y" | apt-get install dnsutils + CMD ["bash","-c","while true; do date; sleep 10; done"] diff --git a/5-test/deployment-yaml.template b/5-test/deployment-yaml.template index 992b4e6..564a420 100644 --- a/5-test/deployment-yaml.template +++ b/5-test/deployment-yaml.template @@ -17,27 +17,12 @@ spec: app: ${instance_name} spec: nodeSelector: - node.kubernetes.io/instance-type: "${test_instance_type}" + beta.kubernetes.io/instance-type: "${test_instance_type}" containers: - name: main image: "${registry}${test_image_name}${test_image_tag}" command: ["bash","-c","${cmd_pod}"] imagePullPolicy: Always - env: - - name: runtime - value: "$runtime" - - name: num_servers - value: "$num_servers" - - name: num_models - value: "$num_models" - - name: app_name - value: "$app_name" - - name: namespace - value: "$namespace" - - name: num_requests - value: "$num_requests" - - name: request_frequency - value: "$request_frequency" resources: limits: cpu: 1 diff --git a/5-test/job-yaml.template b/5-test/job-yaml.template index 36b231c..c655e98 100644 --- a/5-test/job-yaml.template +++ b/5-test/job-yaml.template @@ -14,28 +14,13 @@ spec: app: ${instance_name} spec: nodeSelector: - node.kubernetes.io/instance-type: "${test_instance_type}" + beta.kubernetes.io/instance-type: "${test_instance_type}" restartPolicy: Never containers: - name: main image: "${registry}${test_image_name}${test_image_tag}" command: ["bash","-c","${cmd_pod}"] imagePullPolicy: Always - env: - - name: runtime - value: "$runtime" - - name: num_servers - value: "$num_servers" - - name: num_models - value: "$num_models" - - name: app_name - value: "$app_name" - - name: namespace - value: "$namespace" - - name: num_requests - value: "$num_requests" - - name: request_frequency - value: "$request_frequency" resources: - requests: + limits: cpu: 1 diff --git a/5-test/run.sh b/5-test/run.sh index a93f539..a7280c5 100755 --- a/5-test/run.sh +++ b/5-test/run.sh @@ -58,7 +58,7 @@ if [ "$runtime" == "docker" ]; then elif [ "$runtime" == "kubernetes" ]; then pushd ./5-test > /dev/null if [ "$1" == "bma" ]; then - CMD="kubectl -n ${test_namespace} get pods | grep ${test_image_name}- | cut -d ' ' -f 1 | xargs -L 1 kubectl -n ${test_namespace} logs | grep { | grep -v 0.0, | tee ./bmk-all.log" + CMD="kubectl -n ${test_namespace} get pods | grep ${test_image_name}- | cut -d ' ' -f 1 | xargs -L 1 kubectl logs | grep { | grep -v 0.0, | tee ./bmk-all.log" command -v bc > /dev/null if [ "$?" == "1" ]; then echo "bc not found" @@ -91,4 +91,4 @@ elif [ "$runtime" == "kubernetes" ]; then popd > /dev/null else echo "Runtime $runtime not recognized" -fi +fi \ No newline at end of file diff --git a/5-test/tests/benchmark.sh b/5-test/tests/benchmark.sh index 9ec1a0d..682a050 100755 --- a/5-test/tests/benchmark.sh +++ b/5-test/tests/benchmark.sh @@ -5,21 +5,14 @@ # SPDX-License-Identifier: MIT-0 # ###################################################################### -if [ "$num_servers" == "" ]; then - - echo "Configuring number of model servers from config.properties ..." - - if [ -f ../config.properties ]; then - source ../config.properties - elif [ -f ../../config.properties ]; then - source ../../config.properties - elif [ -f ./config.properties ]; then - source ./config.properties - else - echo "config.properties not found!" 
- fi +if [ -f ../config.properties ]; then + source ../config.properties +elif [ -f ../../config.properties ]; then + source ../../config.properties +elif [ -f ./config.properties ]; then + source ./config.properties else - echo "Number of model servers ($num_servers) configured from environment ..." + echo "config.properties not found!" fi if [ "$runtime" == "docker" ]; then @@ -28,4 +21,4 @@ elif [ "$runtime" == "kubernetes" ]; then python benchmark_client.py --num_thread 2 --url http://${app_name}-[INSTANCE_IDX].${namespace}.svc.cluster.local:8080/predictions/model[MODEL_IDX] --is_multi_instance --n_instance ${num_servers} --is_multi_model_per_instance --n_model_per_instance ${num_models} --latency_window_size 1000 --cache_dns else echo "Runtime $runtime not recognized" -fi +fi \ No newline at end of file diff --git a/5-test/tests/curl-rnd-ip.sh b/5-test/tests/curl-rnd-ip.sh index 8e0cdcf..c65c679 100755 --- a/5-test/tests/curl-rnd-ip.sh +++ b/5-test/tests/curl-rnd-ip.sh @@ -4,19 +4,15 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # # SPDX-License-Identifier: MIT-0 # ###################################################################### -if [ "$num_servers" == "" ]; then - echo "Configuring number of model servers from config.properties ..." - if [ -f ../config.properties ]; then - source ../config.properties - elif [ -f ../../config.properties ]; then - source ../../config.properties - elif [ -f ./config.properties ]; then - source ./config.properties - else - echo "config.properties not found!" - fi + +if [ -f ../config.properties ]; then + source ../config.properties +elif [ -f ../../config.properties ]; then + source ../../config.properties +elif [ -f ./config.properties ]; then + source ./config.properties else - echo "Configured number of model servers ($num_servers) from environment" + echo "config.properties not found!" fi server=0 @@ -26,7 +22,6 @@ models=$num_models # get instance ip addresses rm -f ./endpoint_ip.conf -echo "runtime=$runtime" while [ $server -lt $servers ] do if [ "$runtime" == "docker" ]; then @@ -57,4 +52,4 @@ do request=$((request+1)) done -rm -f ./endpoint_ip.conf +rm -f ./endpoint_ip.conf \ No newline at end of file diff --git a/5-test/tests/curl-seq-ip.sh b/5-test/tests/curl-seq-ip.sh index a9d7c73..44833e1 100755 --- a/5-test/tests/curl-seq-ip.sh +++ b/5-test/tests/curl-seq-ip.sh @@ -5,19 +5,14 @@ # SPDX-License-Identifier: MIT-0 # ###################################################################### -if [ "$num_servers" == "" ]; then - echo "Configuring number of model servers from config.properties ..." - if [ -f ../config.properties ]; then - source ../config.properties - elif [ -f ../../config.properties ]; then - source ../../config.properties - elif [ -f ./config.properties ]; then - source ./config.properties - else - echo "config.properties not found!" - fi +if [ -f ../config.properties ]; then + source ../config.properties +elif [ -f ../../config.properties ]; then + source ../../config.properties +elif [ -f ./config.properties ]; then + source ./config.properties else - echo "Configured number of model servers ($num_servers) from environment" + echo "config.properties not found!" 
fi server=0 @@ -27,15 +22,12 @@ models=$num_models # get server ip addresses rm -f ./endpoint_ip.conf -echo "runtime=$runtime" while [ $server -lt $servers ] do if [ "$runtime" == "docker" ]; then instance_ip=$(cat /etc/hosts | grep ${app_name}-${server} | awk '{print $1}') elif [ "$runtime" == "kubernetes" ]; then - #echo "host=${app_name}-${server}.${namespace}.svc.cluster.local" instance_ip=$(host ${app_name}-${server}.${namespace}.svc.cluster.local | grep "has address" | cut -d ' ' -f 4) - #echo "instance_ip=$instance_ip" fi echo $instance_ip >> endpoint_ip.conf server=$((server+1)) @@ -60,4 +52,4 @@ do server=$((server+1)) done -rm -f ./endpoint_ip.conf +rm -f ./endpoint_ip.conf \ No newline at end of file diff --git a/README.md b/README.md index 6094a46..5e20617 100644 --- a/README.md +++ b/README.md @@ -6,34 +6,14 @@ enables hybrid deployments where the best processor/accelerator is used to serve In this sample repository, we use a [bert-base](https://huggingface.co/distilbert-base-multilingual-cased) NLP model from [huggingface.co](https://huggingface.co/), however the project structure and workflow is generic and can be adapted for use with other models.
- - +
-Fig. 1 - Sample Amazon EKS cluster infrastructure for deploying, running and testing ML Inference workloads +Fig. 1 - Sample EKS infrastructure for inference workloads

-The ML inference workloads in this sample project are deployed on the CPU, GPU, or Inferentia nodes as shown on Fig. 1. The control scripts run in any location that has access to the cluster API. To eliminate latency concern related to the cluster ingress, load tests run in a pod within the cluster and send requests to the models directly through the cluster pod network. -
-1. The Amazon EKS cluster has several node groups, with one EC2 instance family per node group. Each node group can support different instance types, such as CPU (c5,c6i, c7g), GPU (g4dn), AWS Inferentia (Inf2) -and can pack multiple models per EKS node to maximize the number of served ML models that are running in a node group. -Model bin packing is used to maximize compute and memory utilization of the compute node EC2 instances in the cluster node groups. -
-2. The natural language processing (NLP) open-source PyTorch model from [huggingface.co](https://huggingface.co/) serving application and ML framework dependencies are built by Users as container images -using Automation framework uploaded to Amazon Elastic Container Registry - [Amazon ECR](https://aws.amazon.com/ecr/). -
-3. Using project Automation framework, Model container images are obtained from ECR and deployed to [Amazon EKS cluster](https://aws.amazon.com/eks/) using generated Deployment and Service manifests via Kubernetes API -exposed via Elastic Load Balancer (ELB). Model deployments are customized for each target EKS compute node instance type via settings in the central configuration file. -
-4. Following best practices of separation of Model data from containers that run it, ML model microservice design allows to scale out to a large number of models. In the project, model containers are pulling data from -Amazon Simple Storage Service ([Amazon S3](https://aws.amazon.com)) and other public model data sources each time they are initialized. -
-5. Using project Automation framework, Test container images are obtained from ECR and deployed to Amazon EKS cluster using generated Deployment and Service manifests via Kubernetes API. -Test deployments are customized for each deployment target EKS compute node architecture via settings in the central configuration file. Load/scale testing is performed via sending simultaneous requests -to the Model service pool. Performance Test results metrics are obtained, recorded and aggregated. -
-
-

+The inference workloads in this sample project are deployed on the CPU, GPU, or Inferentia nodes as shown in Fig. 1. The control scripts run from any location that has access to the cluster API. To eliminate latency concerns related to cluster ingress, load tests run in a pod within the cluster and send requests to the models directly over the cluster pod network. +
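[Editor's note, illustration only — not part of the patch] A request to one of the deployed model servers over the cluster pod network might look like the sketch below. The hostname is an assumption assembled from the config.properties defaults in this diff (app_name=${huggingface_model_name}-${processor}, namespace=mpi, service_port=8080) and the ${app_name}-${server}.${namespace}.svc.cluster.local naming used by the 5-test curl scripts; adjust it to your deployment.

```python
# Hypothetical in-cluster client for the FastAPI /predictions endpoint.
# Service name, namespace, and port are assumptions taken from the
# config.properties defaults in this diff; change them to match your cluster.
import requests

url = "http://llama-2-13b-hf-inf2-0.mpi.svc.cluster.local:8080/predictions/model0"
resp = requests.get(url, timeout=300)  # with no query params the server falls back to its default prompts
print(resp.status_code)
print(resp.json()["detail"])           # generated sequences, or an error message for an unknown model id
```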

diff --git a/build.sh b/build.sh index c79b11a..db9b917 100755 --- a/build.sh +++ b/build.sh @@ -28,15 +28,29 @@ if [ "$action" == "" ]; then echo "Building base container ..." echo "" - dockerfile=./1-build/Dockerfile-base-${processor} - if [ -f $dockerfile ]; then - echo " ... base-${processor} ..." - docker build -t ${registry}${base_image_name}${base_image_tag} -f $dockerfile . - else - echo "Dockerfile $dockerfile was not found." - echo "Please ensure that processor is configured with a supported value in config.properties" - exit 1 - fi + case "$processor" in + "cpu") + echo " ... base-cpu ..." + docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-cpu . + ;; + "gpu") + echo " ... base-gpu ..." + docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-gpu . + ;; + "inf1") + echo " ... base-inf1 ..." + docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-inf1 . + ;; + "inf2") + echo " ... base-inf2 ..." + docker build -t ${registry}${base_image_name}${base_image_tag} -f ./1-build/Dockerfile-base-inf2 . + ;; + *) + echo "Please ensure cpu, gpu, inf1 or inf2 is configure as processor in config.properties" + exit 1 + ;; + esac + elif [ "$action" == "push" ]; then ./1-build/push.sh elif [ "$action" == "pull" ]; then diff --git a/config.properties b/config.properties index f47a489..cba5da6 100644 --- a/config.properties +++ b/config.properties @@ -8,15 +8,21 @@ ###################################################################### # Model settings -huggingface_model_name=bert-base-multilingual-cased -huggingface_tokenizer_class=BertTokenizer -huggingface_model_class=BertForQuestionAnswering +huggingface_model_name=llama-2-13b-hf +#huggingface_model_name=llamav2_7b_converted +huggingface_tokenizer_class=AutoTokenizer +huggingface_model_class=AutoModelForCausalLM + +# Neuron setting +neuron_model_class=LlamaForSampling +tp_degree=2 +amp_type=bf16 # Compiler settings -# processor = cpu|gpu|inf1|inf2|graviton -processor=graviton +# processor = cpu|gpu|inf1|inf2|arm +processor=inf2 pipeline_cores=1 -sequence_length=128 +sequence_length=256 batch_size=1 test=True @@ -24,7 +30,7 @@ test=True account=$(aws sts get-caller-identity --query Account --output text) # region is used to login if the registry is ecr -region=us-east-1 +region=us-west-2 # Container settings # Default is the private ECR registry in the current AWS account. 
@@ -33,15 +39,13 @@ region=us-east-1 registry=${account}.dkr.ecr.${region}.amazonaws.com/ # registry_type=ecr registry_type=ecr -base_image_name=aws-do-inference-base -base_image_tag=:v10-${processor} -model_image_name=${huggingface_model_name} -model_image_tag=:v10-${processor} - -# if using pre-built public registry image (may require authentication) use the following settings -#registry=public.ecr.aws/a2u7h5w3 -#model_image_name=bert-base-workshop -#model_image_tag=:v10-${processor} +#base_image_name=aws-do-inference-base +#base_image_name=llama2container +base_image_name=base-${processor} +#base_image_tag=:v9-${processor} +base_image_tag=:v1 +model_image_name=${huggingface_model_name}-${processor} +model_image_tag=:v1 # Trace settings # trace_opts_$processor is a processor-specific setting used by the docker run command in the trace.sh script @@ -50,8 +54,7 @@ trace_opts_cpu="" trace_opts_gpu="--gpus 0" trace_opts_inf1="-e AWS_NEURON_VISIBLE_DEVICES=ALL --privileged" trace_opts_inf2="-e AWS_NEURON_VISIBLE_DEVICES=ALL --privileged" -trace_opts_graviton="" - +trace_opts_arm="" # Deployment settings # some of these settings apply only when the runtime is kubernetes # runtime = docker | kubernetes @@ -65,11 +68,11 @@ postprocess=True # service_port=8080 - port on which model service will be exposed service_port=8080 # Kubernetes-specific deployment settings -# instance_type = c5.xxx | g4dn.xlarge | g4dn.12xlarge | inf1.xlarge | inf2.8xlarge | c7g.4xlarge... +# instance_type = c5.xxx | g4dn.xlarge | g4dn.12xlarge | inf1.xlarge | inf1.6xlarge | ... # A node group with the specified instance_type must exist in the cluster # The instance type must have the processor configured above -# Example: processor=graviton, instance_type=c7g.4xlarge -instance_type=c7g.4xlarge +# Example: processor=arm, instance_type=c7g.4xlarge +instance_type=c5.4xlarge # num_servers - number of model servers to deploy # note that more than one model server can run on a node with multiple cpu/gpu/inferentia chips. # example: 4 model servers fit on one inf1.6xlarge instance as it has 4 inferentia chips. 
@@ -80,24 +83,18 @@ namespace=mpi app_name=${huggingface_model_name}-${processor} app_dir=app-${app_name}-${instance_type} -# Test image settings +# Test settings test_image_name=test-${huggingface_model_name} -test_image_tag=:v10-cpu - -#when using pre-built test image available in public ECR registry (may require authentication): -#registry=public.ecr.aws/a2u7h5w3/ -#test_image_name=bert-base-workshop -#test_image_tag=:test-v10-cpu - +test_image_tag=:v9-cpu # request_frequency - time to sleep between two consecutive requests in curl tests request_frequency=0.01 # Stop random request test after num_requests number of requests num_requests=30 # Number of test containers to launch (default=1), use > 1 for scale testing num_test_containers=1 -# test_instance_type - when runtime is kubernetes, node instance type on which test pods will run -test_instance_type=c5.4xlarge +# test_instance_type - when runtime is kubernetes, instance type on which test pods will run +test_instance_type=c5.xlarge # test_namespace - when runtime is kubernetes, namespace where test pods will be created test_namespace=mpi -# test_dir - when runtime is kubernetes, directory where test job/pod manifests are stored +# test_dir - when runtime is kubernetes, directory where test pod manifests are stored test_dir=app-${test_image_name}-${instance_type} diff --git a/k8s-neuron-device-plugin-rbac.yml b/k8s-neuron-device-plugin-rbac.yml new file mode 100644 index 0000000..ae30e52 --- /dev/null +++ b/k8s-neuron-device-plugin-rbac.yml @@ -0,0 +1,59 @@ +# rbac.yaml +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: neuron-device-plugin +rules: +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +- apiGroups: + - "" + resources: + - pods + verbs: + - update + - patch + - get + - list + - watch +- apiGroups: + - "" + resources: + - nodes/status + verbs: + - patch + - update +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: neuron-device-plugin + namespace: kube-system +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: neuron-device-plugin + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: neuron-device-plugin +subjects: +- kind: ServiceAccount + name: neuron-device-plugin + namespace: kube-system diff --git a/k8s-neuron-device-plugin.yml b/k8s-neuron-device-plugin.yml new file mode 100644 index 0000000..25b43ad --- /dev/null +++ b/k8s-neuron-device-plugin.yml @@ -0,0 +1,98 @@ +# https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: neuron-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: neuron-device-plugin-ds + updateStrategy: + type: RollingUpdate + template: + metadata: + annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + labels: + name: neuron-device-plugin-ds + spec: + serviceAccount: neuron-device-plugin + tolerations: + - key: CriticalAddonsOnly + operator: Exists + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. 
+ # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "beta.kubernetes.io/instance-type" + operator: In + values: + - inf1.xlarge + - inf1.2xlarge + - inf1.6xlarge + - inf1.24xlarge + - inf2.xlarge + - inf2.4xlarge + - inf2.8xlarge + - inf2.24xlarge + - inf2.48xlarge + - trn1.2xlarge + - trn1.32xlarge + - trn1n.32xlarge + - matchExpressions: + - key: "node.kubernetes.io/instance-type" + operator: In + values: + - inf1.xlarge + - inf1.2xlarge + - inf1.6xlarge + - inf1.24xlarge + - inf2.xlarge + - inf2.4xlarge + - inf2.8xlarge + - inf2.24xlarge + - inf2.48xlarge + - trn1.2xlarge + - trn1.32xlarge + - trn1n.32xlarge + containers: + #Device Plugin containers are available both in us-east and us-west ecr + #repos + - image: public.ecr.aws/neuron/neuron-device-plugin:2.16.18.0 + imagePullPolicy: Always + name: neuron-device-plugin + env: + - name: KUBECONFIG + value: /etc/kubernetes/kubelet.conf + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: infa-map + mountPath: /run + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: infa-map + hostPath: + path: /run + + diff --git a/low-latency-high-throughput-inference-on-amazon-eks.png b/low-latency-high-throughput-inference-on-amazon-eks.png deleted file mode 100644 index a22e366..0000000 Binary files a/low-latency-high-throughput-inference-on-amazon-eks.png and /dev/null differ diff --git a/pack.sh b/pack.sh index c82a103..118bd15 100755 --- a/pack.sh +++ b/pack.sh @@ -25,7 +25,7 @@ source ./config.properties action=$1 if [ "$action" == "" ]; then - model_file_name=${huggingface_model_name}_bs${batch_size}_seq${sequence_length}_pc${pipeline_cores}_${processor}.pt + model_file_name=${huggingface_model_name}_${processor}_bs${batch_size}_seq${sequence_length}_pc${pipeline_cores}.pt docker build -t ${registry}${model_image_name}${model_image_tag} --build-arg BASE_IMAGE=${registry}${base_image_name}${base_image_tag} \ --build-arg MODEL_NAME=${huggingface_model_name} --build-arg MODEL_FILE_NAME=${model_file_name} --build-arg PROCESSOR=${processor} \ diff --git a/trace.sh b/trace.sh index 4decc5a..b75ebae 100755 --- a/trace.sh +++ b/trace.sh @@ -19,18 +19,27 @@ print_help() { if [ "$1" == "" ]; then source ./config.properties echo "" - echo "Tracing model: $huggingface_model_name ..." - - dockerfile=./1-build/Dockerfile-base-${processor} + echo "Tracing model $huggingface_model_name ..." + echo "" - if [ -f $dockerfile ]; then - echo " ... for processor: $processor ..." - trace_opts=trace_opts_${processor} - docker run ${!trace_opts} -it --rm -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py" - else - echo "Processor $processor is not supported. Please ensure the processor setting in config.properties is configured properly" - exit 1 - fi + case "$processor" in + "cpu") + echo " ... for cpu ..." 
+ docker run -it --rm -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py" + ;; + "gpu") + echo " ... for gpu ..." + docker run --gpus 0 -it --rm -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py" + ;; + "inf") + echo " ... for inf ..." + docker run -it --rm -e AWS_NEURON_VISIBLE_DEVICES=ALL --privileged -v $(pwd)/2-trace:/app/trace -v $(pwd)/config.properties:/app/config.properties ${registry}${base_image_name}${base_image_tag} bash -c "cd /app/trace; python model-tracer.py" + ;; + *) + echo "Please ensure cpu, gpu, or inf is configure as processor in config.properties" + exit 1 + ;; + esac else print_help fi
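[Editor's note, illustration only — not part of the patch] The new tracer and server follow the same transformers-neuronx flow: split checkpoint, LlamaForSampling.from_pretrained, to_neuron(), then sample(). Below is a minimal end-to-end sketch, assuming a LLaMA-2 checkpoint already serialized with save_pretrained_split() and the tp_degree/batch_size/sequence_length/amp values from config.properties; paths are placeholders.

```python
# Minimal sketch of the compile-and-generate path used by the new
# 2-trace/model-tracer.py and 3-pack/fastapi-server.py; directory names are placeholders.
import os
from transformers import AutoTokenizer
from transformers_neuronx.llama.model import LlamaForSampling

tp_degree, batch_size, sequence_length, amp_type = 2, 1, 256, "bf16"  # values from config.properties
os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference"
os.environ["NEURON_RT_NUM_CORES"] = str(tp_degree)

tokenizer = AutoTokenizer.from_pretrained("./llama_model")             # tokenizer files sit next to the model
neuron_model = LlamaForSampling.from_pretrained("./llama_model/serialized",
                                                tp_degree=tp_degree,
                                                batch_size=batch_size,
                                                amp=amp_type)
neuron_model.to_neuron()                                               # compile and load weights onto NeuronCores

tokens = tokenizer(["My name is Mike and"] * batch_size, return_tensors="pt")
generated = neuron_model.sample(tokens.input_ids, sequence_length=sequence_length, top_k=50)
print([tokenizer.decode(seq) for seq in generated])
```

Parameterizing the hard-coded /app/llama_model and /app/server/models paths through config.properties, as the [TODO] comment in fastapi-server.py suggests, would keep the tracer and server consistent.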