diff --git a/01_getting_started/01_hello_world/README.md b/01_getting_started/01_hello_world/README.md
index 7ecc27d..b75d0dc 100644
--- a/01_getting_started/01_hello_world/README.md
+++ b/01_getting_started/01_hello_world/README.md
@@ -30,7 +30,7 @@ Server starts at **http://localhost:8888**
 
 ### 4. Test the API
 
-Visit **http://localhost:8888/docs** for interactive API documentation. QB endpoints are auto-generated by `flash run` based on your `@remote` functions.
+Visit **http://localhost:8888/docs** for interactive API documentation. QB endpoints are auto-generated by `flash run` based on your `@Endpoint` functions.
 
 ```bash
 curl -X POST http://localhost:8888/gpu_worker/runsync \
@@ -38,8 +38,6 @@ curl -X POST http://localhost:8888/gpu_worker/runsync \
   -d '{"message": "Hello GPU!"}'
 ```
 
-Visit **http://localhost:8888/docs** for interactive API documentation.
-
 ### Full CLI Documentation
 
 For complete CLI usage including deployment, environment management, and troubleshooting:
@@ -58,14 +56,14 @@ Simple GPU-based serverless function that:
 - Runs on any available GPU
 
 The worker demonstrates:
-- Remote execution with `@remote` decorator
-- GPU resource configuration with `LiveServerless`
-- Automatic scaling based on demand
+- Remote execution with the `@Endpoint` decorator
+- GPU resource configuration via `gpu=` parameter
+- Automatic scaling via `workers=` parameter
 - Local development and testing
 
 ## API Endpoints
 
-QB (queue-based) endpoints are auto-generated from `@remote` functions. Visit `/docs` for the full API schema.
+QB (queue-based) endpoints are auto-generated from `@Endpoint` functions. Visit `/docs` for the full API schema.
 
 ### `gpu_hello`
 
@@ -100,7 +98,7 @@ Executes a simple GPU worker and returns system/GPU information.
 ```
 01_hello_world/
-├── gpu_worker.py        # GPU worker with @remote decorator
+├── gpu_worker.py        # GPU worker with @Endpoint decorator
 ├── pyproject.toml       # Project metadata
 ├── requirements.txt     # Dependencies
 ├── .env.example         # Environment variables template
@@ -110,16 +108,23 @@ Executes a simple GPU worker and returns system/GPU information.
 ## Key Concepts
 
 ### Remote Execution
-The `@remote` decorator transparently executes functions on serverless infrastructure:
+The `@Endpoint` decorator transparently executes functions on serverless infrastructure:
 - Code runs locally during development
 - Automatically deploys to Runpod when configured
 - Handles serialization and resource management
 
+```python
+from runpod_flash import Endpoint, GpuGroup
+
+@Endpoint(name="my-worker", gpu=GpuGroup.ANY, workers=(0, 3))
+async def my_function(data: dict) -> dict:
+    return {"result": "processed"}
+```
+
 ### Resource Scaling
 The GPU worker scales to zero when idle:
-- **workersMin=0**: Scales down completely when idle
-- **workersMax=3**: Up to 3 concurrent workers
-- **idleTimeout=5**: 5 minutes before scaling down
+- **workers=(0, 3)**: Scale from 0 to 3 workers
+- **idle_timeout=5**: 5 minutes before scaling down
 
 ### GPU Detection
 The worker uses PyTorch to detect and report GPU information:
@@ -142,7 +147,7 @@ flash run
 
 ## Next Steps
 
-- Customize GPU type: Change `GpuGroup.ANY` to specific GPU (ADA_24, AMPERE_80, etc.)
+- Customize GPU type: Change `GpuGroup.ANY` to a specific GPU (e.g. `GpuGroup.ADA_24`, `GpuGroup.AMPERE_80`)
 - Add your own GPU-accelerated code
 - Implement error handling and validation
 - Deploy to production with `flash deploy`
diff --git a/01_getting_started/01_hello_world/gpu_worker.py b/01_getting_started/01_hello_world/gpu_worker.py
index be7cb06..5c7c093 100644
--- a/01_getting_started/01_hello_world/gpu_worker.py
+++ b/01_getting_started/01_hello_world/gpu_worker.py
@@ -1,20 +1,17 @@
-# GPU serverless worker -- detects available GPU hardware.
-# Run with: flash run
-# Test directly: python gpu_worker.py
-from runpod_flash import GpuGroup, LiveServerless, remote
+# gpu serverless worker -- detects available GPU hardware.
+# run with: flash run
+# test directly: python gpu_worker.py
+from runpod_flash import Endpoint, GpuGroup
 
-gpu_config = LiveServerless(
+
+@Endpoint(
     name="01_01_gpu_worker",
-    gpus=[GpuGroup.ANY],
-    workersMin=0,
-    workersMax=3,
-    idleTimeout=5,
+    gpu=GpuGroup.ANY,
+    workers=(0, 3),
+    idle_timeout=5,
 )
-
-
-@remote(resource_config=gpu_config)
-async def gpu_hello(payload: dict) -> dict:
-    """Simple GPU worker that returns GPU hardware info."""
+async def gpu_hello(input_data: dict) -> dict:
+    """GPU worker that returns GPU hardware info."""
     import platform
     from datetime import datetime
 
@@ -25,7 +22,7 @@ async def gpu_hello(payload: dict) -> dict:
         gpu_count = torch.cuda.device_count()
         gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
 
-    message = payload.get("message", "Hello from GPU worker!")
+    message = input_data.get("message", "Hello from GPU worker!")
 
     return {
         "status": "success",
diff --git a/01_getting_started/02_cpu_worker/README.md b/01_getting_started/02_cpu_worker/README.md
index 967776d..a242690 100644
--- a/01_getting_started/02_cpu_worker/README.md
+++ b/01_getting_started/02_cpu_worker/README.md
@@ -30,7 +30,7 @@ Server starts at **http://localhost:8888**
 
 ### 4. Test the API
 
-Visit **http://localhost:8888/docs** for interactive API documentation. QB endpoints are auto-generated by `flash run` based on your `@remote` functions.
+Visit **http://localhost:8888/docs** for interactive API documentation. QB endpoints are auto-generated by `flash run` based on your `@Endpoint` functions.
 
 ```bash
 curl -X POST http://localhost:8888/cpu_worker/runsync \
@@ -56,14 +56,14 @@ Simple CPU-based serverless function that:
 - Runs on general-purpose CPU instances
 
 The worker demonstrates:
-- Remote execution with `@remote` decorator
-- CPU resource configuration with `CpuLiveServerless`
-- Automatic scaling based on demand
+- Remote execution with the `@Endpoint` decorator
+- CPU resource configuration via `cpu=` parameter
+- Automatic scaling via `workers=` parameter
 - Lightweight API request handling
 
 ## API Endpoints
 
-QB (queue-based) endpoints are auto-generated from `@remote` functions. Visit `/docs` for the full API schema.
+QB (queue-based) endpoints are auto-generated from `@Endpoint` functions. Visit `/docs` for the full API schema.
 
 ### `cpu_hello`
 
@@ -92,7 +92,7 @@ Executes a simple CPU worker and returns a greeting with system information.
 
 ```
 02_cpu_worker/
-├── cpu_worker.py        # CPU worker with @remote decorator
+├── cpu_worker.py        # CPU worker with @Endpoint decorator
 ├── pyproject.toml       # Project metadata
 ├── requirements.txt     # Dependencies
 ├── .env.example         # Environment variables template
@@ -102,16 +102,18 @@ Executes a simple CPU worker and returns a greeting with system information.
 ## Key Concepts
 
 ### Remote Execution
-The `@remote` decorator transparently executes functions on serverless infrastructure:
+The `@Endpoint` decorator transparently executes functions on serverless infrastructure:
 - Code runs locally during development
 - Automatically deploys to Runpod when configured
 - Handles serialization and resource management
 
-### Resource Scaling
-The CPU worker scales to zero when idle:
-- **workersMin=0**: Scales down completely when idle
-- **workersMax=3**: Up to 3 concurrent workers
-- **idleTimeout=5**: 5 minutes before scaling down
+```python
+from runpod_flash import Endpoint
+
+@Endpoint(name="my-worker", cpu="cpu3c-1-2", workers=(0, 3))
+async def my_function(data: dict) -> dict:
+    return {"result": "processed"}
+```
 
 ### CPU Instance Types
 Available CPU configurations:
@@ -119,6 +121,20 @@ Available CPU configurations:
 - `CpuInstanceType.CPU3C_4_8`: 4 vCPU, 8GB RAM (Compute Optimized)
 - `CpuInstanceType.CPU5G_4_16`: 4 vCPU, 16GB RAM (Latest Gen)
 
+CPU type can be specified as an enum or a string shorthand:
+```python
+# enum
+@Endpoint(name="worker", cpu=CpuInstanceType.CPU3C_1_2)
+
+# string shorthand
+@Endpoint(name="worker", cpu="cpu3c-1-2")
+```
+
+### Resource Scaling
+The CPU worker scales to zero when idle:
+- **workers=(0, 3)**: Scale from 0 to 3 workers
+- **idle_timeout=5**: 5 minutes before scaling down
+
 ## Development
 
 ### Test Worker Locally
@@ -148,7 +164,7 @@ Compare with GPU workers when you need:
 
 ## Next Steps
 
-- Customize CPU type: Change `CpuInstanceType.CPU3G_2_8` to specific instance type
+- Customize CPU type: Change `"cpu3c-1-2"` to a different instance type
 - Add request validation and error handling
 - Integrate with databases or external APIs
 - Deploy to production with `flash deploy`
diff --git a/01_getting_started/02_cpu_worker/cpu_worker.py b/01_getting_started/02_cpu_worker/cpu_worker.py
index f17c321..01fdef7 100644
--- a/01_getting_started/02_cpu_worker/cpu_worker.py
+++ b/01_getting_started/02_cpu_worker/cpu_worker.py
@@ -1,24 +1,21 @@
-# CPU serverless worker -- lightweight processing without GPU.
-# Run with: flash run
-# Test directly: python cpu_worker.py
-from runpod_flash import CpuInstanceType, CpuLiveServerless, remote
+# cpu serverless worker -- lightweight processing without GPU.
+# run with: flash run
+# test directly: python cpu_worker.py
+from runpod_flash import CpuInstanceType, Endpoint
 
-cpu_config = CpuLiveServerless(
+
+@Endpoint(
     name="01_02_cpu_worker",
-    instanceIds=[CpuInstanceType.CPU3C_1_2],
-    workersMin=0,
-    workersMax=3,
-    idleTimeout=5,
+    cpu=CpuInstanceType.CPU3C_1_2,
+    workers=(0, 3),
+    idle_timeout=5,
 )
-
-
-@remote(resource_config=cpu_config)
-async def cpu_hello(payload: dict) -> dict:
-    """Simple CPU worker that returns a greeting."""
+async def cpu_hello(input_data: dict) -> dict:
+    """CPU worker that returns a greeting."""
     import platform
     from datetime import datetime
 
-    message = f"Hello, {payload.get('name', 'Anonymous Panda')}!"
+    message = f"Hello, {input_data.get('name', 'Anonymous Panda')}!"
 
     return {
         "status": "success",
diff --git a/01_getting_started/03_mixed_workers/README.md b/01_getting_started/03_mixed_workers/README.md
index 1e99a94..ae466af 100644
--- a/01_getting_started/03_mixed_workers/README.md
+++ b/01_getting_started/03_mixed_workers/README.md
@@ -6,7 +6,7 @@ Learn the production pattern of combining CPU and GPU workers for cost-effective
 
 - **Mixed worker architecture** - Combining CPU and GPU workers intelligently
 - **Cost optimization** - Using GPU only when necessary
-- **Pipeline orchestration** - Coordinating multiple worker types
+- **Pipeline orchestration** - Coordinating multiple worker types via a load-balanced endpoint
 - **Production patterns** - Real-world ML service architecture
 
 ## Architecture
 
@@ -131,13 +131,13 @@ Total: $0.0019/sec
 
 ### CPU Preprocessing Worker
 
 ```python
-preprocess_config = CpuLiveServerless(
+@Endpoint(
     name="preprocess_worker",
-    instanceIds=[CpuInstanceType.CPU3G_2_8],  # 2 vCPU, 8GB
-    workersMin=0,
-    workersMax=10,  # High traffic capacity
-    idleTimeout=3,  # Quick scale-down
+    cpu=CpuInstanceType.CPU3G_2_8,  # 2 vCPU, 8GB
+    workers=(0, 10),
+    idle_timeout=3,
 )
+async def preprocess_text(input_data: dict) -> dict: ...
 ```
 
 **Cost:** ~$0.0002/sec
 
@@ -145,13 +145,14 @@ preprocess_config = CpuLiveServerless(
 
 ### GPU Inference Worker
 
 ```python
-gpu_config = LiveServerless(
+@Endpoint(
     name="inference_worker",
-    gpus=[GpuGroup.ADA_24],  # RTX 4090
-    workersMin=0,
-    workersMax=3,
-    idleTimeout=5,
+    gpu=GpuGroup.ADA_24,  # RTX 4090
+    workers=(0, 3),
+    idle_timeout=5,
+    dependencies=["torch"],
 )
+async def gpu_inference(input_data: dict) -> dict: ...
 ```
 
 **Cost:** ~$0.0015/sec
 
@@ -159,13 +160,13 @@ gpu_config = LiveServerless(
 
 ### CPU Postprocessing Worker
 
 ```python
-postprocess_config = CpuLiveServerless(
+@Endpoint(
     name="postprocess_worker",
-    instanceIds=[CpuInstanceType.CPU3G_2_8],  # 2 vCPU, 8GB
-    workersMin=0,
-    workersMax=10,
-    idleTimeout=3,
+    cpu=CpuInstanceType.CPU3G_2_8,  # 2 vCPU, 8GB
+    workers=(0, 10),
+    idle_timeout=3,
 )
+async def postprocess_results(input_data: dict) -> dict: ...
 ```
 
 **Cost:** ~$0.0002/sec
 
@@ -176,7 +177,13 @@ postprocess_config = CpuLiveServerless(
 
 The `/classify` load-balanced endpoint orchestrates all workers:
 
 ```python
-@remote(resource_config=pipeline_config, method="POST", path="/classify")
+from cpu_worker import postprocess_results, preprocess_text
+from gpu_worker import gpu_inference
+from runpod_flash import Endpoint
+
+pipeline = Endpoint(name="classify_pipeline", cpu="cpu3c-1-2", workers=(1, 3))
+
+@pipeline.post("/classify")
 async def classify(text: str) -> dict:
     """Complete ML pipeline: CPU preprocess -> GPU inference -> CPU postprocess."""
     preprocess_result = await preprocess_text({"text": text})
@@ -234,15 +241,9 @@ For higher volumes, savings multiply significantly.
 
 ```python
 try:
-    # Stage 1: Preprocess (validation already done)
     preprocess_result = await preprocess_text(data)
-
-    # Stage 2: GPU inference
     gpu_result = await gpu_inference(preprocess_result)
-
-    # Stage 3: Postprocess
     final_result = await postprocess_results(gpu_result)
-
     return final_result
 except Exception as e:
     logger.error(f"Pipeline failed: {e}")
@@ -251,13 +252,13 @@ except Exception as e:
 
 ### 2. Timeouts
 
-Set appropriate timeouts for each stage:
+Set appropriate timeouts for each stage via `execution_timeout_ms`:
 
 ```python
 # CPU stages: short timeouts
-preprocess_config.executionTimeout = 30  # seconds
+@Endpoint(name="preprocess", cpu="cpu3c-1-2", execution_timeout_ms=30000)
 
 # GPU stage: longer timeout
-gpu_config.executionTimeout = 120  # seconds
+@Endpoint(name="inference", gpu=GpuGroup.ADA_24, execution_timeout_ms=120000)
 ```
 
 ### 3. Monitoring
 
@@ -299,7 +300,7 @@ Review worker usage:
 
 ### Slow Performance
 
 - Increase CPU worker max count for preprocessing
-- Check if GPU cold start is the issue (set workersMin=1)
+- Check if GPU cold start is the issue (set `workers=(1, 3)` to keep a worker warm)
 - Consider caching preprocessed data
 
 ## Next Steps
 
diff --git a/01_getting_started/03_mixed_workers/cpu_worker.py b/01_getting_started/03_mixed_workers/cpu_worker.py
index 6de923d..f65fd6c 100644
--- a/01_getting_started/03_mixed_workers/cpu_worker.py
+++ b/01_getting_started/03_mixed_workers/cpu_worker.py
@@ -1,29 +1,21 @@
-# CPU workers for text preprocessing and postprocessing.
-# Part of the mixed CPU/GPU pipeline example.
-# Run with: flash run
-# Test directly: python cpu_worker.py
-from runpod_flash import CpuInstanceType, CpuLiveServerless, remote
-
-cpu_preprocess_config = CpuLiveServerless(
-    name="01_03_mixed_workers_cpu_preprocess",
-    instanceIds=[CpuInstanceType.CPU3G_2_8],
-    idleTimeout=3,
-)
-
-cpu_postprocess_config = CpuLiveServerless(
-    name="01_03_mixed_workers_cpu_postprocess",
-    instanceIds=[CpuInstanceType.CPU3G_2_8],
-    idleTimeout=3,
-)
+# cpu workers for text preprocessing and postprocessing.
+# part of the mixed CPU/GPU pipeline example.
+# run with: flash run
+# test directly: python cpu_worker.py
+from runpod_flash import CpuInstanceType, Endpoint
 
 
-@remote(resource_config=cpu_preprocess_config)
-async def preprocess_text(payload: dict) -> dict:
+@Endpoint(
+    name="01_03_mixed_workers_cpu",
+    cpu=CpuInstanceType.CPU3G_2_8,
+    idle_timeout=3,
+)
+async def preprocess_text(input_data: dict) -> dict:
     """Preprocess text: cleaning and tokenization (cheap CPU work)."""
     import re
     from datetime import datetime
 
-    text = payload.get("text", "")
+    text = input_data.get("text", "")
 
     cleaned_text = text.strip()
     cleaned_text = re.sub(r"\s+", " ", cleaned_text)
@@ -44,14 +36,18 @@
     }
 
 
-@remote(resource_config=cpu_postprocess_config)
-async def postprocess_results(payload: dict) -> dict:
+@Endpoint(
+    name="01_03_mixed_workers_cpu_postprocess",
+    cpu=CpuInstanceType.CPU3G_2_8,
+    idle_timeout=3,
+)
+async def postprocess_results(input_data: dict) -> dict:
     """Postprocess GPU results: formatting and aggregation (cheap CPU work)."""
     from datetime import datetime
 
-    predictions = payload.get("predictions", [])
-    original_text = payload.get("original_text", "")
-    metadata = payload.get("metadata", {})
+    predictions = input_data.get("predictions", [])
+    original_text = input_data.get("original_text", "")
+    metadata = input_data.get("metadata", {})
 
     if predictions:
         top_prediction = max(predictions, key=lambda x: x["confidence"])
diff --git a/01_getting_started/03_mixed_workers/gpu_worker.py b/01_getting_started/03_mixed_workers/gpu_worker.py
index 00414cc..a9350a8 100644
--- a/01_getting_started/03_mixed_workers/gpu_worker.py
+++ b/01_getting_started/03_mixed_workers/gpu_worker.py
@@ -1,28 +1,26 @@
-# GPU worker for ML inference (sentiment classification).
-# Part of the mixed CPU/GPU pipeline example.
-# Run with: flash run
-# Test directly: python gpu_worker.py
-from runpod_flash import GpuGroup, LiveServerless, remote
+# gpu worker for ML inference (sentiment classification).
+# part of the mixed CPU/GPU pipeline example.
+# run with: flash run
+# test directly: python gpu_worker.py
+from runpod_flash import Endpoint, GpuGroup
 
-gpu_config = LiveServerless(
+
+@Endpoint(
     name="01_03_mixed_inference",
-    gpus=[GpuGroup.ADA_24],
-    workersMin=0,
-    workersMax=3,
-    idleTimeout=5,
+    gpu=GpuGroup.ADA_24,
+    workers=(0, 3),
+    idle_timeout=5,
+    dependencies=["torch"],
 )
-
-
-@remote(resource_config=gpu_config, dependencies=["torch"])
-async def gpu_inference(payload: dict) -> dict:
+async def gpu_inference(input_data: dict) -> dict:
     """GPU inference: mock sentiment classification."""
     import random
     from datetime import datetime
 
     import torch
 
-    cleaned_text = payload.get("cleaned_text", "")
-    word_count = payload.get("word_count", 0)
+    cleaned_text = input_data.get("cleaned_text", "")
+    word_count = input_data.get("word_count", 0)
 
     gpu_available = torch.cuda.is_available()
     if gpu_available:
diff --git a/01_getting_started/03_mixed_workers/pipeline.py b/01_getting_started/03_mixed_workers/pipeline.py
index 1d278a0..6a4615f 100644
--- a/01_getting_started/03_mixed_workers/pipeline.py
+++ b/01_getting_started/03_mixed_workers/pipeline.py
@@ -1,15 +1,12 @@
-# Classification pipeline: CPU preprocess -> GPU inference -> CPU postprocess.
-# Demonstrates cross-worker orchestration via a load-balanced endpoint.
-# Run with: flash run
-from runpod_flash import CpuLiveLoadBalancer, remote
+# classification pipeline: CPU preprocess -> GPU inference -> CPU postprocess.
+# demonstrates cross-worker orchestration via a load-balanced endpoint.
+# run with: flash run
+from runpod_flash import Endpoint
 
-pipeline_config = CpuLiveLoadBalancer(
-    name="01_03_classify_pipeline",
-    workersMin=1,
-)
+pipeline = Endpoint(name="01_03_classify_pipeline", cpu="cpu3c-1-2", workers=(1, 3))
 
 
-@remote(resource_config=pipeline_config, method="POST", path="/classify")
+@pipeline.post("/classify")
 async def classify(text: str) -> dict:
     """Complete ML pipeline: CPU preprocess -> GPU inference -> CPU postprocess."""
     from cpu_worker import postprocess_results, preprocess_text
diff --git a/01_getting_started/04_dependencies/README.md b/01_getting_started/04_dependencies/README.md
index eaa5a8f..1bdf133 100644
--- a/01_getting_started/04_dependencies/README.md
+++ b/01_getting_started/04_dependencies/README.md
@@ -43,22 +43,23 @@ flash run
 
 ### 1. Python Dependencies
 
-Specified in `@remote` decorator:
+Specified in the `Endpoint` decorator:
 
 ```python
-@remote(
-    resource_config=config,
+@Endpoint(
+    name="my-worker",
+    gpu=GpuGroup.ADA_24,
     dependencies=[
         "torch==2.1.0",  # Exact version
         "Pillow>=10.0.0",  # Minimum version
         "numpy<2.0.0",  # Maximum version
         "requests",  # Latest version
-    ]
+    ],
 )
 async def my_function(data: dict) -> dict:
     import torch
     import PIL
-    # Your code here
+    # your code here
 ```
 
 ### 2. System Dependencies
 
@@ -66,10 +67,11 @@ async def my_function(data: dict) -> dict:
 
 Install apt packages:
 
 ```python
-@remote(
-    resource_config=config,
+@Endpoint(
+    name="my-worker",
+    gpu=GpuGroup.AMPERE_16,
     dependencies=["opencv-python"],
-    system_dependencies=["ffmpeg", "libgl1", "graphviz"]
+    system_dependencies=["ffmpeg", "libgl1", "graphviz"],
 )
 async def process_video(data: dict) -> dict:
     import cv2
@@ -87,9 +89,9 @@ async def process_video(data: dict) -> dict:
 
 Fastest cold start:
 
 ```python
-@remote(resource_config=config)  # No dependencies!
+@Endpoint(name="my-worker", cpu="cpu3c-1-2")
 async def simple_function(data: dict) -> dict:
-    # Only Python stdlib
+    # only Python stdlib
     import json
     import re
     from datetime import datetime
@@ -198,39 +200,42 @@ system_dependencies=["ffmpeg", "libgl1", "wget"]
 
 ```python
 # Good - Reproducible
-dependencies=[
-    "torch==2.1.0",
-    "transformers==4.35.2",
-    "numpy==1.26.2",
-]
+@Endpoint(
+    name="worker",
+    gpu=GpuGroup.ADA_24,
+    dependencies=[
+        "torch==2.1.0",
+        "transformers==4.35.2",
+        "numpy==1.26.2",
+    ],
+)
 
 # Bad - Unpredictable
-dependencies=[
-    "torch",  # Version changes over time
-    "transformers",
-    "numpy",
-]
+@Endpoint(
+    name="worker",
+    gpu=GpuGroup.ADA_24,
+    dependencies=[
+        "torch",
+        "transformers",
+        "numpy",
+    ],
+)
 ```
 
 ### 2. Minimize Dependencies
 
 ```python
 # Good - Only what's needed
-@remote(
-    dependencies=["requests"]  # Just one package
-)
+@Endpoint(name="fetcher", cpu="cpu3c-1-2", dependencies=["requests"])
 async def fetch_data(url: str):
     import requests
     return requests.get(url).json()
 
 # Bad - Unnecessary bloat
-@remote(
-    dependencies=[
-        "requests",
-        "pandas",  # Not used
-        "numpy",  # Not used
-        "scipy",  # Not used
-    ]
+@Endpoint(
+    name="fetcher",
+    cpu="cpu3c-1-2",
+    dependencies=["requests", "pandas", "numpy", "scipy"],
 )
 async def fetch_data(url: str):
     import requests
@@ -248,13 +253,14 @@ python cpu_worker.py
 
 ### 4. Document Dependencies
 
 ```python
-@remote(
-    resource_config=config,
+@Endpoint(
+    name="worker",
+    gpu=GpuGroup.ADA_24,
     dependencies=[
         "torch==2.1.0",  # GPU operations
         "Pillow>=10.0.0",  # Image processing
         "requests",  # API calls
-    ]
+    ],
 )
 async def process_image(data: dict):
     """Process image with PyTorch and Pillow."""
@@ -307,7 +313,7 @@ Dependencies take long to install?
 
 **Solutions:**
 1. Minimize dependencies
 2. Use custom Docker image (advanced)
-3. Keep workers warm (workersMin=1)
+3. Keep workers warm (`workers=(1, 3)`)
 
 ## Cold Start Times
 
@@ -330,29 +336,25 @@ Pillow>=10.0.0
 numpy==1.26.2
 ```
 
-**Note:** Worker dependencies in `@remote` decorator are deployed automatically. `requirements.txt` is for local development only.
+**Note:** Worker dependencies in the `Endpoint` decorator are deployed automatically. `requirements.txt` is for local development only.
 
-## Advanced: Custom Docker Images
+## Advanced: External Docker Images
 
-For complex dependencies, consider custom images:
+For complex dependencies, deploy a pre-built image:
 
 ```python
-from runpod_flash import ServerlessEndpoint
+from runpod_flash import Endpoint, GpuGroup
 
-custom_config = ServerlessEndpoint(
-    name="custom_image_worker",
-    dockerImage="myregistry/my-image:v1.0",
-    gpuIds=["NVIDIA GeForce RTX 4090"],
+vllm = Endpoint(
+    name="vllm-service",
+    image="vllm/vllm-openai:latest",
+    gpu=GpuGroup.ADA_24,
 )
 
-@remote(resource_config=custom_config)
-async def process(data: dict):
-    # All dependencies pre-installed in image
-    pass
+# call it as an API client
+result = await vllm.post("/v1/completions", {"prompt": "hello"})
 ```
 
-See [02_ml_inference/04_custom_images](../../02_ml_inference/04_custom_images/) for details.
-
 ## Next Steps
 
 - **02_ml_inference** - Deploy real ML models
diff --git a/01_getting_started/04_dependencies/cpu_worker.py b/01_getting_started/04_dependencies/cpu_worker.py
index 9d9fa6e..a540c11 100644
--- a/01_getting_started/04_dependencies/cpu_worker.py
+++ b/01_getting_started/04_dependencies/cpu_worker.py
@@ -1,27 +1,14 @@
-# CPU workers demonstrating data science and zero-dependency patterns.
-# Run with: flash run
-# Test directly: python cpu_worker.py
-from runpod_flash import CpuInstanceType, CpuLiveServerless, remote
+# cpu workers demonstrating data science and zero-dependency patterns.
+# run with: flash run
+# test directly: python cpu_worker.py
+from runpod_flash import CpuInstanceType, Endpoint
 
-# Worker with data science dependencies
-data_config = CpuLiveServerless(
-    name="01_04_deps_data",
-    instanceIds=[CpuInstanceType.CPU3C_8_16],
-    workersMin=0,
-    workersMax=3,
-)
-
-# Worker with minimal dependencies
-minimal_config = CpuLiveServerless(
-    name="01_04_deps_minimal",
-    instanceIds=[CpuInstanceType.CPU3C_1_2],
-    workersMin=0,
-    workersMax=3,
-)
 
-@remote(
-    resource_config=data_config,
+# worker with data science dependencies
+@Endpoint(
+    name="01_04_deps_data",
+    cpu=CpuInstanceType.CPU3C_8_16,
+    workers=(0, 3),
     dependencies=[
         "pandas==2.1.3",
         "numpy==1.26.2",
@@ -29,7 +16,7 @@
         "matplotlib",
     ],
 )
-async def process_data(payload: dict) -> dict:
+async def process_data(input_data: dict) -> dict:
     """
     Worker with data science dependencies.
 
@@ -46,9 +33,9 @@ async def process_data(payload: dict) -> dict:
     import pandas as pd
     import scipy
 
-    data = payload.get("data", [[1, 2], [3, 4], [5, 6]])
+    data = input_data.get("data", [[1, 2], [3, 4], [5, 6]])
 
-    # Create DataFrame and compute statistics
+    # create DataFrame and compute statistics
     df = pd.DataFrame(data, columns=["A", "B"])
     stats = {
         "mean": df.mean().to_dict(),
@@ -56,7 +43,6 @@
         "sum": df.sum().to_dict(),
     }
 
-    # Numpy operation
     arr = np.array(data)
     numpy_result = {
         "shape": arr.shape,
@@ -79,8 +65,13 @@
     }
 
 
-@remote(resource_config=minimal_config)  # No dependencies!
-async def minimal_process(payload: dict) -> dict:
+# worker with no external dependencies
+@Endpoint(
+    name="01_04_deps_minimal",
+    cpu=CpuInstanceType.CPU3C_1_2,
+    workers=(0, 3),
+)
+async def minimal_process(input_data: dict) -> dict:
     """
     Worker with NO external dependencies.
@@ -93,14 +84,12 @@ async def minimal_process(input_data: dict) -> dict:
     import re
     from datetime import datetime
 
-    text = payload.get("text", "")
+    text = input_data.get("text", "")
 
-    # Built-in operations only
     word_count = len(text.split())
     char_count = len(text)
     uppercase_count = sum(1 for c in text if c.isupper())
 
-    # JSON manipulation
     result = {
         "text_analysis": {
             "word_count": word_count,
diff --git a/01_getting_started/04_dependencies/gpu_worker.py b/01_getting_started/04_dependencies/gpu_worker.py
index 73227da..6a4750d 100644
--- a/01_getting_started/04_dependencies/gpu_worker.py
+++ b/01_getting_started/04_dependencies/gpu_worker.py
@@ -1,32 +1,18 @@
-# GPU workers demonstrating Python and system dependency management.
-# Run with: flash run
-# Test directly: python gpu_worker.py
-from runpod_flash import GpuGroup, LiveServerless, remote
-
-# Worker with ML dependencies (versioned)
-ml_config = LiveServerless(
-    name="01_04_deps_ml",
-    gpus=[GpuGroup.ADA_32_PRO],
-    workersMin=0,
-    workersMax=2,
-)
-
-# Worker with system dependencies
-system_deps_config = LiveServerless(
-    name="01_04_deps_system",
-    gpus=[GpuGroup.AMPERE_16],
-    workersMin=0,
-    workersMax=2,
-)
+# gpu workers demonstrating Python and system dependency management.
+# run with: flash run
+# test directly: python gpu_worker.py
+from runpod_flash import Endpoint, GpuGroup
 
 
-@remote(
-    resource_config=ml_config,
+@Endpoint(
+    name="01_04_deps_ml",
+    gpu=GpuGroup.ADA_32_PRO,
+    workers=(0, 2),
     dependencies=[
-        "torch==2.1.0",  # Pin specific version
+        "torch==2.1.0",
         "torchvision",
-        "Pillow>=10.0.0",  # Minimum version
-        "numpy<2.0.0",  # Maximum version constraint
+        "Pillow>=10.0.0",
+        "numpy<2.0.0",
     ],
 )
 async def process_with_ml_libs(payload: dict) -> dict:
@@ -45,7 +31,6 @@ async def process_with_ml_libs(payload: dict) -> dict:
     import torchvision
     from PIL import Image
 
-    # Show installed versions
     versions = {
         "torch": torch.__version__,
         "torchvision": torchvision.__version__,
@@ -53,7 +38,6 @@
         "numpy": np.__version__,
     }
 
-    # Simple tensor operation to verify GPU
     if torch.cuda.is_available():
         tensor = torch.randn(100, 100, device="cuda")
         result = tensor.sum().item()
@@ -69,10 +53,12 @@
     }
 
 
-@remote(
-    resource_config=system_deps_config,
+@Endpoint(
+    name="01_04_deps_system",
+    gpu=GpuGroup.AMPERE_16,
+    workers=(0, 2),
     dependencies=["opencv-python", "requests"],
-    system_dependencies=["ffmpeg", "libgl1"],  # System packages via apt
+    system_dependencies=["ffmpeg", "libgl1"],
 )
 async def process_with_system_deps(payload: dict) -> dict:
     """
@@ -87,7 +73,6 @@ async def process_with_system_deps(payload: dict) -> dict:
 
     import cv2
 
-    # Check FFmpeg installation
     try:
         ffmpeg_version = (
             subprocess.check_output(["ffmpeg", "-version"], stderr=subprocess.STDOUT)
@@ -97,7 +82,6 @@
     except Exception as e:
         ffmpeg_version = f"Error: {e}"
 
-    # Check OpenCV (requires libgl1)
     opencv_version = cv2.__version__
 
     return {
diff --git a/01_getting_started/README.md b/01_getting_started/README.md
index 9729a00..8b20778 100644
--- a/01_getting_started/README.md
+++ b/01_getting_started/README.md
@@ -10,13 +10,13 @@ The simplest Flash application with GPU workers
 
 **What you'll learn:**
 - Basic Flash application structure
 - Creating GPU workers
-- Using the `@remote` decorator
+- Using the `@Endpoint` decorator
 - Running Flash applications locally
 - Testing endpoints with Swagger docs
 
 **Concepts:**
-- `LiveServerless` configuration for GPU workers
-- Worker auto-scaling (min/max workers)
+- `Endpoint` with `gpu=` parameter for GPU workers
+- Worker auto-scaling via `workers=(min, max)`
 
 ### [02_cpu_worker](./02_cpu_worker/)
 CPU-only worker example for non-GPU workloads.
@@ -37,7 +37,7 @@ Mixed GPU/CPU workers for cost-effective ML pipelines.
 
 - Fail-fast validation before expensive GPU operations
 
 **Concepts:**
-- `CpuLiveServerless` for preprocessing and postprocessing
+- CPU `Endpoint` for preprocessing and postprocessing
 - Pipeline orchestration with load-balanced endpoints
 
 ### [04_dependencies](./04_dependencies/)
diff --git a/02_ml_inference/01_text_to_speech/README.md b/02_ml_inference/01_text_to_speech/README.md
index 2157f60..30be7bd 100644
--- a/02_ml_inference/01_text_to_speech/README.md
+++ b/02_ml_inference/01_text_to_speech/README.md
@@ -1,6 +1,6 @@
 # Text-to-Speech with Qwen3-TTS
 
-Text-to-Speech API using [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) running on RunPod serverless GPUs.
+Text-to-Speech API using [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) running on Runpod serverless GPUs.
 
 ## Overview
 
@@ -8,17 +8,17 @@ This example demonstrates running a 1.7B parameter TTS model on serverless GPU i
 
 ## What You'll Learn
 
-- Running a HuggingFace model with `@remote` on GPU workers
+- Running a HuggingFace model with `@Endpoint` on GPU workers
 - Returning binary audio data (WAV) from API endpoints
 - Using `bfloat16` precision for memory-efficient inference
-- Input validation inside self-contained `@remote` functions
+- Input validation inside self-contained `@Endpoint` functions
 
 ## Quick Start
 
 ### Prerequisites
 
 - Python 3.10+
-- RunPod API key ([get one here](https://docs.runpod.io/get-started/api-keys))
+- Runpod API key ([get one here](https://docs.runpod.io/get-started/api-keys))
 
 ### Setup
 
@@ -39,7 +39,7 @@ First run provisions the endpoint (~1 min). Server starts at http://localhost:88
 
 ### Test the Endpoint
 
-Visit http://localhost:8888/docs for interactive API documentation. QB endpoints are auto-generated by `flash run` based on your `@remote` functions.
+Visit http://localhost:8888/docs for interactive API documentation. QB endpoints are auto-generated by `flash run` based on your `@Endpoint` functions.
 
 **Generate speech (JSON with base64 audio):**
 ```bash
@@ -59,7 +59,7 @@ Check `/docs` for the exact auto-generated endpoint paths and schemas.
 
 ## API Functions
 
-QB (queue-based) endpoints are auto-generated from `@remote` functions. Visit `/docs` for the full API schema.
+QB (queue-based) endpoints are auto-generated from `@Endpoint` functions. Visit `/docs` for the full API schema.
 
 ### `generate_speech`
 
diff --git a/02_ml_inference/01_text_to_speech/gpu_worker.py b/02_ml_inference/01_text_to_speech/gpu_worker.py
index dd2f189..2a92d76 100644
--- a/02_ml_inference/01_text_to_speech/gpu_worker.py
+++ b/02_ml_inference/01_text_to_speech/gpu_worker.py
@@ -1,43 +1,17 @@
 # Qwen3-TTS text-to-speech GPU worker.
-# Run with: flash run -# Test directly: python gpu_worker.py - -from runpod_flash import ( - CpuInstanceType, - CpuLiveServerless, - GpuGroup, - LiveServerless, - remote, -) +# run with: flash run +# test directly: python gpu_worker.py +from runpod_flash import Endpoint, GpuGroup -# GPU config for Qwen3-TTS - needs 24GB+ VRAM for 1.7B model -# Naming convention: {category}_{example}_{worker_type} -gpu_config = LiveServerless( - name="02_01_text_to_speech_gpu", - gpus=[GpuGroup.ADA_24], # RTX 4090 or similar with 24GB - workersMin=0, - workersMax=3, - idleTimeout=5, -) - -voices_config = CpuLiveServerless( - name="02_01_text_to_speech_voices", - instanceIds=[CpuInstanceType.CPU3C_1_2], - workersMin=0, - workersMax=1, - idleTimeout=5, -) - -@remote( - resource_config=gpu_config, - dependencies=[ - "qwen-tts", - "torch", - "soundfile", - ], +@Endpoint( + name="02_01_text_to_speech_gpu", + gpu=GpuGroup.ADA_24, + workers=(0, 3), + idle_timeout=5, + dependencies=["qwen-tts", "torch", "soundfile"], ) -async def generate_speech(payload: dict) -> dict: +async def generate_speech(input_data: dict) -> dict: """ Generate speech using Qwen3-TTS-12Hz-1.7B-CustomVoice model. 
@@ -60,7 +34,6 @@ async def generate_speech(payload: dict) -> dict:
     import soundfile as sf
     import torch
 
-    # Must be defined inside function for remote execution
     valid_speakers = [
         "Vivian",
         "Serena",
@@ -86,10 +59,10 @@ async def generate_speech(payload: dict) -> dict:
         "Auto",
     ]
 
-    text = payload.get("text", "Hello, this is a test.")
-    speaker = payload.get("speaker", "Ryan")
-    language = payload.get("language", "Auto")
-    instruct = payload.get("instruct", "")
+    text = input_data.get("text", "Hello, this is a test.")
+    speaker = input_data.get("speaker", "Ryan")
+    language = input_data.get("language", "Auto")
+    instruct = input_data.get("instruct", "")
 
     if speaker not in valid_speakers:
         return {
@@ -146,8 +119,13 @@ async def generate_speech(payload: dict) -> dict:
     }
 
 
-@remote(resource_config=voices_config)
-async def get_voices(payload: dict = {}) -> dict:
+# voices lookup needs no GPU; keep it on a small CPU instance with its own endpoint name
+@Endpoint(
+    name="02_01_text_to_speech_voices",
+    cpu="cpu3c-1-2",
+    workers=(0, 1),
+    idle_timeout=5,
+)
+async def get_voices(input_data: dict) -> dict:
     """Get available voices and languages."""
     speakers = {
         "Vivian": "Bright, slightly edgy young female voice (Chinese native)",
@@ -180,16 +158,13 @@ async def get_voices(payload: dict = {}) -> dict:
     }
 
 
-# Test locally with: python gpu_worker.py
 if __name__ == "__main__":
     import asyncio
 
-    # Test get_voices
     print("Available voices:")
     result = asyncio.run(get_voices({}))
     print(result)
 
-    # Test speech generation (requires GPU)
     test_payload = {
         "text": "Hello! This is a test of the Qwen3 text to speech system.",
         "speaker": "Ryan",
diff --git a/03_advanced_workers/05_load_balancer/README.md b/03_advanced_workers/05_load_balancer/README.md
index 8004c92..c3f45d7 100644
--- a/03_advanced_workers/05_load_balancer/README.md
+++ b/03_advanced_workers/05_load_balancer/README.md
@@ -1,10 +1,10 @@
 # Load Balancer Endpoints Example
 
-Demonstrates Flash's load-balancer endpoints with custom HTTP routes using the `@remote` decorator with `method` and `path` parameters.
This example shows how to create low-latency APIs with direct HTTP routing on a single serverless endpoint. +Demonstrates Flash's load-balanced endpoints with custom HTTP routes using the `Endpoint` class with route decorators. This example shows how to create low-latency APIs with direct HTTP routing on a single serverless endpoint. -## What Are Load-Balancer Endpoints? +## What Are Load-Balanced Endpoints? -Load-balancer endpoints use direct HTTP routing to serverless workers, providing lower latency compared to queue-based endpoints. They support custom HTTP methods (GET, POST, PUT, DELETE, PATCH) and multiple routes on a single endpoint. +Load-balanced endpoints use direct HTTP routing to serverless workers, providing lower latency compared to queue-based endpoints. They support custom HTTP methods (GET, POST, PUT, DELETE, PATCH) and multiple routes on a single endpoint. | Feature | Queue-Based (QB) | Load-Balanced (LB) | |---------|------------------|-------------------| @@ -12,10 +12,9 @@ Load-balancer endpoints use direct HTTP routing to serverless workers, providing | Latency | Higher (queuing) | Lower (direct) | | Custom routes | Limited | Full HTTP support (GET, POST, PUT, DELETE, PATCH) | | Automatic retries | Yes | No (client handles) | -| Configuration | Default `ServerlessType.QB` | Use `LiveLoadBalancer` or `LoadBalancerSlsResource` | | Use case | Batch processing, long-running tasks | Real-time APIs, request/response patterns | -**Load-balancer endpoints are ideal for:** +**Load-balanced endpoints are ideal for:** - Low-latency REST APIs - Custom HTTP routes with different methods (GET, POST, etc.) 
- Request/response patterns that require direct HTTP communication @@ -81,45 +80,45 @@ curl -X POST http://localhost:8888/05_load_balancer/cpu/transform \ -d '{"text": "hello", "operation": "uppercase"}' ``` -## How Load-Balancer Endpoints Work +## How Load-Balanced Endpoints Work -### Defining Routes with @remote Decorator +### Defining Routes with Endpoint -Load-balancer endpoints use the `@remote` decorator with `method` and `path` parameters to define HTTP routes. The decorator automatically registers the function as an HTTP endpoint on the load-balancer runtime. +Load-balanced endpoints use the `Endpoint` class with route decorators (`.get()`, `.post()`, etc.) to define HTTP routes. The decorator automatically registers the function as an HTTP endpoint on the load-balancer runtime. ```python -from runpod_flash import remote, LiveLoadBalancer +from runpod_flash import Endpoint, GpuGroup -# Create load-balanced endpoint (for local development) -lb = LiveLoadBalancer(name="my_service") +# create load-balanced endpoint +api = Endpoint(name="my-service", gpu=GpuGroup.ANY, workers=(1, 3)) -# Define HTTP routes with method and path parameters -@remote(lb, method="GET", path="/health") +# define HTTP routes with method decorators +@api.get("/health") async def health_check() -> dict: """Health check endpoint.""" return {"status": "healthy"} -@remote(lb, method="POST", path="/compute") +@api.post("/compute") async def compute_data(numbers: list[int]) -> dict: """Compute the sum of squared numbers.""" result = sum(x ** 2 for x in numbers) return {"result": result} -@remote(lb, method="GET", path="/info") +@api.get("/info") async def get_info() -> dict: """Get service information.""" return {"info": "service running"} ``` -**Key parameters for @remote:** -- `method`: HTTP verb (GET, POST, PUT, DELETE, PATCH) -- `path`: Route path (must start with `/`) -- Resource: Use `LiveLoadBalancer` for local development, `LoadBalancerSlsResource` for production deployment +**Key 
parameters for Endpoint:** +- `name`: Endpoint name for identification +- `gpu=` or `cpu=`: Resource type +- `workers=(min, max)`: Worker scaling bounds **How routing works:** -1. Each `@remote` decorated function becomes an HTTP endpoint -2. The `method` parameter specifies the HTTP verb -3. The `path` parameter specifies the URL route +1. Each route-decorated function becomes an HTTP endpoint +2. The decorator method (`.get()`, `.post()`, etc.) specifies the HTTP verb +3. The path argument specifies the URL route 4. When an HTTP request matches the method and path, the function is called with the request data ### Multiple Routes on One Endpoint @@ -127,20 +126,26 @@ async def get_info() -> dict: One load-balanced endpoint can have multiple routes: ```python -api = LiveLoadBalancer(name="user_api") +api = Endpoint(name="user-api", cpu="cpu3c-1-2", workers=(1, 5)) -@remote(api, method="GET", path="/users") +@api.get("/users") async def list_users(): ... -@remote(api, method="POST", path="/users") +@api.post("/users") async def create_user(name: str): ... -@remote(api, method="DELETE", path="/users/{user_id}") +@api.delete("/users/{user_id}") async def delete_user(user_id: int): ... ``` All routes are automatically registered on the same load-balanced endpoint. +### Queue-Based vs Load-Balanced + +The `Endpoint` class infers QB vs LB from usage pattern: +- **Direct decorator** (`@Endpoint(...)`) = queue-based (one function per endpoint) +- **Route decorators** (`.get()`, `.post()`, etc.) 
= load-balanced (multiple routes, shared workers) + ### Reserved Paths The following paths are reserved and cannot be used: @@ -151,8 +156,8 @@ The following paths are reserved and cannot be used: ``` 05_load_balancer/ -├── gpu_lb.py # GPU load-balancer endpoints with @remote -├── cpu_lb.py # CPU load-balancer endpoints with @remote +├── gpu_lb.py # GPU load-balanced endpoints +├── cpu_lb.py # CPU load-balanced endpoints ├── .env.example # Environment template ├── requirements.txt # Dependencies └── README.md # This file @@ -252,69 +257,6 @@ Response: } ``` -## Resource Types - -### LiveLoadBalancer (Local Development) - -`LiveLoadBalancer` is used for local development and testing. It provides all load-balancer features in a development environment without requiring a full deployment. - -```python -from runpod_flash import LiveLoadBalancer, remote - -# Create load-balanced endpoint for local development -lb = LiveLoadBalancer(name="my_api") - -@remote(lb, method="POST", path="/process") -async def process(data: dict) -> dict: - """Process data on the load-balanced endpoint.""" - return {"result": "success", "processed": data} -``` - -**When to use:** -- Local development and testing -- Testing @remote decorated functions before deployment -- Running examples with `flash run` from the repository root - -**Features:** -- Automatically uses the `runpod-flash-lb` container image -- Local execution with `/execute` endpoint for development -- Perfect for testing and debugging -- No GPU/CPU configuration needed (inherits from resource type) - -### LoadBalancerSlsResource (Production Deployment) - -`LoadBalancerSlsResource` is the production resource for deploying load-balancer endpoints to RunPod. 
- -```python -from runpod_flash import LoadBalancerSlsResource, remote - -# Create load-balanced endpoint for production deployment -lb = LoadBalancerSlsResource( - name="my_api", - imageName="runpod/runpod-flash-lb:latest", - workersMin=1, - workersMax=5, -) - -@remote(lb, method="POST", path="/process") -async def process(data: dict) -> dict: - """Process data on the deployed load-balanced endpoint.""" - return {"result": "success", "processed": data} -``` - -**When to use:** -- Production deployment to RunPod -- Scaling requirements (auto-scaling based on request count) -- Multi-region deployment - -**Features:** -- Direct HTTP routing to healthy workers -- Auto-scaling based on request count (default scaler) -- No `/execute` endpoint (security - direct routes only) -- Client handles retries (no automatic retries) -- Lower latency for request/response patterns -- Custom HTTP routes on a single endpoint - ## Testing Workers Locally ```bash @@ -333,75 +275,52 @@ python cpu_lb.py flash build ``` -This generates handlers for your load-balancer endpoints. +This generates handlers for your load-balanced endpoints. -### Deploy to RunPod +### Deploy to Runpod ```bash flash deploy new production flash deploy send production ``` -## Local vs Deployed - -The `@remote` decorator with method and path works the same way in both local and production environments. The only difference is the resource configuration. 
- -**Local Development (LiveLoadBalancer):** -- Use `LiveLoadBalancer` for testing load-balancer endpoints locally -- Automatically uses `runpod-flash-lb` container image -- Includes `/execute` endpoint for development/testing -- Testing via `flash run` or direct Python execution -- Perfect for development, testing, and debugging - -**Production Deployment (LoadBalancerSlsResource):** -- Use `LoadBalancerSlsResource` when deploying to RunPod -- Specifies the container image and scaling parameters -- No `/execute` endpoint (security - direct routes only) -- All execution flows through custom HTTP routes -- Automatic scaling based on request count -- Direct HTTP routing to healthy workers - -**Migration path:** Code remains identical - just change the resource from `LiveLoadBalancer` to `LoadBalancerSlsResource` when deploying to production! - ## Key Concepts ### Async Functions -All `@remote` functions should be async: +All route-decorated functions should be async: ```python -@remote(config, method="POST", path="/process") +@api.post("/process") async def process_data(input: str) -> dict: - # Your code here return {"result": "success"} ``` ### Error Handling -For load-balancer endpoints, raise `ValueError` for validation errors. The framework automatically handles these as HTTP 400 Bad Request responses: +For load-balanced endpoints, raise `ValueError` for validation errors. 
The framework automatically handles these as HTTP 400 Bad Request responses: ```python -@remote(lb, method="POST", path="/process") +@api.post("/process") async def process(text: str) -> dict: if not text: raise ValueError("text cannot be empty") - if not isinstance(text, str): - raise ValueError("text must be a string") return {"result": text.upper()} ``` **HTTP Error Mapping:** -- `ValueError` → 400 Bad Request -- Other exceptions → 500 Internal Server Error +- `ValueError` -> 400 Bad Request +- Other exceptions -> 500 Internal Server Error ### Dependencies -Specify Python dependencies in the decorator: +Specify Python dependencies on the Endpoint: ```python -@remote( - config, - method="POST", - path="/analyze", - dependencies=["torch", "transformers"] +api = Endpoint( + name="my-service", + gpu=GpuGroup.ADA_24, + dependencies=["torch", "transformers"], ) + +@api.post("/analyze") async def analyze(data: str) -> dict: import torch - # Your code here + # your code here ``` ## Environment Variables @@ -418,7 +337,7 @@ LOG_LEVEL=INFO # Logging level (default: INFO) ## Cost Estimates -Load-balancer endpoints are cost-efficient for request/response patterns: +Load-balanced endpoints are cost-efficient for request/response patterns: **GPU Service (Compute)** - Instance type: GPU (depends on your configuration) @@ -435,16 +354,16 @@ Load-balancer endpoints are cost-efficient for request/response patterns: - Load-balancers: Lower latency, pay for active processing time only - Queue-based: Higher throughput, automatic retries, better for batch jobs -For current pricing, see [RunPod Pricing](https://www.runpod.io/pricing). +For current pricing, see [Runpod Pricing](https://www.runpod.io/pricing). 
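To see concretely how the `ValueError` to 400 convention from the error-handling section plays out, here is a self-contained sketch (the `to_http_status` helper is illustrative, not part of the Flash API; the framework performs this mapping inside the worker runtime):

```python
import asyncio


def to_http_status(exc: Exception) -> int:
    """Mirror the documented mapping: ValueError -> 400, everything else -> 500."""
    return 400 if isinstance(exc, ValueError) else 500


async def process(text: str) -> dict:
    # same validation pattern as the route handlers above
    if not text:
        raise ValueError("text cannot be empty")
    return {"result": text.upper()}


try:
    asyncio.run(process(""))
except Exception as exc:
    print(to_http_status(exc))  # 400
```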
 ## Troubleshooting
 
-### Load-balancer endpoints not responding
+### Load-balanced endpoints not responding
 
 **Problem**: Endpoints return 502 or timeout
 - Ensure workers are properly deployed with `flash deploy`
-- Check worker logs via RunPod console
-- Verify `method` and `path` parameters match your HTTP requests
+- Check worker logs via Runpod console
+- Verify route paths match your HTTP requests
 - Confirm the resource configuration (GPU/CPU types) is available
 
 ### ValueError not mapping to 400 responses
@@ -457,23 +376,23 @@ For current pricing, see [RunPod Pricing](https://www.runpod.io/pricing).
 ### Workers not starting
 
 **Problem**: Workers fail to initialize
-- Check that all dependencies in `dependencies` parameter are available
+- Check that all `dependencies` are available
 - Verify the container image has required system packages
 - Check worker function imports and module availability
-- Review worker logs in the RunPod console
+- Review worker logs in the Runpod console
 
 ### Mixed latency in responses
 
 **Problem**: Some requests are fast, others are slow
-- Load-balancer uses direct HTTP routing (no queue)
+- Load-balanced endpoints use direct HTTP routing (no queue)
 - First request to a cold worker will be slower (initialization)
-- Adjust `workersMin` to keep workers warm if consistent low latency is critical
-- Consider using `idleTimeout` to reduce cold starts
+- Set `workers=(1, N)` to keep workers warm if consistent low latency is critical
+- Adjust `idle_timeout` to reduce cold starts
 
 ## Next Steps
 
 1. Explore the endpoints via Swagger UI (`/docs`)
-2. Modify the `@remote` functions to add your logic
-3. Add new routes with different `method` and `path` values
-4. Deploy to RunPod when ready
+2. Modify the route functions to add your logic
+3. Add new routes with different HTTP methods
+4. Deploy to Runpod when ready
 5.
Monitor performance and scaling behavior diff --git a/03_advanced_workers/05_load_balancer/cpu_lb.py b/03_advanced_workers/05_load_balancer/cpu_lb.py index 55949c4..47f1d95 100644 --- a/03_advanced_workers/05_load_balancer/cpu_lb.py +++ b/03_advanced_workers/05_load_balancer/cpu_lb.py @@ -1,20 +1,18 @@ -# CPU load-balanced endpoints with custom HTTP routes. -# Run with: flash run -# Test directly: python cpu_lb.py -from runpod_flash import CpuLiveLoadBalancer, remote +# cpu load-balanced endpoints with custom HTTP routes. +# run with: flash run +# test directly: python cpu_lb.py +from runpod_flash import Endpoint -cpu_config = CpuLiveLoadBalancer( - name="03_05_load_balancer_cpu", -) +api = Endpoint(name="03_05_load_balancer_cpu", cpu="cpu3c-1-2") -@remote(cpu_config, method="GET", path="/health") +@api.get("/health") async def cpu_health() -> dict: """Health check endpoint for CPU service.""" return {"status": "healthy", "service": "cpu"} -@remote(cpu_config, method="POST", path="/validate") +@api.post("/validate") async def validate_data(text: str) -> dict: """Validate and analyze text data. @@ -32,7 +30,6 @@ async def validate_data(text: str) -> dict: start_time = time.time() - # Simple text analysis words = text.split() char_count = len(text) word_count = len(words) @@ -51,7 +48,7 @@ async def validate_data(text: str) -> dict: } -@remote(cpu_config, method="POST", path="/transform") +@api.post("/transform") async def transform_data(text: str, operation: str = "uppercase") -> dict: """Transform text data. 
@@ -76,7 +73,6 @@ async def transform_data(text: str, operation: str = "uppercase") -> dict: start_time = time.time() result = "" - # Perform transformation if operation == "uppercase": result = text.upper() elif operation == "lowercase": diff --git a/03_advanced_workers/05_load_balancer/gpu_lb.py b/03_advanced_workers/05_load_balancer/gpu_lb.py index 28001c6..a17301a 100644 --- a/03_advanced_workers/05_load_balancer/gpu_lb.py +++ b/03_advanced_workers/05_load_balancer/gpu_lb.py @@ -1,26 +1,23 @@ -# GPU load-balanced endpoints with custom HTTP routes. -# Run with: flash run -# Test directly: python gpu_lb.py -from runpod_flash import LiveLoadBalancer, remote +# gpu load-balanced endpoints with custom HTTP routes. +# run with: flash run +# test directly: python gpu_lb.py +from runpod_flash import Endpoint, GpuGroup -gpu_config = LiveLoadBalancer( - name="03_05_load_balancer_gpu", - workersMin=1, -) +api = Endpoint(name="03_05_load_balancer_gpu", gpu=GpuGroup.ANY, workers=(1, 3)) -@remote(gpu_config, method="GET", path="/health") +@api.get("/health") async def gpu_health() -> dict: """Health check endpoint for GPU service.""" return {"status": "healthy", "service": "gpu"} -@remote(gpu_config, method="POST", path="/compute") -async def compute_intensive(numbers: list[float]) -> dict: +@api.post("/compute") +async def compute_intensive(request: dict) -> dict: """Perform compute-intensive operation on GPU. 
     Args:
-        numbers: List of numbers to process
+        request: Request dict with numbers to process
 
     Returns:
         Computation results
@@ -28,13 +25,13 @@
     import time
     from datetime import datetime, timezone
 
+    numbers = request.get("numbers", [])
     start_time = time.time()
 
-    # Simulate GPU-intensive computation
     result = sum(x**2 for x in numbers)
     mean = sum(numbers) / len(numbers) if numbers else 0
     max_val = max(numbers) if numbers else None
     min_val = min(numbers) if numbers else None
 
     compute_time = (time.time() - start_time) * 1000
 
@@ -50,7 +47,7 @@
     }
 
 
-@remote(gpu_config, method="GET", path="/info")
+@api.get("/info")
 async def gpu_info() -> dict:
     """Get GPU availability information."""
     try:
@@ -81,7 +78,8 @@ async def test():
         print(f"   {result}\n")
 
     print("2. Compute intensive:")
-    result = await compute_intensive([1.0, 2.0, 3.0, 4.0, 5.0])
+    request_data = {"numbers": [1, 2, 3, 4, 5]}
+    result = await compute_intensive(request_data)
     print(f"   Sum of squares: {result['sum_of_squares']}")
     print(f"   Mean: {result['mean']}\n")
diff --git a/03_advanced_workers/README.md b/03_advanced_workers/README.md
index 3a1607b..89e4550 100644
--- a/03_advanced_workers/README.md
+++ b/03_advanced_workers/README.md
@@ -20,8 +20,7 @@ Load-balancer endpoints with custom HTTP routes.
 - Low-latency inference services
 
 **Resources:**
-- `LiveLoadBalancer` - Local development
-- `LoadBalancerSlsResource` - Production deployment
+- `Endpoint` with route decorators (`.get()`, `.post()`, etc.)
 
 ### 01_streaming _(coming soon)_
 Streaming responses with Server-Sent Events (SSE) and WebSockets.
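The statistics the `/compute` route returns can be checked client-side before hitting the endpoint; a minimal sketch (the `summarize` helper and its empty-list fallbacks are illustrative, not part of the Flash API):

```python
def summarize(numbers: list[float]) -> dict:
    """Replicate the /compute route's statistics locally."""
    return {
        "sum_of_squares": sum(x**2 for x in numbers),
        "mean": sum(numbers) / len(numbers) if numbers else 0,
        "max": max(numbers) if numbers else None,
        "min": min(numbers) if numbers else None,
    }


# same payload as the smoke test in gpu_lb.py
print(summarize([1.0, 2.0, 3.0, 4.0, 5.0]))
# {'sum_of_squares': 55.0, 'mean': 3.0, 'max': 5.0, 'min': 1.0}
```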
diff --git a/04_scaling_performance/01_autoscaling/README.md b/04_scaling_performance/01_autoscaling/README.md index 2baa7cf..ddb76de 100644 --- a/04_scaling_performance/01_autoscaling/README.md +++ b/04_scaling_performance/01_autoscaling/README.md @@ -36,18 +36,18 @@ curl -X POST http://localhost:8888/cpu_worker/runsync \ ### GPU Workers (`gpu_worker.py`) -| Strategy | workersMin | workersMax | idleTimeout | scalerType | scalerValue | Use Case | -|----------|-----------|-----------|-------------|------------|-------------|----------| -| Scale to Zero | 0 | 3 | 5 min | QUEUE_DELAY | 4 | Sporadic/batch, cost-first | -| Always On | 1 | 3 | 60 min | QUEUE_DELAY | 4 | Steady traffic, latency-first | -| High Throughput | 2 | 10 | 30 min | REQUEST_COUNT | 3 | Bursty traffic, throughput-first | +| Strategy | workers | idle_timeout | scaler_type | scaler_value | Use Case | +|----------|---------|-------------|-------------|-------------|----------| +| Scale to Zero | (0, 3) | 5 min | QUEUE_DELAY | 4 | Sporadic/batch, cost-first | +| Always On | (1, 3) | 60 min | QUEUE_DELAY | 4 | Steady traffic, latency-first | +| High Throughput | (2, 10) | 30 min | REQUEST_COUNT | 3 | Bursty traffic, throughput-first | ### CPU Workers (`cpu_worker.py`) -| Strategy | workersMin | workersMax | idleTimeout | Use Case | -|----------|-----------|-----------|-------------|----------| -| Scale to Zero | 0 | 5 | 5 min | Cost-optimized preprocessing | -| Burst Ready | 1 | 10 | 30 min | Always-warm API gateway | +| Strategy | workers | idle_timeout | Use Case | +|----------|---------|-------------|----------| +| Scale to Zero | (0, 5) | 5 min | Cost-optimized preprocessing | +| Burst Ready | (1, 10) | 30 min | Always-warm API gateway | ## How Autoscaling Works @@ -56,39 +56,75 @@ Requests arrive | v +-------------------+ -| Request Queue | <-- scalerType monitors this +| Request Queue | <-- scaler_type monitors this +-------------------+ | v +-------------------+ scale up | Scaler Logic | 
----------------> Start new workers -| (QUEUE_DELAY or | (up to workersMax) +| (QUEUE_DELAY or | (up to workers max) | REQUEST_COUNT) | +-------------------+ | v -+-------------------+ idle > idleTimeout ++-------------------+ idle > idle_timeout | Active Workers | ----------------> Terminate worker -| (workersMin..Max) | (down to workersMin) +| (min..max) | (down to workers min) +-------------------+ ``` **Scaler types:** -- **QUEUE_DELAY** -- Scales based on how long requests wait in the queue. `scalerValue` is the target queue delay in seconds. Good for latency-sensitive workloads. -- **REQUEST_COUNT** -- Scales based on pending request count per worker. `scalerValue` is the target requests per worker. Good for throughput-sensitive workloads. +- **QUEUE_DELAY** -- Scales based on how long requests wait in the queue. `scaler_value` is the target queue delay in seconds. Good for latency-sensitive workloads. +- **REQUEST_COUNT** -- Scales based on pending request count per worker. `scaler_value` is the target requests per worker. Good for throughput-sensitive workloads. 
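A toy model helps build intuition for the two triggers (the real scaler runs inside Runpod's control plane; this sketch only mirrors the behavior described above, and every name in it is an assumption for illustration):

```python
def desired_workers(
    scaler_type: str,
    scaler_value: int,
    queue_delay_s: float,
    pending_requests: int,
    current_workers: int,
    workers: tuple[int, int] = (0, 3),
) -> int:
    """Toy scaler: scale up when the monitored metric exceeds its target."""
    lo, hi = workers
    if scaler_type == "QUEUE_DELAY":
        # requests waiting longer than scaler_value seconds -> add a worker
        want = current_workers + 1 if queue_delay_s > scaler_value else current_workers
    else:  # REQUEST_COUNT
        # keep at most scaler_value pending requests per worker (ceil division)
        want = -(-pending_requests // scaler_value)
    return max(lo, min(hi, want))


# a 6s queue delay exceeds the 4s target, so one worker becomes two
print(desired_workers("QUEUE_DELAY", 4, queue_delay_s=6.0, pending_requests=8, current_workers=1))  # 2
```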
## Configuration Reference | Parameter | Type | Default | Description | |-----------|------|---------|-------------| -| `workersMin` | int | 0 | Minimum workers kept warm (0 = scale to zero) | -| `workersMax` | int | 3 | Maximum concurrent workers | -| `idleTimeout` | int | 5 | Minutes before idle workers terminate | -| `scalerType` | ServerlessScalerType | QUEUE_DELAY | Scaling trigger metric | -| `scalerValue` | int | 4 | Target value for the scaler metric | -| `gpus` | list[GpuGroup] | -- | GPU types for LiveServerless | -| `instanceIds` | list[CpuInstanceType] | -- | CPU instance types for CpuLiveServerless | +| `workers` | int or (min, max) | (0, 1) | Worker scaling bounds | +| `idle_timeout` | int | 60 | Minutes before idle workers terminate | +| `scaler_type` | ServerlessScalerType | QUEUE_DELAY | Scaling trigger metric | +| `scaler_value` | int | 4 | Target value for the scaler metric | +| `gpu` | GpuGroup or GpuType | ANY | GPU type for GPU endpoints | +| `cpu` | CpuInstanceType or str | -- | CPU instance type for CPU endpoints | + +### Example Configurations + +```python +from runpod_flash import Endpoint, GpuGroup, ServerlessScalerType + +# scale to zero, cost-optimized +@Endpoint( + name="batch-worker", + gpu=GpuGroup.ANY, + workers=(0, 3), + idle_timeout=5, + scaler_type=ServerlessScalerType.QUEUE_DELAY, + scaler_value=4, +) +async def batch_process(payload: dict) -> dict: ... + +# always-on, latency-optimized +@Endpoint( + name="api-worker", + gpu=GpuGroup.ANY, + workers=(1, 3), + idle_timeout=60, +) +async def api_process(payload: dict) -> dict: ... + +# high-throughput, burst-optimized +@Endpoint( + name="burst-worker", + gpu=GpuGroup.ANY, + workers=(2, 10), + idle_timeout=30, + scaler_type=ServerlessScalerType.REQUEST_COUNT, + scaler_value=3, +) +async def burst_process(payload: dict) -> dict: ... +``` ## Cost Analysis @@ -96,14 +132,14 @@ Requests arrive Assumptions: GPU cost ~$0.0015/sec, 8 hours of actual compute per day. 
-**Scale to Zero (workersMin=0):** +**Scale to Zero (`workers=(0, 3)`):** ``` Compute: 8h x 3600s x $0.0015 = $43.20/day Cold starts: ~5-30s penalty per scale-up event Monthly: ~$1,296 ``` -**Always On (workersMin=1):** +**Always On (`workers=(1, 3)`):** ``` Baseline: 24h x 3600s x $0.0015 = $129.60/day (1 worker always running) Extra compute: handled by autoscaling @@ -148,28 +184,28 @@ python load_test.py --endpoint /cpu_worker/runsync --requests 100 --concurrency ### Interpreting Results -- **First burst p95 vs second burst p95**: If pause > idleTimeout, second burst includes cold start latency. A large gap indicates scale-down occurred. -- **Error rate > 0%**: Workers may be overwhelmed. Increase `workersMax` or reduce `scalerValue`. +- **First burst p95 vs second burst p95**: If pause > idle_timeout, second burst includes cold start latency. A large gap indicates scale-down occurred. +- **Error rate > 0%**: Workers may be overwhelmed. Increase the max workers or reduce `scaler_value`. - **High p99 with low p50**: Queue delay is building up. Consider switching to `REQUEST_COUNT` scaler. ## Cold Start Mitigation -Cold starts occur when `workersMin=0` and all workers have been terminated. +Cold starts occur when `workers=(0, N)` and all workers have been terminated. **Strategies:** -1. **Set workersMin=1** -- Keeps one worker warm. Simplest fix, costs ~$130/day for GPU. -2. **Increase idleTimeout** -- Workers stay alive longer between requests. Good for intermittent traffic. +1. **Set `workers=(1, N)`** -- Keeps one worker warm. Simplest fix, costs ~$130/day for GPU. +2. **Increase `idle_timeout`** -- Workers stay alive longer between requests. Good for intermittent traffic. 3. **Warm-up requests** -- Send periodic health checks to prevent scale-down. Application-level solution. 4. **Optimize container startup** -- Reduce model load time with smaller models, model caching, or quantization. 
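The daily figures quoted in the cost analysis (and the ~$130/day in strategy 1) follow from simple arithmetic; a quick sketch using the README's assumed rate of ~$0.0015/sec, which is not a quoted Runpod price:

```python
GPU_RATE = 0.0015  # assumed $/sec from the example above, not actual pricing


def daily_cost(hours_running: float, rate: float = GPU_RATE) -> float:
    """Cost of one worker running for the given number of hours in a day."""
    return hours_running * 3600 * rate


# scale to zero: pay only for ~8h of actual compute
print(round(daily_cost(8), 2))    # 43.2
# always on: one worker warm for 24h, extra compute billed on top
print(round(daily_cost(24), 2))   # 129.6
# monthly total for the scale-to-zero case
print(round(daily_cost(8) * 30))  # 1296
```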
## Production Checklist - [ ] Choose scaling strategy based on traffic pattern (sporadic, steady, bursty) -- [ ] Set `workersMax` based on budget and peak load -- [ ] Set `idleTimeout` based on traffic gaps (longer = fewer cold starts, higher cost) -- [ ] Choose `scalerType` based on priority (latency = QUEUE_DELAY, throughput = REQUEST_COUNT) -- [ ] Tune `scalerValue` after load testing (lower = more aggressive scaling) +- [ ] Set max workers based on budget and peak load +- [ ] Set `idle_timeout` based on traffic gaps (longer = fewer cold starts, higher cost) +- [ ] Choose `scaler_type` based on priority (latency = QUEUE_DELAY, throughput = REQUEST_COUNT) +- [ ] Tune `scaler_value` after load testing (lower = more aggressive scaling) - [ ] Run load test to verify scaling behavior - [ ] Monitor cold start frequency in production - [ ] Set up cost alerts for unexpected scaling diff --git a/04_scaling_performance/01_autoscaling/cpu_worker.py b/04_scaling_performance/01_autoscaling/cpu_worker.py index f7e82d2..a508158 100644 --- a/04_scaling_performance/01_autoscaling/cpu_worker.py +++ b/04_scaling_performance/01_autoscaling/cpu_worker.py @@ -1,30 +1,17 @@ -# CPU autoscaling strategies -- scale-to-zero and burst-ready. -# Run with: flash run -# Test directly: python cpu_worker.py -from runpod_flash import CpuInstanceType, CpuLiveServerless, remote - -# --- Strategy 1: Scale to Zero --- -# Cost-optimized for preprocessing tasks that tolerate cold starts. -cpu_scale_to_zero_config = CpuLiveServerless( - name="04_01_cpu_scale_to_zero", - instanceIds=[CpuInstanceType.CPU3C_1_2], - workersMin=0, - workersMax=5, - idleTimeout=5, -) - -# --- Strategy 2: Burst Ready --- -# Always-warm worker for API gateway or latency-sensitive CPU tasks. -cpu_burst_ready_config = CpuLiveServerless( - name="04_01_cpu_burst_ready", - instanceIds=[CpuInstanceType.CPU3G_2_8], - workersMin=1, - workersMax=10, - idleTimeout=30, -) +# cpu autoscaling strategies -- scale-to-zero and burst-ready. 
+# run with: flash run +# test directly: python cpu_worker.py +from runpod_flash import CpuInstanceType, Endpoint -@remote(resource_config=cpu_scale_to_zero_config) +# --- strategy 1: scale to zero --- +# cost-optimized for preprocessing tasks that tolerate cold starts. +@Endpoint( + name="04_01_cpu_scale_to_zero", + cpu=CpuInstanceType.CPU3C_1_2, + workers=(0, 5), + idle_timeout=5, +) async def cpu_scale_to_zero(payload: dict) -> dict: """CPU worker with scale-to-zero -- cost-optimized preprocessing.""" import hashlib @@ -35,7 +22,6 @@ async def cpu_scale_to_zero(payload: dict) -> dict: text = payload.get("text", "") - # Simulate CPU-bound preprocessing: hashing, serialization, string ops text_hash = hashlib.sha256(text.encode()).hexdigest() normalized = " ".join(text.lower().split()) tokens = normalized.split() @@ -58,7 +44,14 @@ async def cpu_scale_to_zero(payload: dict) -> dict: } -@remote(resource_config=cpu_burst_ready_config) +# --- strategy 2: burst ready --- +# always-warm worker for API gateway or latency-sensitive CPU tasks. +@Endpoint( + name="04_01_cpu_burst_ready", + cpu=CpuInstanceType.CPU3G_2_8, + workers=(1, 10), + idle_timeout=30, +) async def cpu_burst_ready(payload: dict) -> dict: """CPU worker with burst-ready scaling -- always-warm for low latency.""" import hashlib @@ -69,7 +62,6 @@ async def cpu_burst_ready(payload: dict) -> dict: text = payload.get("text", "") - # Simulate CPU-bound API gateway work: validation, transformation, routing text_hash = hashlib.sha256(text.encode()).hexdigest() words = text.split() word_lengths = [len(w) for w in words] diff --git a/04_scaling_performance/01_autoscaling/gpu_worker.py b/04_scaling_performance/01_autoscaling/gpu_worker.py index 67f3889..ca982a2 100644 --- a/04_scaling_performance/01_autoscaling/gpu_worker.py +++ b/04_scaling_performance/01_autoscaling/gpu_worker.py @@ -1,49 +1,20 @@ -# GPU autoscaling strategies -- scale-to-zero, always-on, high-throughput. 
-# Run with: flash run -# Test directly: python gpu_worker.py -from runpod_flash import GpuGroup, LiveServerless, ServerlessScalerType, remote - -# --- Strategy 1: Scale to Zero --- -# Sporadic or batch workloads where cost matters more than cold-start latency. -# Workers scale down to zero after 5 minutes of idle time. -scale_to_zero_config = LiveServerless( - name="04_01_scale_to_zero", - gpus=[GpuGroup.ANY], - workersMin=0, - workersMax=3, - idleTimeout=5, - scalerType=ServerlessScalerType.QUEUE_DELAY, - scalerValue=4, -) +# gpu autoscaling strategies -- scale-to-zero, always-on, high-throughput. +# run with: flash run +# test directly: python gpu_worker.py +from runpod_flash import Endpoint, GpuGroup, ServerlessScalerType -# --- Strategy 2: Always On --- -# Steady traffic where low latency matters more than cost. -# At least one worker stays warm to avoid cold starts. -always_on_config = LiveServerless( - name="04_01_always_on", - gpus=[GpuGroup.ANY], - workersMin=1, - workersMax=3, - idleTimeout=60, - scalerType=ServerlessScalerType.QUEUE_DELAY, - scalerValue=4, -) -# --- Strategy 3: High Throughput --- -# Bursty traffic where throughput matters most. -# Starts with 2 warm workers, scales aggressively to 10 based on request count. -high_throughput_config = LiveServerless( - name="04_01_high_throughput", - gpus=[GpuGroup.ANY], - workersMin=2, - workersMax=10, - idleTimeout=30, - scalerType=ServerlessScalerType.REQUEST_COUNT, - scalerValue=3, +# --- strategy 1: scale to zero --- +# sporadic or batch workloads where cost matters more than cold-start latency. +# workers scale down to zero after 5 minutes of idle time. 
+@Endpoint( + name="04_01_scale_to_zero", + gpu=GpuGroup.ANY, + workers=(0, 3), + idle_timeout=5, + scaler_type=ServerlessScalerType.QUEUE_DELAY, + scaler_value=4, ) - - -@remote(resource_config=scale_to_zero_config) async def scale_to_zero_inference(payload: dict) -> dict: """GPU inference with scale-to-zero -- cost-optimized for sporadic workloads.""" import asyncio @@ -79,7 +50,17 @@ async def scale_to_zero_inference(payload: dict) -> dict: } -@remote(resource_config=always_on_config) +# --- strategy 2: always on --- +# steady traffic where low latency matters more than cost. +# at least one worker stays warm to avoid cold starts. +@Endpoint( + name="04_01_always_on", + gpu=GpuGroup.ANY, + workers=(1, 3), + idle_timeout=60, + scaler_type=ServerlessScalerType.QUEUE_DELAY, + scaler_value=4, +) async def always_on_inference(payload: dict) -> dict: """GPU inference with always-on worker -- latency-optimized for steady traffic.""" import asyncio @@ -115,7 +96,17 @@ async def always_on_inference(payload: dict) -> dict: } -@remote(resource_config=high_throughput_config) +# --- strategy 3: high throughput --- +# bursty traffic where throughput matters most. +# starts with 2 warm workers, scales aggressively to 10 based on request count. +@Endpoint( + name="04_01_high_throughput", + gpu=GpuGroup.ANY, + workers=(2, 10), + idle_timeout=30, + scaler_type=ServerlessScalerType.REQUEST_COUNT, + scaler_value=3, +) async def high_throughput_inference(payload: dict) -> dict: """GPU inference with high-throughput scaling -- optimized for bursty traffic.""" import asyncio diff --git a/05_data_workflows/01_network_volumes/README.md b/05_data_workflows/01_network_volumes/README.md index 6c82f18..d8d325c 100644 --- a/05_data_workflows/01_network_volumes/README.md +++ b/05_data_workflows/01_network_volumes/README.md @@ -68,8 +68,8 @@ Visit `http://localhost:8888/docs` for interactive API documentation. 
``` 01_network_volumes/ -├── gpu_worker.py # Stable Diffusion worker with @remote -├── cpu_worker.py # List and serve images with @remote +├── gpu_worker.py # Stable Diffusion worker with @Endpoint +├── cpu_worker.py # List and serve images with Endpoint routes ├── requirements.txt └── README.md ``` @@ -78,7 +78,7 @@ Visit `http://localhost:8888/docs` for interactive API documentation. ### POST /gpu_worker/runsync -GPU worker (QB, class-based `@remote`). Generates an image and saves it to the shared volume. +GPU worker (QB, class-based `@Endpoint`). Generates an image and saves it to the shared volume. **Request**: ```json diff --git a/05_data_workflows/01_network_volumes/cpu_worker.py b/05_data_workflows/01_network_volumes/cpu_worker.py index d947c48..d2c02cf 100644 --- a/05_data_workflows/01_network_volumes/cpu_worker.py +++ b/05_data_workflows/01_network_volumes/cpu_worker.py @@ -1,21 +1,22 @@ -# CPU worker with network volume for listing and serving generated images. -# Run with: flash run -# Test directly: python cpu_worker.py -from runpod_flash import CpuLiveLoadBalancer, NetworkVolume, remote +# cpu worker with network volume for listing and serving generated images. 
+# run with: flash run +# test directly: python cpu_worker.py +from runpod_flash import Endpoint, NetworkVolume volume = NetworkVolume( name="flash-05-volume", size=50, ) -cpu_config = CpuLiveLoadBalancer( +api = Endpoint( name="05_01_cpu_worker", - workersMin=1, - networkVolume=volume, + cpu="cpu3c-1-2", + workers=(1, 3), + volume=volume, ) -@remote(resource_config=cpu_config, path="/images", method="GET") +@api.get("/images") async def list_images_in_volume() -> dict: """List generated images from the shared volume.""" import os @@ -30,7 +31,7 @@ async def list_images_in_volume() -> dict: } -@remote(resource_config=cpu_config, path="/images/{file_name}", method="GET") +@api.get("/images/{file_name}") async def get_image_from_volume(file_name: str) -> dict: """Get image metadata from the shared volume.""" import base64 diff --git a/05_data_workflows/01_network_volumes/gpu_worker.py b/05_data_workflows/01_network_volumes/gpu_worker.py index c948228..6a0019e 100644 --- a/05_data_workflows/01_network_volumes/gpu_worker.py +++ b/05_data_workflows/01_network_volumes/gpu_worker.py @@ -1,9 +1,9 @@ -# GPU worker with network volume for Stable Diffusion image generation. -# Run with: flash run -# Test directly: python gpu_worker.py +# gpu worker with network volume for Stable Diffusion image generation. 
+# run with: flash run +# test directly: python gpu_worker.py import logging -from runpod_flash import GpuGroup, LiveServerless, NetworkVolume, remote +from runpod_flash import Endpoint, GpuGroup, NetworkVolume logger = logging.getLogger(__name__) @@ -14,18 +14,16 @@ size=50, ) -gpu_config = LiveServerless( + +@Endpoint( name="05_01_gpu_worker", - gpus=[GpuGroup.ANY], - workersMin=0, - workersMax=3, - idleTimeout=5, - networkVolume=volume, + gpu=GpuGroup.ANY, + workers=(0, 3), + idle_timeout=5, + volume=volume, env={"HF_HUB_CACHE": MODEL_PATH, "MODEL_PATH": MODEL_PATH}, + dependencies=["diffusers", "torch", "transformers"], ) - - -@remote(resource_config=gpu_config, dependencies=["diffusers", "torch", "transformers"]) class SimpleSD: def __init__(self): import gc diff --git a/CLAUDE.md b/CLAUDE.md index 3804c9c..a9a348c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,73 +4,85 @@ ## Project Overview -Production-ready examples demonstrating Flash framework capabilities. Flat-file pattern: each worker is a standalone `.py` file with `@remote` decorator, auto-discovered by `flash run`. 6 categories, 18 worker files, 31 `@remote` endpoints. Root `pyproject.toml` declares only `runpod-flash` dependency; runtime deps declared inline via `@remote(dependencies=[...])`. +Production-ready examples demonstrating Flash framework capabilities. Flat-file pattern: each worker is a standalone `.py` file with `@Endpoint` decorator, auto-discovered by `flash run`. 6 categories, 18 worker files. Root `pyproject.toml` declares only `runpod-flash` dependency; runtime deps declared inline via `Endpoint(dependencies=[...])`. ## Architecture ### Key Abstractions -1. **@remote decorator (function)** -- Core pattern. `async def` marked for remote execution. All 18 worker files use this. -2. **@remote decorator (class)** -- Used on `SimpleSD` class (`05_data_workflows`). Class-based pattern for stateful workers. -3. 
**Resource config types** -- `LiveServerless` (GPU), `CpuLiveServerless` (CPU), `CpuLiveLoadBalancer` (CPU LB), `LiveLoadBalancer` (GPU LB). Module-level config objects. -4. **Cross-worker orchestration** -- Pipeline files import from QB workers, chain with `await`. LB imports from QB workers. +1. **@Endpoint decorator (QB)** -- Core pattern. `async def` marked with `@Endpoint(name=..., gpu=..., ...)` for queue-based remote execution. +2. **Endpoint routes (LB)** -- Load-balanced pattern. `api = Endpoint(...)` with `@api.get()/@api.post()` route decorators for HTTP endpoints. +3. **@Endpoint decorator (class)** -- Used on `SimpleSD` class (`05_data_workflows`). Class-based pattern for stateful workers. +4. **Cross-worker orchestration** -- Pipeline files import from QB workers, chain with `await`. LB endpoint orchestrates QB workers. 5. **Flat-file discovery** -- No FastAPI boilerplate, no routers, no `main.py`. `flash run` auto-generates routes from decorated functions. -6. **In-function imports** -- Heavy libs (torch, transformers, etc.) imported inside `@remote` body, only `runpod_flash` at module level. +6. **In-function imports** -- Heavy libs (torch, transformers, etc.) imported inside `@Endpoint` body, only `runpod_flash` at module level. ### Entry Points -All 18 worker files across 6 categories. Total: 30 `@remote` functions + 1 `@remote` class. Each file is an independent entry point discovered by `flash run`. +All worker files across 6 categories. Each file is an independent entry point discovered by `flash run`. 
### Module Structure ``` 01_getting_started/ # Fundamentals - 01_single_gpu_worker/ # Basic GPU worker + 01_hello_world/ # Basic GPU worker 02_cpu_worker/ # CPU-only worker - 03_pipeline/ # Cross-worker orchestration (CPU -> GPU -> LB) + 03_mixed_workers/ # Cross-worker orchestration (CPU -> GPU -> LB) 04_dependencies/ # Runtime dependency declaration - 05_multi_resource/ # Multiple resource types in one project 02_ml_inference/ # ML deployment 01_text_to_speech/ # Qwen3-TTS model serving 03_advanced_workers/ # Advanced patterns - 01_lb_endpoint/ # LB endpoints with custom HTTP methods + 05_load_balancer/ # LB endpoints with custom HTTP routes 04_scaling_performance/ # Autoscaling 01_autoscaling/ # Scaling strategy examples 05_data_workflows/ # Data pipelines - 01_network_volumes/ # Network volume usage - 02_stable_diffusion/ # Stable Diffusion with @remote class + 01_network_volumes/ # Network volume usage with @Endpoint class 06_real_world/ # Placeholder for production patterns ``` -### Worker File Pattern +### Worker File Patterns +**Queue-based (function decorator):** ```python -from runpod_flash import GpuGroup, LiveServerless, remote - -gpu_config = LiveServerless( - name="01_01_gpu_worker", - gpus=[GpuGroup.ANY], - workersMin=0, - workersMax=3, - idleTimeout=5, -) +from runpod_flash import Endpoint, GpuGroup -@remote(resource_config=gpu_config) +@Endpoint( + name="my-worker", + gpu=GpuGroup.ANY, + workers=(0, 3), + idle_timeout=5, +) async def my_function(payload: dict) -> dict: """All runtime imports inside the function body.""" import torch - # implementation return {"status": "success"} ``` -### Resource Types +**Load-balanced (route decorators):** +```python +from runpod_flash import Endpoint + +api = Endpoint(name="my-api", cpu="cpu3c-1-2", workers=(1, 3)) + +@api.post("/process") +async def process(data: dict) -> dict: + return {"result": data} + +@api.get("/health") +async def health() -> dict: + return {"status": "ok"} +``` + +### Resource 
Configuration + +GPU vs CPU is a parameter, not a class choice: -| Type | Import | Use Case | -|------|--------|----------| -| `LiveServerless` | `from runpod_flash import LiveServerless, GpuGroup` | GPU workers (9 files) | -| `CpuLiveServerless` | `from runpod_flash import CpuLiveServerless, CpuInstanceType` | CPU serverless (4 files) | -| `CpuLiveLoadBalancer` | `from runpod_flash import CpuLiveLoadBalancer` | CPU LB endpoints (4 files) | -| `LiveLoadBalancer` | `from runpod_flash import LiveLoadBalancer` | GPU LB endpoints (1 file) | +| Config | Syntax | Use Case | +|--------|--------|----------| +| GPU endpoint | `@Endpoint(name=..., gpu=GpuGroup.ANY)` | GPU workers | +| CPU endpoint | `@Endpoint(name=..., cpu="cpu3c-1-2")` | CPU workers | +| GPU LB | `api = Endpoint(name=..., gpu=GpuGroup.ANY); @api.post(...)` | GPU LB endpoints | +| CPU LB | `api = Endpoint(name=..., cpu="cpu3c-1-2"); @api.post(...)` | CPU LB endpoints | ### Cross-Worker Orchestration @@ -79,11 +91,11 @@ Pipeline files import functions from other workers and chain them: ```python from cpu_worker import preprocess_text from gpu_worker import gpu_inference -from runpod_flash import CpuLiveLoadBalancer, remote +from runpod_flash import Endpoint -pipeline_config = CpuLiveLoadBalancer(name="pipeline", workersMin=1) +pipeline = Endpoint(name="pipeline", cpu="cpu3c-1-2", workers=(1, 3)) -@remote(resource_config=pipeline_config, method="POST", path="/classify") +@pipeline.post("/classify") async def classify(text: str) -> dict: result = await preprocess_text({"text": text}) return await gpu_inference(result) @@ -95,21 +107,17 @@ All examples import from `runpod_flash`. 
Import frequency by symbol:
 
 | Symbol | Files Using It | Breakage Risk |
 |--------|---------------|---------------|
-| `remote` | 18 | ALL examples break |
-| `LiveServerless` | 9 | GPU examples break |
+| `Endpoint` | 18 | ALL examples break |
 | `GpuGroup` | 7 | GPU config breaks |
-| `CpuLiveServerless` | 4 | CPU examples break |
 | `CpuInstanceType` | 4 | CPU config breaks |
-| `CpuLiveLoadBalancer` | 4 | Pipeline examples break |
 | `NetworkVolume` | 2 | Volume examples break |
-| `LiveLoadBalancer` | 1 | LB example breaks |
 | `ServerlessScalerType` | 1 | Scaling example breaks |
 
 ## Cross-Repo Dependencies
 
 ### Depends On
 
-- **flash** (`runpod_flash` package) -- all 18 files import from it. Any breaking change to `@remote` signature, resource config constructors, or enum values breaks examples at import time.
+- **flash** (`runpod_flash` package) -- all files import from it. Any breaking change to the `Endpoint` constructor, enum values, or route decorator signatures breaks examples at import time.
 
 ### Depended On By
 
@@ -117,9 +125,9 @@ All examples import from `runpod_flash`. Import frequency by symbol:
 
 ### Interface Contracts
 
-- `@remote(resource_config=...)` decorator signature -- any parameter rename or removal breaks all 18 files
-- Resource config constructors (`LiveServerless`, `CpuLiveServerless`, etc.) -- field name changes break config objects
-- `GpuGroup`, `CpuInstanceType` enum values -- value removals break GPU/CPU configs
+- `Endpoint(name=..., gpu=..., cpu=..., workers=...)` constructor -- parameter rename/removal breaks all files
+- `.get()/.post()/.put()/.delete()/.patch()` route decorator signatures -- changes break LB examples
+- `GpuGroup`, `GpuType`, `CpuInstanceType` enum values -- value removals break GPU/CPU configs
 - `NetworkVolume` constructor -- field changes break volume examples
 
 ### Dependency Chain
 
@@ -202,20 +210,20 @@ No formal test infrastructure exists.
Each worker has an optional `if __name__ = To test manually: ```bash -cd 01_getting_started/01_single_gpu_worker +cd 01_getting_started/01_hello_world flash run # Starts dev server, auto-discovers workers # Use http://localhost:8888/docs to invoke endpoints ``` ### Recommended Test Strategy -1. Add `tests/test_imports.py` that imports every worker file (catches `@remote` signature drift) +1. Add `tests/test_imports.py` that imports every worker file (catches `Endpoint` signature drift) 2. Add `tests/test_configs.py` that validates all resource configs construct without error 3. Add CI job that runs `flash run --check` (dry-run mode) against each example category ## Common Mistakes -1. **Accessing external scope in @remote functions** -- only local variables, parameters, and internal imports work. The function body is serialized and sent to a remote worker. +1. **Accessing external scope in @Endpoint functions** -- only local variables, parameters, and internal imports work. The function body is serialized and sent to a remote worker. 2. **Module-level imports of heavy libraries** -- import torch, numpy, transformers, etc. inside the function body, not at module level. 3. **Missing `if __name__ == "__main__"` test block** -- each worker should be independently testable. 4. **Mutable default arguments** -- use `None` and initialize in function body. diff --git a/CLI-REFERENCE.md b/CLI-REFERENCE.md index 9990e0b..a058cf3 100644 --- a/CLI-REFERENCE.md +++ b/CLI-REFERENCE.md @@ -41,7 +41,7 @@ flash --help # Show help for specific command 1. **Start Here**: [Getting Started Guide](docs/cli/getting-started.md) - Create your first project - Run locally and test - - Deploy to RunPod + - Deploy to Runpod 2. 
**Deep Dive**: [Command Reference](docs/cli/commands.md) - Understand all options and parameters @@ -101,7 +101,7 @@ flash init my-api --force ### What It Creates -- `gpu_worker.py` - GPU worker template with `@remote` decorator +- `gpu_worker.py` - GPU worker template with `@Endpoint` decorator - `pyproject.toml` - Project dependencies - `.env.example` - Environment variable template - `.gitignore` - Git ignore patterns @@ -163,7 +163,7 @@ flash run --no-reload **Auto-provision resources:** ```bash flash run --auto-provision -# Automatically creates RunPod endpoints on startup +# Automatically creates Runpod endpoints on startup ``` **Using environment variables:** @@ -175,7 +175,7 @@ flash run ### What It Does -1. Discovers all `.py` files (excluding `.venv/`, `.flash/`, `.runpod/`, `__init__.py`) and finds `@remote` decorated functions via AST parsing +1. Discovers all `.py` files (excluding `.venv/`, `.flash/`, `.runpod/`, `__init__.py`) and finds `@Endpoint` decorated functions via AST parsing 2. Generates a FastAPI application with routes for each discovered function 3. Starts uvicorn development server with hot reload 4. Provides interactive API documentation at `/docs` @@ -238,7 +238,7 @@ flash build -o my-app-v1.0.tar.gz **Exclude packages present in base image:** ```bash flash build --exclude torch,torchvision,torchaudio -# Reduces archive size by excluding packages already in RunPod base images +# Reduces archive size by excluding packages already in Runpod base images ``` **Skip transitive dependencies:** @@ -258,7 +258,7 @@ flash build --use-local-flash 1. Creates `.build/` directory (kept for inspection) 2. Installs dependencies via pip for Linux x86_64 3. Generates `flash_manifest.json` with resource configurations -4. Creates handler files for each `@remote` function +4. Creates handler files for each `@Endpoint` function 5. Packages everything into `artifact.tar.gz` 6. 
Reports archive size (max 500MB for deployment) @@ -297,7 +297,7 @@ The `.build/` directory is preserved for inspection. Check: ## flash deploy -Build and deploy the Flash application to RunPod in a single command. +Build and deploy the Flash application to Runpod in a single command. ### Syntax @@ -356,7 +356,7 @@ flash deploy --preview 1. Runs `flash build` with specified options 2. Validates `RUNPOD_API_KEY` environment variable 3. Selects target environment (auto or via `--env`) -4. Uploads artifact to RunPod +4. Uploads artifact to Runpod 5. Creates/updates serverless endpoints for each resource 6. Displays endpoint URLs and access information 7. Provides next steps for testing @@ -396,7 +396,7 @@ flash deploy --env production # Redeploy with changes ## flash undeploy -Delete deployed RunPod serverless endpoints. +Delete deployed Runpod serverless endpoints. ### Syntax @@ -458,9 +458,9 @@ flash undeploy --cleanup-stale ### What It Does 1. Lists tracked endpoints (from `.runpod/` directory) -2. Verifies endpoints exist on RunPod +2. Verifies endpoints exist on Runpod 3. Prompts for confirmation (unless `--force`) -4. Deletes endpoints via RunPod API +4. Deletes endpoints via Runpod API 5. 
Removes local tracking files ### Use Cases @@ -480,7 +480,7 @@ flash undeploy --interactive **Fix tracking inconsistencies:** ```bash flash undeploy --cleanup-stale -# If endpoints deleted manually via RunPod console +# If endpoints deleted manually via Runpod console ``` ### Related Commands @@ -779,7 +779,7 @@ Flash CLI respects these environment variables: | Variable | Purpose | Used By | |----------|---------|---------| -| `RUNPOD_API_KEY` | RunPod API authentication | `deploy`, `undeploy`, `env`, `app` | +| `RUNPOD_API_KEY` | Runpod API authentication | `deploy`, `undeploy`, `env`, `app` | | `FLASH_HOST` | Default development server host | `run` | | `FLASH_PORT` | Default development server port | `run` | @@ -815,7 +815,7 @@ Flash uses these configuration files: See the [Workflows Guide](docs/cli/workflows.md) for detailed step-by-step instructions: - **Local Development** - Create, run, test, iterate -- **Build and Deploy** - Package and deploy to RunPod +- **Build and Deploy** - Package and deploy to Runpod - **Multi-Environment** - Manage staging and production - **Testing** - Validate before production - **Cleanup** - Remove unused resources diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6083a0a..f42f66b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -45,7 +45,7 @@ All examples must meet these standards: - [ ] Error handling is implemented - [ ] Environment variables are documented - [ ] Dependencies are declared in pyproject.toml -- [ ] Runtime deps declared in `@remote(dependencies=[...])` +- [ ] Runtime deps declared in `Endpoint(dependencies=[...])` - [ ] Example discovered by `flash run` from project root ### 2. Code Quality @@ -132,7 +132,7 @@ flash run ### 5. Verify Discovery -`flash run` auto-discovers all `.py` files containing `@remote` functions. Verify your example loads: +`flash run` auto-discovers all `.py` files containing `@Endpoint` functions. 
Verify your example loads: ```bash # From the repository root @@ -140,7 +140,7 @@ flash run # Check http://localhost:8888/docs for your new endpoints ``` -Runtime dependencies (torch, transformers, etc.) are declared in `@remote(dependencies=[...])` and installed on the remote worker, not locally. +Runtime dependencies (torch, transformers, etc.) are declared in `Endpoint(dependencies=[...])` and installed on the remote worker, not locally. ### 6. Create Pull Request @@ -166,8 +166,8 @@ Each example is a flat directory with self-contained worker files (named `*_work ``` your_example/ ├── README.md # Required: comprehensive documentation -├── gpu_worker.py # Required: GPU worker with @remote decorator -├── cpu_worker.py # Optional: CPU worker with @remote decorator +├── gpu_worker.py # Required: GPU worker with @Endpoint decorator +├── cpu_worker.py # Optional: CPU worker with @Endpoint decorator ├── pyproject.toml # Required: project metadata and dependencies ├── .flashignore # Optional: files to exclude from deployment ├── tests/ # Recommended: test files @@ -177,16 +177,14 @@ your_example/ └── architecture.png ``` -`flash run` discovers all `.py` files with `@remote` functions automatically -- no `main.py`, no `workers/` directories, no router wiring. +`flash run` discovers all `.py` files with `@Endpoint` functions automatically -- no `main.py`, no `workers/` directories, no router wiring. ### Minimal Worker (`gpu_worker.py`) ```python -from runpod_flash import remote, LiveServerless +from runpod_flash import Endpoint, GpuGroup -config = LiveServerless(name="your_worker") - -@remote(resource_config=config, dependencies=["torch"]) +@Endpoint(name="your-worker", gpu=GpuGroup.ANY, dependencies=["torch"]) async def your_function(payload: dict) -> dict: """ Clear docstring explaining what this function does. 
@@ -199,7 +197,7 @@ async def your_function(payload: dict) -> dict:
     """
     import torch
 
-    # Your implementation
+    # your implementation
     result = process(payload)
 
     return {"status": "success", "result": result}
@@ -407,12 +405,10 @@ RUNPOD_API_KEY = "hardcoded_key"  # Never do this!
 
 ### Error Handling
 
 ```python
-# Good - handle errors within @remote functions
-from runpod_flash import remote, LiveServerless
-
-config = LiveServerless(name="processor")
+# Good - handle errors within @Endpoint functions
+from runpod_flash import Endpoint, GpuGroup
 
-@remote(resource_config=config)
+@Endpoint(name="processor", gpu=GpuGroup.ANY)
 async def process(data: dict) -> dict:
     try:
         result = do_work(data)
@@ -421,9 +417,9 @@ async def process(data: dict) -> dict:
         return {"status": "error", "detail": str(e)}
 
 # Bad
-@remote(resource_config=config)
+@Endpoint(name="processor", gpu=GpuGroup.ANY)
 async def process(data: dict) -> dict:
-    result = do_work(data)  # No error handling
+    result = do_work(data)  # no error handling
     return result
 ```
 
@@ -431,7 +427,7 @@ async def process(data: dict) -> dict:
 
 ### pyproject.toml
 
-Each example declares only its local development dependencies in `pyproject.toml`. Runtime deps needed on the GPU (torch, transformers, etc.) go in `@remote(dependencies=[...])`:
+Each example declares only its local development dependencies in `pyproject.toml`. Runtime deps needed on the GPU (torch, transformers, etc.) go in `Endpoint(dependencies=[...])`:
 
 ```toml
 [project]
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 4c14d40..9efc3f5 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -464,17 +464,17 @@ cd 01_getting_started/05_new_example
 touch README.md gpu_worker.py pyproject.toml
 ```
 
-Each worker file (named `*_worker.py` by convention) is self-contained with `@remote` decorated functions. `flash run` discovers all `.py` files with `@remote` functions automatically -- no `main.py`, no `workers/` directories needed.
+Each worker file (named `*_worker.py` by convention) is self-contained with `@Endpoint` decorated functions. `flash run` discovers all `.py` files with `@Endpoint` functions automatically -- no `main.py`, no `workers/` directories needed. **3. Declare dependencies:** -Add a `pyproject.toml` with `runpod-flash` as the only local dependency. Runtime deps (torch, etc.) go in `@remote(dependencies=[...])`. +Add a `pyproject.toml` with `runpod-flash` as the only local dependency. Runtime deps (torch, etc.) go in `Endpoint(dependencies=[...])`. **4. Verify discovery:** ```bash cd ../../ # Back to root -flash run # Discovers all .py files with @remote functions +flash run # Discovers all .py files with @Endpoint functions ``` ### Cleaning Up diff --git a/README.md b/README.md index 04b053c..155773d 100644 --- a/README.md +++ b/README.md @@ -285,9 +285,9 @@ The root [main.py](main.py) provides a programmatic discovery system that automa **Discovery Process**: 1. Scans all example category directories (`01_getting_started/`, `02_ml_inference/`, etc.) 2. Detects two patterns: - - **Single-file workers**: `gpu_worker.py`, `cpu_worker.py` with `APIRouter` exports - - **Directory-based workers**: `workers/gpu/__init__.py`, `workers/cpu/__init__.py` with `APIRouter` exports -3. Dynamically imports and registers all routers with unique prefixes (e.g., `/01_hello_world/gpu/`) + - **Queue-based workers**: `@Endpoint(...)` decorated functions + - **Load-balanced workers**: `Endpoint` instances with `.get()/.post()` route decorators +3. Dynamically imports and registers all endpoints with unique prefixes (e.g., `/01_hello_world/gpu/`) 4. 
Generates metadata and documentation automatically **Benefits**: @@ -303,17 +303,11 @@ Each example follows this structure: ``` example_name/ ├── README.md # Documentation and deployment guide -├── main.py # FastAPI application entry point -├── workers/ # Remote worker functions -│ ├── gpu/ # GPU workers -│ │ ├── __init__.py # FastAPI router -│ │ └── endpoint.py # @remote decorated functions -│ └── cpu/ # CPU workers -│ ├── __init__.py -│ └── endpoint.py -├── requirements.txt # Python dependencies -├── pyproject.toml # Project configuration -└── .env.example # Environment variable template +├── gpu_worker.py # GPU worker with @Endpoint decorator +├── cpu_worker.py # CPU worker with @Endpoint decorator +├── requirements.txt # Python dependencies +├── pyproject.toml # Project configuration +└── .env.example # Environment variable template ``` ### Dependency Management @@ -349,35 +343,57 @@ This automation ensures that `flash run` from the root directory always has acce ## Key Concepts -### Remote Workers +### Endpoint -The `@remote` decorator marks functions for execution on Runpod's serverless infrastructure: +The `Endpoint` class configures functions for execution on Runpod's serverless infrastructure: -```python -from runpod_flash import remote, LiveServerless, GpuGroup +**Queue-based (one function = one endpoint):** -config = LiveServerless( - name="my_worker", - gpus=[GpuGroup.ADA_24], # RTX 4090 - workersMin=0, # Scale to zero when idle - workersMax=3, # Maximum concurrent workers -) +```python +from runpod_flash import Endpoint, GpuGroup -@remote(resource_config=config, dependencies=["torch"]) +@Endpoint(name="my-worker", gpu=GpuGroup.ADA_24, workers=(0, 3), dependencies=["torch"]) async def process(data: dict) -> dict: import torch - # This code runs on Runpod GPUs + # this code runs on Runpod GPUs return {"result": "processed"} ``` +**Load-balanced (multiple routes, shared workers):** + +```python +from runpod_flash import Endpoint + +api = 
Endpoint(name="my-api", cpu="cpu3c-1-2", workers=(1, 3)) + +@api.get("/health") +async def health(): + return {"status": "ok"} + +@api.post("/compute") +async def compute(data: dict) -> dict: + return {"result": data} +``` + +**Client mode (connect to an existing endpoint):** + +```python +from runpod_flash import Endpoint + +ep = Endpoint(id="ep-abc123") +job = await ep.run({"prompt": "hello"}) +await job.wait() +print(job.output) +``` + ### Resource Types -**GPU Workers** (`LiveServerless`): +**GPU Workers** (`gpu=`): - `GpuGroup.ADA_24` - RTX 4090 (24GB) - `GpuGroup.ADA_48_PRO` - RTX 6000 Ada, L40 (48GB) - `GpuGroup.AMPERE_80` - A100 (80GB) -**CPU Workers** (`CpuLiveServerless`): +**CPU Workers** (`cpu=`): - `CpuInstanceType.CPU3G_2_8` - 2 vCPU, 8GB RAM - `CpuInstanceType.CPU3C_4_8` - 4 vCPU, 8GB RAM (Compute) - `CpuInstanceType.CPU5G_4_16` - 4 vCPU, 16GB RAM (Latest) @@ -385,9 +401,9 @@ async def process(data: dict) -> dict: ### Auto-Scaling Workers automatically scale based on demand: -- `workersMin=0` - Scale to zero when idle (cost-efficient) -- `workersMax=N` - Maximum concurrent workers -- `idleTimeout=5` - Minutes before scaling down +- `workers=(0, 3)` - Scale from 0 to 3 workers (cost-efficient) +- `workers=(1, 5)` - Keep 1 warm, scale up to 5 +- `idle_timeout=5` - Minutes before scaling down ## Contributing diff --git a/docs/cli/commands.md b/docs/cli/commands.md index 8e87e9f..ae02f64 100644 --- a/docs/cli/commands.md +++ b/docs/cli/commands.md @@ -58,7 +58,7 @@ If `PROJECT_NAME` is omitted or set to `.`, the project is initialized in the cu project-name/ ├── main.py # FastAPI application entry point ├── mothership.py # Mothership endpoint configuration -├── gpu_worker.py # GPU worker template with @remote decorator +├── gpu_worker.py # GPU worker template with @Endpoint decorator ├── pyproject.toml # Project dependencies and metadata ├── requirements.txt # Pinned dependencies (generated) ├── .env.example # Environment variable template @@ -73,7 +73,7 
@@ project-name/ **main.py** - FastAPI application that loads routers from worker files - Configured for local development and testing -- Automatically discovers `@remote` decorated functions +- Automatically discovers `@Endpoint` decorated functions **mothership.py** - Configures the mothership endpoint (load-balanced FastAPI application endpoint) @@ -81,8 +81,8 @@ project-name/ - Delete this file if you don't want to deploy the mothership endpoint **gpu_worker.py** -- Template worker with `@remote` decorator -- Configured for GPU resources via `LiveServerless` +- Template worker with `@Endpoint` decorator +- Configured for GPU resources via `gpu=` parameter - Contains example endpoint with proper structure **pyproject.toml** @@ -249,14 +249,14 @@ flash run [OPTIONS] ### Description -Starts a local uvicorn development server that runs your Flash application. The server automatically discovers all `@remote` decorated functions and makes them available as HTTP endpoints. Supports hot reloading, custom host/port configuration, and optional resource auto-provisioning. +Starts a local uvicorn development server that runs your Flash application. The server automatically discovers all `@Endpoint` decorated functions and makes them available as HTTP endpoints. Supports hot reloading, custom host/port configuration, and optional resource auto-provisioning. ### Architecture: Hybrid Local + Cloud With `flash run`, your system operates in a **hybrid architecture**: - **Your FastAPI app runs locally** on your machine (localhost:8888) -- **`@remote` functions run on Runpod** as serverless endpoints +- **`@Endpoint` functions run on Runpod** as serverless endpoints - **Hot reload works** because your app code is local—changes are picked up instantly - **Endpoints are prefixed with `live-`** to distinguish from production (e.g., `gpu-worker` becomes `live-gpu-worker`) @@ -371,7 +371,7 @@ flash run --auto-provision This will: 1. Start local development server -2. 
Provision Runpod endpoints for all `@remote` functions +2. Provision Runpod endpoints for all `@Endpoint` functions 3. Allow testing with real Runpod infrastructure 4. Incur Runpod costs (workers will spin up) @@ -398,7 +398,7 @@ Share the URL with team members for testing. 1. **Application Loading**: Imports `main.py` and discovers FastAPI app 2. **Router Discovery**: Scans for APIRouter exports in worker files -3. **Remote Function Discovery**: Finds all `@remote` decorated functions +3. **Remote Function Discovery**: Finds all `@Endpoint` decorated functions 4. **Uvicorn Startup**: Starts ASGI server with specified configuration 5. **Hot Reload Setup**: Watches Python files for changes (if enabled) 6. **Documentation Generation**: Creates OpenAPI schema at `/docs` @@ -588,7 +588,7 @@ flash build [OPTIONS] ### Description -Packages your Flash application and its dependencies into a tar.gz archive suitable for deployment to Runpod. The build process installs dependencies cross-platform (Linux x86_64), generates handler files for each `@remote` function, creates a manifest with resource configurations, and produces a final artifact ready for upload. +Packages your Flash application and its dependencies into a tar.gz archive suitable for deployment to Runpod. The build process installs dependencies cross-platform (Linux x86_64), generates handler files for each `@Endpoint` function, creates a manifest with resource configurations, and produces a final artifact ready for upload. The `.build/` directory is preserved after building for inspection and debugging. @@ -630,12 +630,12 @@ The build command executes these steps: - Respects `--no-deps` and `--exclude` options 3. **Generate Manifest** - - Scans code for `@remote` decorated functions - - Extracts resource configurations (`LiveServerless`, etc.) + - Scans code for `@Endpoint` decorated functions + - Extracts resource configurations (Endpoint params) - Creates `flash_manifest.json` with deployment metadata 4. 
**Generate Handlers** - - Creates handler file for each `@remote` function + - Creates handler file for each `@Endpoint` function - Handlers interface between Runpod and your functions - Includes error handling and serialization logic @@ -827,7 +827,7 @@ The `flash_manifest.json` contains deployment metadata: "resources": [ { "name": "gpu_worker_process_request", - "type": "LiveServerless", + "type": "Endpoint", "config": { "name": "my_api_gpu", "gpus": ["ANY"], @@ -999,7 +999,7 @@ If only one environment exists, it's used automatically. With multiple environme With `flash deploy`, your **entire application** runs on Runpod Serverless: - **Your FastAPI app runs on Runpod** as the "mothership" endpoint -- **`@remote` functions run on Runpod** as separate worker endpoints +- **`@Endpoint` functions run on Runpod** as separate worker endpoints - **Users call the mothership URL** directly (e.g., `https://xyz123.api.runpod.ai/api/hello`) - **No `live-` prefix** on endpoint names—these are production endpoints - **No hot reload**—code changes require a new deployment @@ -1451,10 +1451,10 @@ Deployed Endpoints: ┌──────────────────────────┬─────────────────────┬──────────────────────┬────────────┐ │ Name │ Type │ Environment │ Status │ ├──────────────────────────┼─────────────────────┼──────────────────────┼────────────┤ -│ my-api-gpu │ LiveServerless │ production │ Active │ -│ my-api-mothership │ LiveLoadBalancer │ production │ Active │ -│ test-worker │ LiveServerless │ dev │ Active │ -│ old-api-gpu │ LiveServerless │ staging │ Inactive │ +│ my-api-gpu │ Endpoint │ production │ Active │ +│ my-api-mothership │ Endpoint (LB) │ production │ Active │ +│ test-worker │ Endpoint │ dev │ Active │ +│ old-api-gpu │ Endpoint │ staging │ Inactive │ └──────────────────────────┴─────────────────────┴──────────────────────┴────────────┘ Total: 4 endpoints @@ -1469,7 +1469,7 @@ flash undeploy my-api-gpu Output: ``` Endpoint: my-api-gpu -Type: LiveServerless (GPU) +Type: Endpoint (GPU) 
Environment: production Status: Active @@ -1975,10 +1975,10 @@ Endpoints (4): ┌──────────────────────────┬─────────────────────┬────────────┬──────────────┐ │ Name │ Type │ Status │ Workers │ ├──────────────────────────┼─────────────────────┼────────────┼──────────────┤ -│ my-api-gpu │ LiveServerless │ Active │ 0/3 │ -│ my-api-mothership │ LiveLoadBalancer │ Active │ 1/3 │ -│ batch-processor │ LiveServerless │ Idle │ 0/5 │ -│ image-worker │ LiveServerless │ Active │ 2/3 │ +│ my-api-gpu │ Endpoint │ Active │ 0/3 │ +│ my-api-mothership │ Endpoint (LB) │ Active │ 1/3 │ +│ batch-processor │ Endpoint │ Idle │ 0/5 │ +│ image-worker │ Endpoint │ Active │ 2/3 │ └──────────────────────────┴─────────────────────┴────────────┴──────────────┘ Resource Configuration: diff --git a/docs/cli/getting-started.md b/docs/cli/getting-started.md index 5863b68..940b8f0 100644 --- a/docs/cli/getting-started.md +++ b/docs/cli/getting-started.md @@ -48,9 +48,9 @@ cd hello-flash **What happened:** - Created `hello-flash/` directory - Generated project structure: - - `main.py` - FastAPI application - - `mothership.py` - Mothership endpoint config - `gpu_worker.py` - GPU worker template + - `cpu_worker.py` - CPU worker template + - `lb_worker.py` - Load-balanced worker template - `pyproject.toml` - Dependencies - `.env.example` - Environment variables template @@ -63,20 +63,12 @@ cd hello-flash Open `gpu_worker.py` to see your first remote function: ```python -from runpod_flash import remote, LiveServerless, GpuGroup +from runpod_flash import Endpoint, GpuGroup -gpu_config = LiveServerless( - name="hello_flash_gpu", - gpus=[GpuGroup.ANY], - workersMin=0, - workersMax=3, - idleTimeout=300, -) - -@remote(resource_config=gpu_config) +@Endpoint(name="gpu_worker", gpu=GpuGroup.ANY, dependencies=["torch"]) async def process_request(payload: dict) -> dict: """Example GPU worker that processes requests.""" - # Your GPU processing logic here + # your GPU processing logic here return { "status": 
"success", "message": "Hello from Flash GPU worker!", @@ -85,8 +77,9 @@ async def process_request(payload: dict) -> dict: ``` **Key concepts:** -- `@remote` decorator marks functions to be run in the Runpod cloud -- `LiveServerless` configures GPU resources for the function to be run on +- `@Endpoint` decorator marks functions that run in the Runpod cloud +- `gpu=` configures the GPU type, `workers=` sets scaling bounds +- `dependencies=` specifies Python packages installed in the worker at runtime --- @@ -107,7 +100,7 @@ INFO: Application startup complete **Checkpoint:** Server is running at http://localhost:8888 -**What's happening:** Your FastAPI app runs locally on your machine, but when you call a `@remote` function, it executes on Runpod Serverless. This hybrid architecture gives you hot-reload for rapid development while testing real GPU/CPU workloads in the cloud. Endpoints created during `flash run` are prefixed with `live-` to keep them separate from production. +**What's happening:** Your FastAPI app runs locally on your machine, but when you call an `@Endpoint` function, it executes on Runpod Serverless. This hybrid architecture gives you hot-reload for rapid development while testing real GPU/CPU workloads in the cloud. Endpoints created during `flash run` are prefixed with `live-` to keep them separate from production. --- diff --git a/docs/cli/troubleshooting.md b/docs/cli/troubleshooting.md index aa05d97..44ac31a 100644 --- a/docs/cli/troubleshooting.md +++ b/docs/cli/troubleshooting.md @@ -782,15 +782,11 @@ ERROR: Failed to create endpoint: Insufficient GPU availability **1. 
Use more flexible GPU type:** ```python -# Before (specific GPU) -gpu_config = LiveServerless( - gpus=[GpuGroup.A100] -) +# before (specific GPU) +@Endpoint(name="worker", gpu=GpuGroup.A100) -# After (any GPU) -gpu_config = LiveServerless( - gpus=[GpuGroup.ANY] -) +# after (any GPU) +@Endpoint(name="worker", gpu=GpuGroup.ANY) ``` Redeploy: @@ -920,10 +916,8 @@ RuntimeError: CUDA not available Fix: ```python -# Ensure GPU specified in config -gpu_config = LiveServerless( - gpus=[GpuGroup.ANY] -) +# ensure GPU specified in Endpoint +@Endpoint(name="worker", gpu=GpuGroup.ANY) ``` **4. Redeploy after fixing:** diff --git a/docs/cli/workflows.md b/docs/cli/workflows.md index bc087ef..7ecc93e 100644 --- a/docs/cli/workflows.md +++ b/docs/cli/workflows.md @@ -99,10 +99,9 @@ INFO: Application startup complete. Edit your worker files (e.g., `gpu_worker.py`): ```python -@remote(resource_config=gpu_config) +@Endpoint(name="my-worker", gpu=GpuGroup.ANY) async def process_request(payload: dict) -> dict: - """Updated function with new logic.""" - # Add new feature here + """Process incoming requests on GPU.""" result = perform_processing(payload) return {"status": "success", "result": result} ``` @@ -598,24 +597,24 @@ flash deploy --env production **dev environment** (`dev_worker.py`): ```python -dev_gpu_config = LiveServerless( +@Endpoint( name="myapi_dev_gpu", - gpus=[GpuGroup.ANY], # Any GPU is fine - workersMin=0, # Scale to zero when idle - workersMax=2, # Small max for cost - idleTimeout=60, # Quick shutdown + gpu=GpuGroup.ANY, # any GPU is fine + workers=(0, 2), # scale to zero, small max for cost + idle_timeout=1, # quick shutdown ) +async def process(payload: dict) -> dict: ... 
``` **production environment** (`prod_worker.py`): ```python -prod_gpu_config = LiveServerless( +@Endpoint( name="myapi_prod_gpu", - gpus=[GpuGroup.A100], # Specific GPU for consistency - workersMin=1, # Always have one ready - workersMax=10, # Handle load spikes - idleTimeout=300, # Keep warm longer + gpu=GpuGroup.A100, # specific GPU for consistency + workers=(1, 10), # always have one ready, handle load spikes + idle_timeout=5, # keep warm longer ) +async def process(payload: dict) -> dict: ... ``` **Use environment variables:** @@ -1054,9 +1053,9 @@ Deployed Endpoints: ┌──────────────────────────┬─────────────────────┬──────────────────────┬────────────┐ │ Name │ Type │ Environment │ Status │ ├──────────────────────────┼─────────────────────┼──────────────────────┼────────────┤ -│ my-api-gpu │ LiveServerless │ production │ Active │ -│ test-feature-gpu │ LiveServerless │ dev │ Idle │ -│ old-worker-v1 │ LiveServerless │ staging │ Inactive │ +│ my-api-gpu │ Endpoint │ production │ Active │ +│ test-feature-gpu │ Endpoint │ dev │ Idle │ +│ old-worker-v1 │ Endpoint │ staging │ Inactive │ └──────────────────────────┴─────────────────────┴──────────────────────┴────────────┘ ``` @@ -1154,19 +1153,12 @@ flash env get production **Before (high cost):** ```python -expensive_config = LiveServerless( - workersMin=5, # Always 5 running - workersMax=10 -) +@Endpoint(name="worker", gpu=GpuGroup.ANY, workers=(5, 10)) # always 5 running ``` **After (optimized):** ```python -optimized_config = LiveServerless( - workersMin=0, # Scale to zero - workersMax=10, # Handle spikes - idleTimeout=60 # Quick shutdown -) +@Endpoint(name="worker", gpu=GpuGroup.ANY, workers=(0, 10), idle_timeout=1) # scale to zero ``` Redeploy with optimized config: @@ -1456,15 +1448,11 @@ cat gpu_worker.py **A. 
Change GPU type:** ```python -# Before (specific GPU) -gpu_config = LiveServerless( - gpus=[GpuGroup.A100] # May not be available -) +# before (specific GPU, may not be available) +@Endpoint(name="worker", gpu=GpuGroup.A100) -# After (more flexible) -gpu_config = LiveServerless( - gpus=[GpuGroup.ANY] # Any available GPU -) +# after (more flexible) +@Endpoint(name="worker", gpu=GpuGroup.ANY) ``` Redeploy: @@ -1479,10 +1467,9 @@ sleep 300 flash deploy --env production ``` -**C. Choose different region:** -Modify config to try different GPU types: +**C. Choose different GPU type:** ```python -gpus=[GpuGroup.RTX_4090] # More common +@Endpoint(name="worker", gpu=GpuGroup.RTX_4090) # more common ``` #### Issue 5: Runtime Errors @@ -1545,10 +1532,7 @@ RuntimeError: CUDA not available Solution: Verify GPU configuration: ```python -gpu_config = LiveServerless( - gpus=[GpuGroup.ANY], # Ensure GPU specified - ... -) +@Endpoint(name="worker", gpu=GpuGroup.ANY) ``` #### Issue 6: Performance Issues @@ -1571,11 +1555,12 @@ flash env get production **A. Reduce cold starts:** ```python -# Increase workersMin to keep workers warm -gpu_config = LiveServerless( - workersMin=1, # Was 0 - workersMax=5, - idleTimeout=300 # Keep alive longer +# keep workers warm with workers=(1, N) +@Endpoint( + name="worker", + gpu=GpuGroup.ANY, + workers=(1, 5), # keep 1 warm + idle_timeout=5, # keep alive longer ) ``` @@ -1585,24 +1570,21 @@ gpu_config = LiveServerless( - Cache model loading ```python -# Lazy loading example +# lazy loading example _model = None -@remote(resource_config=gpu_config) +@Endpoint(name="worker", gpu=GpuGroup.ANY) async def infer(payload: dict) -> dict: global _model if _model is None: - _model = load_model() # Only load once + _model = load_model() # only load once return _model.predict(payload) ``` **C. Increase worker capacity:** ```python -# Handle more concurrent requests -gpu_config = LiveServerless( - workersMax=10, # Was 3 - ... 
-) +# handle more concurrent requests +@Endpoint(name="worker", gpu=GpuGroup.ANY, workers=(0, 10)) ``` ### General Debugging Approach