#!/bin/sh

# Clear the screen only when stdout is a terminal, so that redirected or
# piped output (e.g. a log file) is not polluted with control sequences.
if [ -t 1 ]; then
  clear
fi

# Reference: help text of mlx_lm.server, kept here for convenience (may differ between installed versions).
#
#usage: mlx_lm.server [-h] [--model MODEL] [--adapter-path ADAPTER_PATH] [--host HOST] [--port PORT] [--draft-model DRAFT_MODEL] [--num-draft-tokens NUM_DRAFT_TOKENS] [--trust-remote-code] [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
#                     [--chat-template CHAT_TEMPLATE] [--use-default-chat-template] [--temp TEMP] [--top-p TOP_P] [--top-k TOP_K] [--min-p MIN_P] [--max-tokens MAX_TOKENS] [--chat-template-args CHAT_TEMPLATE_ARGS]
#                     [--decode-concurrency DECODE_CONCURRENCY] [--prompt-concurrency PROMPT_CONCURRENCY] [--prefill-step-size PREFILL_STEP_SIZE] [--prompt-cache-size PROMPT_CACHE_SIZE] [--prompt-cache-bytes PROMPT_CACHE_BYTES] [--pipeline]
#
#MLX Http Server.
#
#options:
#  -h, --help            show this help message and exit
#  --model MODEL         The path to the MLX model weights, tokenizer, and config
#  --adapter-path ADAPTER_PATH
#                        Optional path for the trained adapter weights and config.
#  --host HOST           Host for the HTTP server (default: 127.0.0.1)
#  --port PORT           Port for the HTTP server (default: 8080)
#  --draft-model DRAFT_MODEL
#                        A model to be used for speculative decoding.
#  --num-draft-tokens NUM_DRAFT_TOKENS
#                        Number of tokens to draft when using speculative decoding.
#  --trust-remote-code   Enable trusting remote code for tokenizer
#  --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
#                        Set the logging level (default: INFO)
#  --chat-template CHAT_TEMPLATE
#                        Specify a chat template for the tokenizer
#  --use-default-chat-template
#                        Use the default chat template
#  --temp TEMP           Default sampling temperature (default: 0.0)
#  --top-p TOP_P         Default nucleus sampling top-p (default: 1.0)
#  --top-k TOP_K         Default top-k sampling (default: 0, disables top-k)
#  --min-p MIN_P         Default min-p sampling (default: 0.0, disables min-p)
#  --max-tokens MAX_TOKENS
#                        Default maximum number of tokens to generate (default: 512)
#  --chat-template-args CHAT_TEMPLATE_ARGS
#                        A JSON formatted string of arguments for the tokenizer's apply_chat_template, e.g. '{"enable_thinking":false}'
#  --decode-concurrency DECODE_CONCURRENCY
#                        When a request is batchable then decode that many requests in parallel
#  --prompt-concurrency PROMPT_CONCURRENCY
#                        When a request is batchable then process that many prompts in parallel
#  --prefill-step-size PREFILL_STEP_SIZE
#                        Step size for prefill processing (default: 2048)
#  --prompt-cache-size PROMPT_CACHE_SIZE
#                        Maximum number of distinct KV caches to hold in the prompt cache
#  --prompt-cache-bytes PROMPT_CACHE_BYTES
#                        Maximum size in bytes of the KV caches
#  --pipeline            Use pipelining instead of tensor parallelism

# Start the MLX HTTP server with this script's defaults:
#   --temp 0.7             default sampling temperature
#   --max-tokens 262144    default maximum number of tokens to generate
#   --chat-template-args   passes {"enable_thinking":false} to the tokenizer's
#                          apply_chat_template
#
# Any arguments given to this script are appended after the defaults, so
# further server options (e.g. --model, --host, --port) can be supplied on
# the command line without editing this file.
#
# exec replaces this shell with the server process: signals sent to the
# script's PID reach the server directly and the server's exit status
# becomes the script's exit status.
exec mlx_lm.server \
  --temp 0.7 \
  --max-tokens 262144 \
  --chat-template-args '{"enable_thinking":false}' \
  "$@"