Creates a model response for the given chat conversation with support for retries, fallbacks, prompts, and variables.

curl --request POST \
  --url https://api.orq.ai/v2/gateway/chat/completions \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '{
  "messages": [
    {
      "role": "system",
      "content": "<string>",
      "name": "<string>"
    }
  ],
  "model": "<string>",
  "metadata": {},
  "audio": {
    "voice": "alloy",
    "format": "wav"
  },
  "frequency_penalty": 123,
  "max_tokens": 123,
  "max_completion_tokens": 123,
  "logprobs": true,
  "top_logprobs": 10,
  "n": 2,
  "presence_penalty": 123,
  "response_format": {
    "type": "text"
  },
  "reasoning_effort": "<string>",
  "verbosity": "<string>",
  "seed": 123,
  "stop": "<string>",
  "stream_options": {
    "include_usage": true
  },
  "thinking": {
    "type": "enabled",
    "budget_tokens": 123,
    "thinking_level": "low"
  },
  "temperature": 1,
  "top_p": 123,
  "top_k": 123,
  "tools": [
    {
      "function": {
        "name": "<string>",
        "description": "<string>",
        "parameters": {
          "type": "object",
          "properties": {},
          "required": ["<string>"],
          "additionalProperties": true
        },
        "strict": true
      },
      "type": "function"
    }
  ],
  "tool_choice": "none",
  "parallel_tool_calls": true,
  "modalities": ["text"],
  "orq": {
    "retry": {
      "count": 3,
      "on_codes": [429, 500, 502]
    },
    "fallbacks": [
      { "model": "openai/gpt-5" },
      { "model": "anthropic/claude-4-opus" }
    ],
    "contact": {
      "id": "contact_01ARZ3NDEKTSV4RRFFQ69G5FAV",
      "display_name": "Jane Doe",
      "email": "[email protected]"
    },
    "thread": {
      "id": "thread_01ARZ3NDEKTSV4RRFFQ69G5FAV",
      "tags": ["customer-support"]
    },
    "inputs": {
      "customer_name": "John Smith",
      "issue_type": "billing"
    },
    "cache": {
      "ttl": 3600,
      "type": "exact_match"
    },
    "knowledge_bases": [
      {
        "knowledge_id": "knowledge_01ARZ3NDEKTSV4RRFFQ69G5FAV",
        "top_k": 5
      }
    ],
    "timeout": {
      "call_timeout": 30000
    }
  },
  "stream": false
}'
Example response:

{
  "id": "<string>",
  "choices": [
    {
      "finish_reason": "stop",
      "message": {
        "content": "<string>",
        "refusal": "<string>",
        "tool_calls": [
          {
            "id": "<string>",
            "type": "function",
            "function": {
              "name": "<string>",
              "arguments": "<string>"
            }
          }
        ],
        "role": "assistant",
        "reasoning": "<string>",
        "reasoning_signature": "<string>",
        "redacted_reasoning": "<string>",
        "audio": {
          "id": "<string>",
          "expires_at": 123,
          "data": "<string>",
          "transcript": "<string>"
        }
      },
      "index": 0,
      "logprobs": {
        "content": [
          {
            "token": "<string>",
            "logprob": 123,
            "bytes": [123],
            "top_logprobs": [
              {
                "token": "<string>",
                "logprob": 123,
                "bytes": [123]
              }
            ]
          }
        ],
        "refusal": [
          {
            "token": "<string>",
            "logprob": 123,
            "bytes": [123],
            "top_logprobs": [
              {
                "token": "<string>",
                "logprob": 123,
                "bytes": [123]
              }
            ]
          }
        ]
      }
    }
  ],
  "created": 123,
  "model": "<string>",
  "object": "chat.completion",
  "system_fingerprint": "<string>",
  "usage": {
    "completion_tokens": 123,
    "prompt_tokens": 123,
    "total_tokens": 123,
    "prompt_tokens_details": {
      "cached_tokens": 123,
      "cache_creation_tokens": 123,
      "audio_tokens": 123
    },
    "completion_tokens_details": {
      "reasoning_tokens": 123,
      "accepted_prediction_tokens": 123,
      "rejected_prediction_tokens": 123,
      "audio_tokens": 123
    }
  }
}
Bearer authentication header of the form Bearer <token>, where <token> is your auth token.
A list of messages comprising the conversation so far.
Developer-provided instructions that the model should follow, regardless of messages sent by the user.
The role of the message's author, in this case system.
The contents of the system message.
An optional name for the participant. Provides the model information to differentiate between participants of the same role.
Model ID used to generate the response, like openai/gpt-4o or anthropic/claude-haiku-4-5-20251001. The AI Gateway offers a wide range of models with different capabilities, performance characteristics, and price points. Refer to the [Supported models](/docs/proxy/supported-models) page to browse available models.
Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format. Keys can have a maximum length of 64 characters and values can have a maximum length of 512 characters.
Parameters for audio output. Required when audio output is requested with modalities: ["audio"].
The voice the model uses to respond. Supported voices are alloy, echo, fable, onyx, nova, and shimmer.
Specifies the output audio format. Must be one of wav, mp3, flac, opus, or pcm16.
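For example, a request for spoken output would combine modalities with the audio object (a minimal fragment; the voice and format shown are illustrative):

"modalities": ["text", "audio"],
"audio": { "voice": "alloy", "format": "wav" }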
Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
[Deprecated]. The maximum number of tokens that can be generated in the chat completion. This value can be used to control costs for text generated via API.
This value is now deprecated in favor of max_completion_tokens, and is not compatible with o1 series models.
An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
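Since max_tokens is deprecated, a capped request should use max_completion_tokens instead; a minimal sketch (model and message are illustrative):

{
  "model": "openai/gpt-4o",
  "messages": [{ "role": "user", "content": "Summarize our refund policy." }],
  "max_completion_tokens": 1024
}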
Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message.
An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. logprobs must be set to true if this parameter is used.
How many chat completion choices to generate for each input message. Note that you will be charged based on the number of generated tokens across all of the choices. Keep n as 1 to minimize costs.
Required range: x >= 1.
Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
Constrains effort on reasoning for reasoning models. Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in a response.
Adjusts response verbosity. Lower levels yield shorter answers.
If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result.
Up to 4 sequences where the API will stop generating further tokens.
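stop accepts a single string or an array of up to four sequences; for example (the sequences shown are illustrative):

"stop": ["\n\n", "Observation:"]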
Options for streaming response. Only set this when you set stream: true.
If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
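To receive token usage statistics with a streamed response, pair stream with stream_options (a minimal fragment):

"stream": true,
"stream_options": { "include_usage": true }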
Enables or disables the thinking mode capability. Options: enabled, disabled.
Determines how many tokens the model can use for its internal reasoning process. Larger budgets can enable more thorough analysis for complex problems, improving response quality. Must be ≥1024 and less than max_tokens.
The level of reasoning the model should use. This setting is supported only by gemini-3 models. If budget_tokens is specified and thinking_level is available, budget_tokens will be ignored. Options: low, high.
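For example, to enable extended thinking with an explicit budget (a minimal fragment; the values are illustrative and respect the constraint that budget_tokens is at least 1024 and below max_tokens):

"thinking": { "type": "enabled", "budget_tokens": 2048 },
"max_tokens": 4096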
What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass.
Limits the model to consider only the top k most likely tokens at each step.
A list of tools the model may call.
The name of the function to call.
A description of what the function does, used by the model to choose when and how to call the function.
The parameters the function accepts, described as a JSON Schema object.
Whether to enable strict schema adherence when generating the function call.
The type of the tool. Currently, only function is supported.
Controls which (if any) tool is called by the model. Options: none, auto, required.
Whether to enable parallel function calling during tool use.
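As a concrete illustration (the get_weather function and its schema are hypothetical), a tool definition might look like:

"tools": [
  {
    "type": "function",
    "function": {
      "name": "get_weather",
      "description": "Get the current weather for a given city",
      "parameters": {
        "type": "object",
        "properties": {
          "city": { "type": "string", "description": "City name" }
        },
        "required": ["city"],
        "additionalProperties": false
      },
      "strict": true
    }
  }
],
"tool_choice": "auto"

Setting additionalProperties to false is the usual companion to strict: true, so the model cannot invent parameters outside the schema.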
Output types that you would like the model to generate. Most models are capable of generating text, which is the default: ["text"]. The gpt-4o-audio-preview model can also be used to generate audio. To request that this model generate both text and audio responses, you can use: ["text", "audio"].
Leverage Orq's intelligent routing capabilities to enhance your AI application with enterprise-grade reliability and observability. Orq provides automatic request management including retries on failures, model fallbacks for high availability, contact-level analytics tracking, conversation threading, and dynamic prompt templating with variable substitution.
The name to display on the trace. If not specified, the default system name will be used.
Retry configuration for the request
Prompt configuration for the request
Information about the contact making the request. If the contact does not exist, it will be created automatically.
Unique identifier for the contact
"contact_01ARZ3NDEKTSV4RRFFQ69G5FAV"
Display name of the contact
"Jane Doe"
Email address of the contact
URL to the contact's avatar or logo
"https://example.com/avatars/jane-doe.jpg"
A list of tags associated with the contact
["hr", "engineering"]
Thread information to group related requests
Unique identifier of the knowledge base to search
"customer-knowledge-base"
The number of results to return. If not provided, will default to the knowledge base configured top_k.
Required range: 1 <= x <= 100.
The threshold to apply to the search. If not provided, will default to the knowledge base configured threshold.
Required range: 0 <= x <= 1.
The type of search to perform. If not provided, will default to the knowledge base configured retrieval_type.
Options: vector_search, keyword_search, hybrid_search.
The metadata filter to apply to the search. See Searching a Knowledge Base for more information.
Additional search options
Whether to include the vector in the chunk
Whether to include the metadata in the chunk
Whether to include the scores in the chunk
Override the rerank configuration for this search. If not provided, will use the knowledge base configured rerank settings.
The name of the rerank model to use. Refer to the model list.
"cohere/rerank-multilingual-v3.0"
The threshold value used to filter the rerank results; only documents with a relevance score greater than the threshold will be returned.
Required range: 0 <= x <= 1.
The number of top results to return after reranking. If not provided, will default to the knowledge base configured top_k.
Required range: 1 <= x <= 100.
Override the agentic RAG configuration for this search. If not provided, will use the knowledge base configured agentic RAG settings.
The name of the model for the Agent to use. Refer to the model list.
The query to use to search the knowledge base. If not provided, the last user message from the request's messages will be used.
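Putting these options together, a knowledge base search configuration might look like the following (a sketch assuming the field names match the attribute descriptions above; the values are illustrative):

"knowledge_bases": [
  {
    "knowledge_id": "knowledge_01ARZ3NDEKTSV4RRFFQ69G5FAV",
    "top_k": 5,
    "threshold": 0.75,
    "retrieval_type": "hybrid_search"
  }
]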
Array of models with weights for load balancing requests
[
  { "model": "openai/gpt-4o", "weight": 0.7 },
  { "model": "anthropic/claude-3-5-sonnet", "weight": 0.3 }
]
{
  "retry": { "count": 3, "on_codes": [429, 500, 502] },
  "fallbacks": [
    { "model": "openai/gpt-5" },
    { "model": "anthropic/claude-4-opus" }
  ],
  "contact": {
    "id": "contact_01ARZ3NDEKTSV4RRFFQ69G5FAV",
    "display_name": "Jane Doe",
    "email": "[email protected]"
  },
  "thread": {
    "id": "thread_01ARZ3NDEKTSV4RRFFQ69G5FAV",
    "tags": ["customer-support"]
  },
  "inputs": {
    "customer_name": "John Smith",
    "issue_type": "billing"
  },
  "cache": { "ttl": 3600, "type": "exact_match" },
  "knowledge_bases": [
    {
      "knowledge_id": "knowledge_01ARZ3NDEKTSV4RRFFQ69G5FAV",
      "top_k": 5
    }
  ],
  "timeout": { "call_timeout": 30000 }
}
Returns a chat completion object, or a streamed sequence of chat completion chunk objects if the request is streamed.
Represents a chat completion response returned by the model, based on the provided input.
A unique identifier for the chat completion.
A list of chat completion choices. Can be more than one if n is greater than 1.
The reason the model stopped generating tokens.
Options: stop, length, tool_calls, content_filter, function_call.
A chat completion message generated by the model.
The type of the tool. Currently, only function is supported.
The name of the function to be called. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64.
The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.
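For instance, a returned tool call might look like this (illustrative values; note that arguments is a JSON-encoded string you must parse and validate yourself):

"tool_calls": [
  {
    "id": "call_abc123",
    "type": "function",
    "function": {
      "name": "get_weather",
      "arguments": "{\"city\": \"Amsterdam\"}"
    }
  }
]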
The role of the author of this message, in this case assistant.
Internal thought process of the model.
The signature holds a cryptographic token which verifies that the thinking block was generated by the model, and is verified when thinking is part of a multiturn conversation. This value should not be modified and should always be sent to the API when the reasoning is redacted. Currently only supported by Anthropic.
Occasionally the model's internal reasoning will be flagged by the provider's safety systems. When this occurs, the provider will encrypt the reasoning. This redacted reasoning is decrypted when passed back to the API, allowing the model to continue its response without losing context.
If the audio output modality is requested, this object contains data about the audio response from the model.
The index of the choice in the list of choices.
Log probability information for the choice.
A list of message content tokens with log probability information.
The token.
The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value -9999.0 is used to signify that the token is very unlikely.
A list of integers representing the UTF-8 bytes representation of the token.
List of the most likely tokens and their log probability, at this token position.
The token.
The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value -9999.0 is used to signify that the token is very unlikely.
A list of integers representing the UTF-8 bytes representation of the token.
A list of message refusal tokens with log probability information.
The token.
The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value -9999.0 is used to signify that the token is very unlikely.
A list of integers representing the UTF-8 bytes representation of the token.
List of the most likely tokens and their log probability, at this token position.
The token.
The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value -9999.0 is used to signify that the token is very unlikely.
A list of integers representing the UTF-8 bytes representation of the token.
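As an illustration, a single entry in logprobs.content might look like this (the token and probabilities are made up; bytes are the UTF-8 bytes of the token):

{
  "token": "Hello",
  "logprob": -0.31,
  "bytes": [72, 101, 108, 108, 111],
  "top_logprobs": [
    { "token": "Hello", "logprob": -0.31, "bytes": [72, 101, 108, 108, 111] },
    { "token": "Hi", "logprob": -1.52, "bytes": [72, 105] }
  ]
}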
The Unix timestamp (in seconds) of when the chat completion was created.
The model used for the chat completion.
The object type, which is always chat.completion.
This fingerprint represents the backend configuration that the model runs with.
Usage statistics for the completion request.
Number of tokens in the generated completion.
Number of tokens in the prompt.
Total number of tokens used in the request (prompt + completion).
The number of audio output tokens produced by the response.
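For example (illustrative counts; total_tokens is the sum of prompt and completion tokens, 58 + 204 = 262):

"usage": {
  "prompt_tokens": 58,
  "completion_tokens": 204,
  "total_tokens": 262
}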