export interface MultiheadAttentionOptions {
/**
* Total embedding dimension of the model. This is the dimension of query, key, and value embeddings
* before projection. Must be divisible by num_heads to split into independent attention heads.
* Common values: 256, 512, 768 (BERT), 1024, 2048.
*
* **Example:** embed_dim=512, num_heads=8 means each head processes 512/8=64 dimensions independently.
*/
embed_dim: number;
/**
* Number of parallel attention heads. Higher numbers enable learning diverse attention patterns but
* increase computation and parameters. Each head processes embed_dim/num_heads dimensions.
* Common values: 8, 12 (BERT), 16. Must divide embed_dim evenly.
*
* **Trade-offs:**
* - More heads: Better for capturing diverse patterns (syntax, semantics, position, etc.)
* - Fewer heads: Faster computation, fewer parameters, simpler learned patterns
* - Typical: 8-16 heads for reasonable size models
*/
num_heads: number;
/**
* Dropout probability applied to attention weights after softmax. Helps prevent overfitting by
* randomly zeroing attention connections during training. Set to 0 (default) for no dropout.
* Common values: 0.0 (no dropout), 0.1, 0.2. Automatically disabled during evaluation mode.
*
* **Effect:** Prevents co-adaptation of attention heads. During training, each head randomly
* ignores some key-value pairs with probability dropout.
*
* **Default:** 0.0 (no dropout)
*/
dropout?: number;
/**
* Whether to add learnable bias terms to query/key/value projections and output projection.
* Typically true for better expressiveness, false for minimal parameters. Most models use bias=true.
*
* **With bias=true:** Allows shifting the input space before dot-product attention.
* **With bias=false:** Simpler model, marginal accuracy difference in most cases.
*
* **Default:** true
*/
bias?: boolean;
/**
* Whether to append a learnable key and value bias vector to the key/value sequences.
* This is a specialized technique used in some Transformer variants. Rarely used in practice.
*
* **Effect:** When true, appends learned vectors to key and value sequences before attention
* (matching PyTorch's add_bias_kv, which adds bias to the key/value sequences at dim=0).
* Used in some variants for learnable positional biasing or auxiliary tokens.
*
* **Default:** false
*/
add_bias_kv?: boolean;
/**
* Whether to append a zero attention vector to key and value sequences. This is a specialized
* attention variant. Rarely used in modern architectures. When true, appends a batch of zeros to
* key/value sequences before computing attention (PyTorch appends at dim=1).
*
* **Effect:** Provides additional "no attention" positions that can attend to nothing explicitly.
*
* **Default:** false
*/
add_zero_attn?: boolean;
/**
* Total dimension of the key embeddings. Used for cross-attention where key comes from a different
* source than query. For self-attention, usually equals embed_dim. For cross-attention (like
* encoder-decoder), can differ if encoder has different hidden dimension.
*
* **Cross-attention example:** embed_dim=512 (decoder), kdim=768 (encoder output dimension)
* **Self-attention:** kdim defaults to embed_dim
*
* **Default:** undefined (uses embed_dim)
*/
kdim?: number;
/**
* Total dimension of the value embeddings. Used for cross-attention where value comes from a
* different source. Usually equals kdim. For self-attention, equals embed_dim.
*
* **Purpose:** Allows projecting encoder outputs to different space for cross-attention.
* **Cross-attention example:** vdim=768 when attending to encoder outputs with 768 dimensions
* **Self-attention:** vdim defaults to embed_dim
*
* **Default:** undefined (uses embed_dim)
*/
vdim?: number;
/**
* Whether input/output tensors follow batch-first format. Controls expected shape convention.
* **batch_first=false (default):** Shapes are (sequence_length, batch, embedding_dim)
* - Matches PyTorch default and RNN conventions
* - More natural for variable-length sequences
* **batch_first=true:** Shapes are (batch, sequence_length, embedding_dim)
* - More intuitive for most users (batch first like CNN conventions)
* - Requires automatic transposition internally
*
* **Behavior:** This parameter only controls shape interpretation. Internally, computation
* typically uses sequence-first format for efficiency; transposition is automatic based on this flag.
*
* **Example:**
* - batch_first=false: input shape [50, 32, 512] = [seq_len=50, batch=32, embed=512]
* - batch_first=true: input shape [32, 50, 512] = [batch=32, seq_len=50, embed=512]
*
* **Default:** false (sequence-first, matching PyTorch default)
*/
batch_first?: boolean;
}
embed_dim (number) – Total embedding dimension of the model. This is the dimension of query, key, and value embeddings before projection. Must be divisible by num_heads to split into independent attention heads. Common values: 256, 512, 768 (BERT), 1024, 2048. Example: embed_dim=512, num_heads=8 means each head processes 512/8=64 dimensions independently.
num_heads (number) – Number of parallel attention heads. Higher numbers enable learning diverse attention patterns but increase computation and parameters. Each head processes embed_dim/num_heads dimensions. Common values: 8, 12 (BERT), 16. Must divide embed_dim evenly. Trade-offs: - More heads: Better for capturing diverse patterns (syntax, semantics, position, etc.) - Fewer heads: Faster computation, fewer parameters, simpler learned patterns - Typical: 8-16 heads for reasonable size models
dropout (number, optional) – Dropout probability applied to attention weights after softmax. Helps prevent overfitting by randomly zeroing attention connections during training. Set to 0 (default) for no dropout. Common values: 0.0 (no dropout), 0.1, 0.2. Automatically disabled during evaluation mode. Effect: Prevents co-adaptation of attention heads. During training, each head randomly ignores some key-value pairs with probability dropout. Default: 0.0 (no dropout)
bias (boolean, optional) – Whether to add learnable bias terms to query/key/value projections and output projection. Typically true for better expressiveness, false for minimal parameters. Most models use bias=true. With bias=true: Allows shifting the input space before dot-product attention. With bias=false: Simpler model, marginal accuracy difference in most cases. Default: true
add_bias_kv (boolean, optional) – Whether to append a learnable key and value bias vector to the key/value sequences. This is a specialized technique used in some Transformer variants. Rarely used in practice. Effect: When true, appends learned vectors to key and value sequences before attention. Used in some variants for learnable positional biasing or auxiliary tokens. Default: false
add_zero_attn (boolean, optional) – Whether to append a zero attention vector to key and value sequences. This is a specialized attention variant. Rarely used in modern architectures. When true, appends zero vectors to key/value sequences before computing attention. Effect: Provides additional "no attention" positions that can attend to nothing explicitly. Default: false
kdim (number, optional) – Total dimension of the key embeddings. Used for cross-attention where key comes from a different source than query. For self-attention, usually equals embed_dim. For cross-attention (like encoder-decoder), can differ if encoder has different hidden dimension. Cross-attention example: embed_dim=512 (decoder), kdim=768 (encoder output dimension). Self-attention: kdim defaults to embed_dim. Default: undefined (uses embed_dim)
vdim (number, optional) – Total dimension of the value embeddings. Used for cross-attention where value comes from a different source. Usually equals kdim. For self-attention, equals embed_dim. Purpose: Allows projecting encoder outputs to different space for cross-attention. Cross-attention example: vdim=768 when attending to encoder outputs with 768 dimensions. Self-attention: vdim defaults to embed_dim. Default: undefined (uses embed_dim)
batch_first (boolean, optional) – Whether input/output tensors follow batch-first format. Controls expected shape convention. batch_first=false (default): Shapes are (sequence_length, batch, embedding_dim) - Matches PyTorch default and RNN conventions - More natural for variable-length sequences. batch_first=true: Shapes are (batch, sequence_length, embedding_dim) - More intuitive for most users (batch first like CNN conventions) - Requires automatic transposition internally. Behavior: This parameter only controls shape interpretation. Internally, computation typically uses sequence-first format for efficiency; transposition is automatic based on this flag. Example: - batch_first=false: input shape [50, 32, 512] = [seq_len=50, batch=32, embed=512] - batch_first=true: input shape [32, 50, 512] = [batch=32, seq_len=50, embed=512]. Default: false (sequence-first, matching PyTorch default)