/**
 * Configuration options for a single Transformer decoder layer (self-attention +
 * cross-attention + feed-forward), mirroring PyTorch's `nn.TransformerDecoderLayer`.
 */
export interface TransformerDecoderLayerOptions {
/**
* Total embedding dimension of the model. All sub-layers (self-attention, cross-attention, FFN)
* project to this dimension. Must be divisible by nhead. Common values: 512, 768, 1024.
*
* **Example:** d_model=512 with nhead=8 means each head processes 64 dimensions.
* **Consistency:** Should typically match encoder's d_model for encoder-decoder architectures.
*/
d_model: number;
/**
* Number of parallel attention heads in both self-attention and cross-attention sublayers.
* Both attention mechanisms use the same number of heads. Common values: 8, 12, 16.
*
* **Constraint:** d_model must be divisible by nhead (validated in MultiheadAttention).
* **Consistency:** Should match encoder's nhead for encoder-decoder architectures.
*/
nhead: number;
/**
* Dimensionality of the intermediate feed-forward network hidden layer. Projects from d_model
* to dim_feedforward, applies activation, then projects back to d_model.
*
* **Common ratio:** Typically 4x d_model (e.g., 512 → 2048).
* **Effect:** Larger values increase model capacity but also computation and memory.
* **Consistency:** Can differ from encoder's dim_feedforward if needed, but typically same.
*
* **Default:** 2048
*/
dim_feedforward?: number;
/**
* Dropout probability applied during training to attention weights and FFN intermediate layer.
* Prevents co-adaptation and overfitting. Automatically disabled in evaluation mode.
*
* **Common values:** 0.0 (no dropout), 0.1 (standard), 0.2 (heavy regularization).
* **Effect:** Higher values = stronger regularization = potentially better generalization.
* **Typical:** 0.1 for most models.
*
* **Default:** 0.1
*/
dropout?: number;
/**
* Activation function in the feed-forward network: 'relu' (original Transformer) or 'gelu' (modern).
*
* **'relu':** max(0, x). Sharp activation, standard in 2017 Transformer paper.
* **'gelu':** Smooth approximation of ReLU, used in BERT, GPT-2/3. Better gradient flow.
*
* **Recommendation:** Use 'gelu' for new models (better convergence, generalization).
* **Consistency:** Can differ from encoder's activation if needed, but typically same.
*
* **Default:** 'relu'
*/
activation?: 'relu' | 'gelu';
/**
* Epsilon for layer normalization numerical stability. Added to the variance inside
* LayerNorm to prevent division by zero (or by a near-zero value).
*
* **Typical:** 1e-5 (default), 1e-6, 1e-12.
* **Effect:** Larger = more numerically stable (stronger guard against near-zero variance);
* smaller = closer to exact normalization but less protection in extreme cases.
* **Consistency:** Should match encoder's layer_norm_eps.
*
* **Default:** 1e-5
*/
layer_norm_eps?: number;
/**
* Shape format: false = (seq_len, batch, d_model), true = (batch, seq_len, d_model).
*
* **batch_first=false (default):** Matches PyTorch convention and original Transformer.
* **batch_first=true:** More intuitive for users familiar with CNN conventions.
*
* **Behavior:** Only affects shape interpretation, not computation.
* **Consistency:** Should match encoder's batch_first setting.
*
* **Default:** false
*/
batch_first?: boolean;
/**
* Layer normalization placement: Pre-LN vs Post-LN architecture.
*
* **norm_first=false (Post-LN, default):** LayerNorm applied AFTER each sub-layer.
* - Original design from "Attention is All You Need" (2017).
* - Can be unstable with many layers (gradient scaling).
*
* **norm_first=true (Pre-LN):** LayerNorm applied BEFORE each sub-layer.
* - More stable for deep models (12+ layers).
* - Better gradient flow, easier to train.
* - Standard in GPT-2/3, modern Transformers.
*
* **Recommendation:** Use norm_first=true for 12+ layer models.
* **Consistency:** Should match encoder's norm_first setting.
*
* **Default:** false (Post-LN, original design)
*/
norm_first?: boolean;
}
- **d_model** (`number`, required) – Total embedding dimension of the model. All sub-layers (self-attention, cross-attention, FFN) project to this dimension. Must be divisible by `nhead`. Common values: 512, 768, 1024. *Example:* `d_model=512` with `nhead=8` means each head processes 64 dimensions. *Consistency:* Should typically match the encoder's `d_model` for encoder-decoder architectures.
- **nhead** (`number`, required) – Number of parallel attention heads in both the self-attention and cross-attention sublayers. Both attention mechanisms use the same number of heads. Common values: 8, 12, 16. *Constraint:* `d_model` must be divisible by `nhead` (validated in MultiheadAttention). *Consistency:* Should match the encoder's `nhead` for encoder-decoder architectures.
- **dim_feedforward** (`number`, optional, default `2048`) – Dimensionality of the intermediate feed-forward network hidden layer. Projects from `d_model` to `dim_feedforward`, applies the activation, then projects back to `d_model`. *Common ratio:* Typically 4× `d_model` (e.g., 512 → 2048). *Effect:* Larger values increase model capacity but also computation and memory. *Consistency:* Can differ from the encoder's `dim_feedforward` if needed, but is typically the same.
- **dropout** (`number`, optional, default `0.1`) – Dropout probability applied during training to attention weights and the FFN intermediate layer. Prevents co-adaptation and overfitting; automatically disabled in evaluation mode. *Common values:* 0.0 (no dropout), 0.1 (standard), 0.2 (heavy regularization). *Effect:* Higher values = stronger regularization = potentially better generalization.
- **activation** (`'relu' | 'gelu'`, optional, default `'relu'`) – Activation function in the feed-forward network. `'relu'`: max(0, x), sharp activation, standard in the 2017 Transformer paper. `'gelu'`: smooth approximation of ReLU, used in BERT and GPT-2/3, with better gradient flow. *Recommendation:* Use `'gelu'` for new models (better convergence and generalization). *Consistency:* Can differ from the encoder's activation if needed, but is typically the same.
- **layer_norm_eps** (`number`, optional, default `1e-5`) – Epsilon added to the variance in layer normalization for numerical stability; prevents division by zero (or near-zero) in LayerNorm. *Typical:* 1e-5 (default), 1e-6, 1e-12. *Effect:* Larger = more numerically stable (stronger guard against near-zero variance); smaller = closer to exact normalization. *Consistency:* Should match the encoder's `layer_norm_eps`.
- **batch_first** (`boolean`, optional, default `false`) – Shape format: `false` = (seq_len, batch, d_model), `true` = (batch, seq_len, d_model). `batch_first=false` matches the PyTorch convention and the original Transformer; `batch_first=true` is more intuitive for users familiar with CNN conventions. *Behavior:* Only affects shape interpretation, not computation. *Consistency:* Should match the encoder's `batch_first` setting.
- **norm_first** (`boolean`, optional, default `false`) – Layer normalization placement: Pre-LN vs Post-LN architecture. `norm_first=false` (Post-LN, default) applies LayerNorm AFTER each sub-layer — the original design from "Attention is All You Need" (2017), which can be unstable with many layers (gradient scaling). `norm_first=true` (Pre-LN) applies LayerNorm BEFORE each sub-layer — more stable for deep models (12+ layers), better gradient flow, easier to train; standard in GPT-2/3 and modern Transformers. *Recommendation:* Use `norm_first=true` for models with 12+ layers. *Consistency:* Should match the encoder's `norm_first` setting.