/**
 * Configuration options for a single Transformer decoder layer (self-attention +
 * cross-attention + feed-forward), mirroring PyTorch's `nn.TransformerDecoderLayer`.
 */
export interface TransformerDecoderLayerOptions {
/**
* Total embedding dimension of the model. All sub-layers (self-attention, cross-attention, FFN)
* project to this dimension. Must be divisible by nhead. Common values: 512, 768, 1024.
*
* **Example:** d_model=512 with nhead=8 means each head processes 64 dimensions.
* **Consistency:** Should typically match encoder's d_model for encoder-decoder architectures.
*/
d_model: number;
/**
* Number of parallel attention heads in both self-attention and cross-attention sublayers.
* Both attention mechanisms use the same number of heads. Common values: 8, 12, 16.
*
* **Constraint:** d_model must be divisible by nhead (validated in MultiheadAttention).
* **Consistency:** Should match encoder's nhead for encoder-decoder architectures.
*/
nhead: number;
/**
* Dimensionality of the intermediate feed-forward network hidden layer. Projects from d_model
* to dim_feedforward, applies activation, then projects back to d_model.
*
* **Common ratio:** Typically 4x d_model (e.g., 512 → 2048).
* **Effect:** Larger values increase model capacity but also computation and memory.
* **Consistency:** Can differ from encoder's dim_feedforward if needed, but typically same.
*
* **Default:** 2048
*/
dim_feedforward?: number;
/**
* Dropout probability applied during training to attention weights and FFN intermediate layer.
* Prevents co-adaptation and overfitting. Automatically disabled in evaluation mode.
*
* **Common values:** 0.0 (no dropout), 0.1 (standard), 0.2 (heavy regularization).
* **Effect:** Higher values = stronger regularization = potentially better generalization.
* **Typical:** 0.1 for most models.
*
* **Default:** 0.1
*/
dropout?: number;
/**
* Activation function in the feed-forward network: 'relu' (original Transformer) or 'gelu' (modern).
*
* **'relu':** max(0, x). Sharp activation, standard in 2017 Transformer paper.
* **'gelu':** Smooth approximation of ReLU, used in BERT, GPT-2/3. Better gradient flow.
*
* **Recommendation:** Use 'gelu' for new models (better convergence, generalization).
* **Consistency:** Can differ from encoder's activation if needed, but typically same.
*
* **Default:** 'relu'
*/
activation?: 'relu' | 'gelu';
/**
* Epsilon for layer normalization numerical stability. Added to the variance inside
* LayerNorm to prevent division by zero (or by a near-zero value).
*
* **Typical:** 1e-5 (default), 1e-6, 1e-12.
* **Effect:** Larger = more numerically stable (stronger guard against near-zero variance);
* smaller = closer to exact normalization but less protection in extreme cases.
* **Consistency:** Should match encoder's layer_norm_eps.
*
* **Default:** 1e-5
*/
layer_norm_eps?: number;
/**
* Shape format: false = (seq_len, batch, d_model), true = (batch, seq_len, d_model).
*
* **batch_first=false (default):** Matches PyTorch convention and original Transformer.
* **batch_first=true:** More intuitive for users familiar with CNN conventions.
*
* **Behavior:** Only affects shape interpretation, not computation.
* **Consistency:** Should match encoder's batch_first setting.
*
* **Default:** false
*/
batch_first?: boolean;
/**
* Layer normalization placement: Pre-LN vs Post-LN architecture.
*
* **norm_first=false (Post-LN, default):** LayerNorm applied AFTER each sub-layer.
* - Original design from "Attention is All You Need" (2017).
* - Can be unstable with many layers (gradient scaling).
*
* **norm_first=true (Pre-LN):** LayerNorm applied BEFORE each sub-layer.
* - More stable for deep models (12+ layers).
* - Better gradient flow, easier to train.
* - Standard in GPT-2/3, modern Transformers.
*
* **Recommendation:** Use norm_first=true for 12+ layer models.
* **Consistency:** Should match encoder's norm_first setting.
*
* **Default:** false (Post-LN, original design)
*/
norm_first?: boolean;
}
- **d_model** (`number`, required) – Total embedding dimension of the model. All sub-layers (self-attention, cross-attention, FFN) project to this dimension. Must be divisible by `nhead`. Common values: 512, 768, 1024. *Example:* `d_model=512` with `nhead=8` means each head processes 64 dimensions. *Consistency:* Should typically match the encoder's `d_model` for encoder-decoder architectures.
- **nhead** (`number`, required) – Number of parallel attention heads in both the self-attention and cross-attention sublayers. Both attention mechanisms use the same number of heads. Common values: 8, 12, 16. *Constraint:* `d_model` must be divisible by `nhead` (validated in MultiheadAttention). *Consistency:* Should match the encoder's `nhead` for encoder-decoder architectures.
- **dim_feedforward** (`number`, optional, default `2048`) – Dimensionality of the intermediate feed-forward network hidden layer. Projects from `d_model` to `dim_feedforward`, applies the activation, then projects back to `d_model`. *Common ratio:* Typically 4× `d_model` (e.g., 512 → 2048). *Effect:* Larger values increase model capacity but also computation and memory. *Consistency:* Can differ from the encoder's `dim_feedforward` if needed, but is typically the same.
- **dropout** (`number`, optional, default `0.1`) – Dropout probability applied during training to attention weights and the FFN intermediate layer. Prevents co-adaptation and overfitting; automatically disabled in evaluation mode. *Common values:* 0.0 (no dropout), 0.1 (standard), 0.2 (heavy regularization). *Effect:* Higher values = stronger regularization = potentially better generalization.
- **activation** (`'relu' | 'gelu'`, optional, default `'relu'`) – Activation function in the feed-forward network. `'relu'`: max(0, x), sharp activation, standard in the 2017 Transformer paper. `'gelu'`: smooth approximation of ReLU, used in BERT and GPT-2/3, with better gradient flow. *Recommendation:* Use `'gelu'` for new models (better convergence and generalization). *Consistency:* Can differ from the encoder's activation if needed, but is typically the same.
- **layer_norm_eps** (`number`, optional, default `1e-5`) – Epsilon added to the variance in layer normalization for numerical stability; prevents division by zero (or near-zero) in LayerNorm. *Typical:* 1e-5 (default), 1e-6, 1e-12. *Effect:* Larger = more numerically stable (stronger guard against near-zero variance); smaller = closer to exact normalization. *Consistency:* Should match the encoder's `layer_norm_eps`.
- **batch_first** (`boolean`, optional, default `false`) – Shape format: `false` = (seq_len, batch, d_model), `true` = (batch, seq_len, d_model). `batch_first=false` matches the PyTorch convention and the original Transformer; `batch_first=true` is more intuitive for users familiar with CNN conventions. *Behavior:* Only affects shape interpretation, not computation. *Consistency:* Should match the encoder's `batch_first` setting.
- **norm_first** (`boolean`, optional, default `false`) – Layer normalization placement: Pre-LN vs Post-LN architecture. `norm_first=false` (Post-LN, default) applies LayerNorm AFTER each sub-layer — the original design from "Attention is All You Need" (2017), which can be unstable with many layers (gradient scaling). `norm_first=true` (Pre-LN) applies LayerNorm BEFORE each sub-layer — more stable for deep models (12+ layers), better gradient flow, easier to train; standard in GPT-2/3 and modern Transformers. *Recommendation:* Use `norm_first=true` for models with 12+ layers. *Consistency:* Should match the encoder's `norm_first` setting.