export interface TransformerEncoderLayerOptions {
  /**
   * Total embedding dimension of the model. All sub-layers (attention, FFN) project to this dimension.
   * Must be divisible by nhead to enable multi-head attention. Common values: 256, 512 (original
   * Transformer), 768 (BERT-base, GPT-2 small), 1024 (BERT-large, T5-large), 1280 (GPT-2 large).
   *
   * **Example:** d_model=512 with nhead=8 means each attention head processes 512/8=64 dimensions.
   */
  d_model: number;
  /**
   * Number of parallel attention heads in the multi-head attention sublayer. Higher values enable
   * learning diverse attention patterns (different heads focus on different aspects of input).
   * Common values: 8 (original Transformer), 12 (BERT-base, GPT-2 small), 16 (BERT-large, T5-large).
   *
   * **Constraint:** d_model must be divisible by nhead (enforced in MultiheadAttention constructor).
   * **Trade-off:** More heads = more parameters and computation, but potentially better expressiveness.
   */
  nhead: number;
  /**
   * Dimensionality of the intermediate feed-forward network layer. The FFN projects from d_model to
   * dim_feedforward, applies activation, then projects back to d_model. This intermediate expansion
   * enables non-linear feature combinations.
   *
   * **Common ratio:** Typically 4x d_model (e.g., d_model=512 → dim_feedforward=2048).
   * **Effect:** Larger values increase model capacity and parameters, but also computation and memory.
   * **Recommendation:** Keep at 4x d_model unless you have specific reasons to change.
   *
   * **Default:** 2048
   */
  dim_feedforward?: number;
  /**
   * Dropout probability applied to attention weights and between FFN layers. Helps prevent overfitting
   * by randomly dropping attention connections and intermediate activations during training.
   * Automatically disabled in evaluation mode (when the layer's training flag is false).
   *
   * **Common values:** 0.0 (no dropout), 0.1 (10%), 0.2 (20%) for regularization.
   * **Effect:** Higher values = more regularization = potentially better generalization but slower convergence.
   * **Typical:** 0.1 for most models, 0.2 for smaller datasets to prevent overfitting.
   *
   * **Default:** 0.1
   */
  dropout?: number;
  /**
   * Activation function used in the feed-forward network. Two options: 'relu' (original Transformer)
   * or 'gelu' (used in BERT, GPT-2/3). ReLU is simpler and faster, GELU is smoother.
   *
   * **'relu':** max(0, x). Sharp activation, was standard in "Attention is All You Need" (2017).
   * Simpler, slightly faster, but has zero gradient for negative inputs (can cause "dead" units).
   * **'gelu':** x * Φ(x) where Φ is the CDF of the standard normal distribution. Smoother, used in
   * modern models (BERT, GPT-2). Better gradient flow, better generalization in practice.
   *
   * **Recommendation:** Use 'gelu' for new models unless you have compatibility requirements.
   *
   * **Default:** 'relu'
   */
  activation?: 'relu' | 'gelu';
  /**
   * Epsilon value for layer normalization to ensure numerical stability. The LayerNorm computes
   * (x - mean) / sqrt(var + eps). Prevents division by zero when variance is very small.
   *
   * **Typical values:** 1e-5, 1e-6, 1e-12.
   * **Effect:** Smaller values keep normalization closer to the exact statistics but risk numerical
   * instability (overflow/NaN) when variance is near zero. Larger values are numerically safer but
   * slightly bias the normalization.
   * **Recommendation:** Keep at default 1e-5 unless debugging numerical issues.
   *
   * **Default:** 1e-5
   */
  layer_norm_eps?: number;
  /**
   * Input/output shape format convention. Controls how tensors are interpreted without changing computation.
   *
   * **batch_first=false (default):** Shape is (sequence_length, batch_size, d_model).
   * - Standard in PyTorch RNNs and original Transformer implementations.
   * - More natural for variable-length sequences (pad to max_len).
   * - Internally, attention computation prefers this order for efficiency.
   *
   * **batch_first=true:** Shape is (batch_size, sequence_length, d_model).
   * - More intuitive for users familiar with CNN conventions (batch first).
   * - Requires automatic transposition internally (negligible overhead).
   *
   * **Behavior:** Setting this flag only changes shape interpretation. Computation remains identical.
   *
   * **Default:** false (sequence-first)
   */
  batch_first?: boolean;
  /**
   * Layer normalization placement within the block (Pre-LN vs Post-LN architecture).
   *
   * **norm_first=false (Post-LN, default):** LayerNorm applied AFTER each sub-layer.
   * - Original "Attention is All You Need" design (2017).
   * - Formula: x' = LayerNorm(x + SubLayer(x))
   * - Can suffer from training instability with many layers (gradient scaling issues).
   *
   * **norm_first=true (Pre-LN):** LayerNorm applied BEFORE each sub-layer.
   * - Formula: x' = x + SubLayer(LayerNorm(x))
   * - More stable for deep models (12+ layers). Used in GPT-2, GPT-3, modern Transformers.
   * - Better gradient flow, enables training without careful learning rate tuning.
   *
   * **Recommendation:** Use norm_first=true for models with 12+ layers or when training is unstable.
   *
   * **Default:** false (Post-LN, matching original Transformer)
   */
  norm_first?: boolean;
}
d_model (number) – Total embedding dimension of the model. All sub-layers (attention, FFN) project to this dimension. Must be divisible by nhead to enable multi-head attention. Common values: 256, 512 (original Transformer), 768 (BERT-base, GPT-2 small), 1024 (BERT-large, T5-large), 1280 (GPT-2 large). Example: d_model=512 with nhead=8 means each attention head processes 512/8=64 dimensions.
nhead (number) – Number of parallel attention heads in the multi-head attention sublayer. Higher values enable learning diverse attention patterns (different heads focus on different aspects of input). Common values: 8 (original Transformer), 12 (BERT-base, GPT-2 small), 16 (BERT-large, T5-large). Constraint: d_model must be divisible by nhead (enforced in MultiheadAttention constructor). Trade-off: More heads = more parameters and computation, but potentially better expressiveness.
dim_feedforward(number)optional- – Dimensionality of the intermediate feed-forward network layer. The FFN projects from d_model to dim_feedforward, applies activation, then projects back to d_model. This intermediate expansion enables non-linear feature combinations. Common ratio: Typically 4x d_model (e.g., d_model=512 → dim_feedforward=2048). Effect: Larger values increase model capacity and parameters, but also computation and memory. Recommendation: Keep at 4x d_model unless you have specific reasons to change. Default: 2048
dropout(number)optional- – Dropout probability applied to attention weights and between FFN layers. Helps prevent overfitting by randomly dropping attention connections and intermediate activations during training. Automatically disabled during evaluation mode (self.training=false). Common values: 0.0 (no dropout), 0.1 (10%), 0.2 (20%) for regularization. Effect: Higher values = more regularization = potentially better generalization but slower convergence. Typical: 0.1 for most models, 0.2 for smaller datasets to prevent overfitting. Default: 0.1
activation('relu' | 'gelu')optional- – Activation function used in the feed-forward network. Two options: 'relu' (original Transformer) or 'gelu' (used in BERT, GPT-2/3). ReLU is simpler and faster, GELU is smoother. 'relu': max(0, x). Sharp activation, was standard in "Attention is All You Need" (2017). Simpler, slightly faster, but gradient can be unstable. 'gelu': x * Φ(x) where Φ is CDF of normal distribution. Smoother, used in modern models (BERT, GPT-2). Better gradient flow, better generalization in practice. Recommendation: Use 'gelu' for new models unless you have compatibility requirements. Default: 'relu'
layer_norm_eps (number, optional) – Epsilon value for layer normalization to ensure numerical stability. The LayerNorm computes (x - mean) / sqrt(var + eps). Prevents division by zero when variance is very small. Typical values: 1e-5, 1e-6, 1e-12. Effect: Smaller values keep normalization closer to the exact statistics but risk numerical instability (overflow/NaN) when variance is near zero; larger values are numerically safer but slightly bias the normalization. Recommendation: Keep at default 1e-5 unless debugging numerical issues. Default: 1e-5
batch_first(boolean)optional- – Input/output shape format convention. Controls how tensors are interpreted without changing computation. batch_first=false (default): Shape is (sequence_length, batch_size, d_model). - Standard in PyTorch RNNs and original Transformer paper. - More natural for variable-length sequences (pad to max_len). - Internally, attention computation prefers this order for efficiency. batch_first=true: Shape is (batch_size, sequence_length, d_model). - More intuitive for users familiar with CNN conventions (batch first). - Requires automatic transposition internally (negligible overhead). Behavior: Setting this flag only changes shape interpretation. Computation remains identical. Default: false (sequence-first)
norm_first(boolean)optional- – Layer normalization placement within the block (Pre-LN vs Post-LN architecture). norm_first=false (Post-LN, default): LayerNorm applied AFTER each sub-layer. - Original "Attention is All You Need" design (2017). - Formula: x' = LayerNorm(x + SubLayer(x)) - Can suffer from training instability with many layers (gradient scaling issues). norm_first=true (Pre-LN): LayerNorm applied BEFORE each sub-layer. - Formula: x' = x + SubLayer(LayerNorm(x)) - More stable for deep models (12+ layers). Used in GPT-2, GPT-3, modern Transformers. - Better gradient flow, enables training without careful learning rate tuning. Recommendation: Use norm_first=true for models with 12+ layers or when training is unstable. Default: false (Post-LN, matching original Transformer)