
    Ph                        d Z ddlmZmZmZ ddlZddlZddlmZ ddl	mc m
Z ddlmZmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0  ejb                  e2      Z3dDde4fdZ5	 dEdejl                  de4de4de4fdZ7dejl                  dejp                  de4de9de4dejl                  fdZ:ddejv                  fdejl                  d e4d!e4d"e<d#e4d$ejz                  de>ejl                  ejl                  f   fd%Z?d&ejl                  d'ee4   dejl                  fd(Z@ G d) d*e'      ZA G d+ d,e(      ZB G d- d.e       ZC G d/ d0e%      ZD G d1 d2e)      ZE G d3 d4e&      ZFe G d5 d6e$             ZG G d7 d8eG      ZH G d9 d:eG      ZI G d; d<eG      ZJ G d= d>eG      ZK G d? d@eG      ZL G dA dBe#      ZMg dCZNy)Fz<Blt modular model, inheriting from Mllama where appropriate.    )CallableOptionalUnionN   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)OutputRecordercheck_model_inputs   )Cohere2RotaryEmbeddingrotate_half)MllamaForCausalLMMllamaPreTrainedModelMllamaSelfAttentionDecoderLayerMllamaTextCrossAttentionMllamaTextMLPMllamaTextRMSNormMllamaTextSelfAttentioneager_attention_forward   )	BltConfigBltGlobalTransformerConfigBltLocalDecoderConfigBltLocalEncoderConfigBltPatcherConfigprimec                     t        j                  |t         j                  | j                        }t        j                  | j
                  d   | j                        }||z  }t        j                  | |z  d      S )a  
    A polynomial rolling hash algorithm that converts sequences
    of tokens into hash values. The hash is computed as:
        hash = (token_0 * prime^0 + token_1 * prime^1 + ... + token_n * prime^n)

    The rolling hash allows the model to efficiently
    identify and encode recurring byte-level patterns in the input text.

    Args:
        token_tensor (torch.Tensor): [batch_size, seq_len, group_size] containing token IDs to hash
        prime (int): Prime number used as the base for the polynomial hash.

    Returns:
        torch.Tensor: Hash values of shape [batch_size, seq_len] where each value
                     represents the hash of the corresponding token group

    Example:
        >>> tokens = torch.tensor([[1, 2, 3], [4, 5, 6]])
        >>> hashes = rolling_polynomial_hash(tokens, prime=31)
        >>> # hash[0] = 1*31^0 + 2*31^1 + 3*31^2
        >>> # hash[1] = 4*31^0 + 5*31^1 + 6*31^2
    dtypedevicer(   dim)torchtensorint64r(   arangeshapesum)token_tensorr$   prime_tensorpowersprime_powerss        ]/var/www/html/saasai/venv/lib/python3.12/site-packages/transformers/models/blt/modular_blt.pyrolling_polynomial_hashr8   9   sa    . <<U[[ATATUL\\,,,R09L9LMF'L99\L0b99    	token_ids
group_sizemax_hashc                 Z   t        j                         5  | j                  \  }}t        j                  ||dz
  t         j                  | j
                        }t        j                  || gd      }|j                  d|d      }t        ||      }	|	|z  }
ddd       |
S # 1 sw Y   
S xY w)z1Hash token groups and map to range [0, max_hash].r   r&   r+   N)	r-   no_gradr1   zerosr/   r(   catunfoldr8   )r:   r;   r$   r<   
batch_sizeseq_lenpaddingpadded_tokenswindowshasheshash_valuess              r7   byte_group_hash_functionrI   V   s     
'oo
G++j*q.T]TdTde		7I"6A>  &&q*a8(%8x' 
  
 s   BB  B*local_encoder_tokensencoder_hash_tok_embedding$encoder_hash_byte_group_nb_functionsencoder_hash_byte_group_sizeencoder_hash_byte_group_vocabreturnc                     g d}|j                  |       }d}t        |      D ]@  }	||	t        |      z     }
|D ](  }t        | ||
|      }|||z  z   }| ||      z  }|dz  }* B |S )z=Compute token embeddings enhanced with hash-based embeddings.)ʚ;l   21A ioYl   vt l   . l   }g l   Au l   0 l   T l   AK l   | r   r   )embed_tokensrangelenrI   )rJ   local_encoderrK   rL   rM   rN   primes
embeddingsembedding_idxfunc_nbr$   r;   hash_idsoffset_hash_idss                 r7   compute_hash_embeddingsr\   h   s    F ++,@AJM=>wV,-6J/0DjRWYvwH&9V)VVO4_EEJQM 7 ? r9   F	patch_idsnum_patchessequence_lengthpatches_as_queriescross_attn_kr'   c                 z   | j                   \  }}| j                  }|rp||z  }	|}
t        j                  ||      j	                  d      j	                  d      j                  |||      }| j	                  d      j                  |||      }no|}	||z  }
| j	                  d      j                  |||      }t        j                  ||      j	                  d      j	                  d      j                  |||      }||k(  }|rdnd}|j                  ||      }||	|
f}|j                   |k7  rt        d|j                    d|       |j	                  d      }d|j                  |      z
  }|j                  |j                  t        j                        t        j                  |      j                        }|S )	aR  
    Prepare cross-attention mask for patch-based attention, following mllama's robust approach.

    This function creates masks that control which patches can attend to which other patches,
    with support for query/key role swapping and cross-attention multipliers.

    Args:
        patch_ids (torch.Tensor): Tensor of shape [batch_size, seq_len] containing patch ids.
        num_patches (int): Total number of patches.
        sequence_length (int): Length of the sequence.
        patches_as_queries (bool): If True, patches are used as queries, otherwise as keys.
        cross_attn_k (int): Cross-attention multiplier for repeating patches.
        dtype (torch.dtype): Data type for the output mask.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]:
            - cross_attention_mask: 4D tensor [batch_size, 1, q_len, kv_len]
    r*   r   r)   r   r+   zCross attention mask shape z doesn't match expected g      ?)r1   r(   r-   r0   	unsqueezeexpandrepeat_interleave
ValueErrortomasked_fillboolfinfomin)r]   r^   r_   r`   ra   r'   rB   rC   r(   q_lenkv_lenq_patch_idskv_patch_idscross_attention_mask
repeat_dimexpected_shapeinverted_cross_attn_masks                    r7   #_prepare_patch_cross_attention_maskrt      s   4 $//JF l*  LLV4Yq\Yr]VJW5	 	 !**1-44ZgV|+))"-44Z+VLLV4>>qAKKANUUV`bikvw 	 ',6 )bJ/AA,T^A_ !%0N!!^3)*>*D*D)EE]^l]mn
 	

 099!<  #%9%<%<U%CC3?? ##EJJ/U1C1G1G  r9   patch_lengthsmax_patch_lengthc                 6   || S | j                  d      }g }| D ]j  }g }||dkD     D ]J  }|j                         }t        ||      \  }}|j                  |g|z         |s:|j	                  |       L |j	                  |       l t        d |D              }	t        j                  ||	f| j                  | j                        }
t        |      D ]D  \  }}|s	t        j                  || j                  | j                        |
|dt        |      f<   F |
dk7  j                  d      j                         |
j                  d   k  rM|
dk7  j                  d      j!                         j                         j                         dz   }|
ddd|f   }
|
S )a  
    Splits patch lengths into smaller segments if they exceed `max_patch_length`.
    Pads the result to uniform length across the batch.

    Args:
        patch_lengths (torch.Tensor): [batch_size, num_patches] tensor of patch lengths.
        max_patch_length (int, optional): Maximum allowed length per patch.

    Returns:
        torch.Tensor: [batch_size, max_len] tensor of split and padded patch lengths.
    Nr   c              3   2   K   | ]  }t        |        y wN)rT   ).0splitss     r7   	<genexpr>z(process_patch_lengths.<locals>.<genexpr>   s     6I&#f+Is   r&   r+   r   )sizeitemdivmodextendappendmaxr-   r?   r'   r(   	enumerater.   rT   anyr2   r1   nonzero)ru   rv   rB   	processedseqr{   lengthfull_chunks	remaindermax_lenpaddedilast_nonzeros                r7   process_patch_lengthsr      s    ##A&JI#'lF[[]F%+F4D%E"KMM+,{:;i( # 	   6I66G[[*g.m6I6IR_RfRfgFy)	6',||F-BUBU^k^r^r'sF1mFm#$ *
 	!Q##%Q7!((Q(/779==?DDFJ=L=()Mr9   c                       e Zd Zy)BltMLPN__name__
__module____qualname__ r9   r7   r   r         r9   r   c                       e Zd Zy)
BltRMSNormNr   r   r9   r7   r   r     r   r9   r   c                       e Zd Zy)BltRotaryEmbeddingNr   r   r9   r7   r   r     r   r9   r   c                   $     e Zd Zdef fdZ xZS )BltTransformerLayer	layer_idxc                     t         |           t        ||      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y )N)configr   eps)super__init__BltSelfAttention	self_attnr   mlpr   hidden_sizerms_norm_epsinput_layernormpost_attention_layernormselfr   r   	__class__s      r7   r   zBltTransformerLayer.__init__  s]    )9M&>)&*<*<&BUBUV(263E3E6K^K^(_%r9   )r   r   r   intr   __classcell__r   s   @r7   r   r     s    `# ` `r9   r   c            	            e Zd Zdedef fdZ	 	 	 d	dej                  dej                  dej                  def fdZ	 xZ
S )
r   r   r   c                 4    t         |   ||       d| _        y )NT)r   r   	is_causalr   s      r7   r   zBltSelfAttention.__init__  s    +r9   hidden_statesattention_maskposition_embeddings	use_cachec           
      0    t        |   d||||||d|S )N)r   r   r   r   past_key_valuescache_positionr   )r   forward)	r   r   r   r   r   r   r   kwargsr   s	           r7   r   zBltSelfAttention.forward#  s7     w 
') 3+)
 
 	
r9   )FNN)r   r   r   r   r   r   r-   Tensorri   r   r   r   s   @r7   r   r     s]    y S   
||
 
 #\\	

 
 
r9   r   c                        e Zd ZdZddededee   f fdZ	 	 	 	 ddej                  deej                     dee
   d	eej                     d
eej                     dee   fdZ xZS )BltCrossAttentionz<Cross-attention module for Blt, following transformers styler   r   r   c                     t         |           d| _        t        | j                  |j
                        | _        t        | j                  |j
                        | _        y )NFr   )r   r   r   r   r   r   q_normk_norm)r   r   r   r   r   s       r7   r   zBltCrossAttention.__init__;  sI     !1!1v7J7JK !1!1v7J7JKr9   r   cross_attention_statesr   r   r   r   c                    |j                         \  }}}	| j                  |      }
| j                  |
      }
|
j                  ||| j                  | j
                        j                  dd      }
|| j                  |      }| j                  |      }| j                  |      }|j                  |d| j                  | j
                        j                  dd      }|j                  |d| j                  | j
                        j                  dd      }|~|j                  ||| j                  d|i      \  }}nZ|d   dk7  rG|j                  | j                     j                  |j                  | j                     j                  }}nt!        d      t"        }| j$                  j&                  dk7  rt(        | j$                  j&                     } || |
|||f| j*                  sdn| j,                  | j.                  d	|\  }}|j1                  ||d      j3                         }| j5                  |      }||z   }||fS )
Nr   r   r)   r   r   z^Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!eagerg        )dropoutscaling)r}   r   q_projview	num_headshead_dim	transposer   k_projv_projnum_key_value_headsupdater   layerskeysvaluesrf   r   r   _attn_implementationr   trainingr   r   reshape
contiguouso_proj)r   r   r   r   r   r   r   bszrl   _query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                   r7   r   zBltCrossAttention.forwardA  s2    &**,UA{{=1{{<0#((eT^^T]]S]]^_abc!-%)[[1G%H"%;<J;;'=>L#b$2J2JDMMZddefhijJ',,S"d6N6NPTP]P]^hhijlmnL*+:+A+Adnn?OQ_>`,(
L A!#&&t~~6;;&&t~~6== %J
 p  )@;;++w6"9$++:Z:Z"[$7	%
  $}}C$,,LL	%
 	%
!\ "))#ub9DDFkk+.!M1L((r9   ry   NNNN)r   r   r   __doc__r   r   r   r   r-   r   r   
LongTensorr   r   r   r   r   s   @r7   r   r   8  s    FLy LS LxPS} L :>+/15593)||3) !) 63) "%	3)
 !.3) !!1!123) +,3)r9   r   c                   j    e Zd ZU eed<   dZdZdZdgZ e	e
dd       e	edd      dZd	 Zd
 Zd Zy)BltPreTrainedModelr   Fr   r   local_decoderindex
layer_namer   )r   
attentionsc                     t        d      NzNo need to inherit it!AttributeErrorr   modules     r7   _init_weightsz BltPreTrainedModel._init_weights      566r9   c                     t        d      r   r   r   s     r7   _update_causal_maskz&BltPreTrainedModel._update_causal_mask  r   r9   c                     t        d      r   r   r   s     r7   5_prepare_4d_causal_attention_mask_with_cache_positionzHBltPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position  r   r9   N)r   r   r   r   __annotations___supports_attention_backend_supports_flash_attn_supports_flex_attn_no_split_modulesr   r   r   _can_record_outputsr   r   r   r   r9   r7   r   r   w  sU    "' ./'(;1Q`a$%5Q?[
777r9   r   c                   |    e Zd ZU eed<   d eedd      iZdef fdZ	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	e   de	e
j                     de	e
j                     de	e   de	e
j                     dee   fdZd Z xZS )BltLocalEncoderr   encoder_attentionsr   rU   r   c           	         t         |   |       d| _        || _        t	        j
                  t        |j                        D cg c]  }t        ||       c}      | _	        t        |      | _        t	        j                  |j                  |j                  |j                  z  d      | _        t	        j                   |j"                  |j                        | _        t	        j
                         | _        |j(                  r|j                  nd}t        |      D ]3  }| j&                  j+                  t-        |||j                               5 | j/                          y c c}w )NFr   in_featuresout_featuresbiasr   r   r   r   )r   r   gradient_checkpointingr   nn
ModuleListrS   num_hidden_layersr   r   r   
rotary_embLinearr   ra   patch_embedding_projection	Embedding
vocab_sizerR   cross_attn_layerscross_attn_all_layersr   r   	post_initr   r   r   layers_to_addr   s       r7   r   zBltLocalEncoder.__init__  s"    &+#mmEJ6KcKcEdeEd	 3Ede
 -F;*,))**++f.A.AA+
'
 LL):):F<N<NO!#4:4P4P00VW}-I""))!9RXRdRde .
 	! fs   E'	input_idsinputs_embedspatch_embedsr   position_idsr   r   encoder_attention_maskr^   r]   r   c           	         || j                  |      }|j                  d   }t        j                  || j                  j                  | j
                        }|Mt        j                  |j                  d   |j                        j                  d      j                  |d      }| j                  ||      }t        j                  || j                  j                  | j
                        }t        | j                        D ]  \  }} ||f||||d|}|t        | j                        dz
  k(  s| j                  j                  sF| j!                  ||	|
      }| j#                  |      }|j%                  ||j                  d   | j                  j&                  z  | j                  j(                        }| j                  j                  r|nd} | j*                  |   d|||d|\  }}||z   } |}||fS )	Nr   pr   r   r*   r)   r   r   r   r   r   r   r   r   )rR   r1   Fr   r   r   r-   r0   r(   rc   rd   r  r   r   rT   r  patch_reducer  r   ra   r   r  )r   r  r  r  r   r  r   r   r  r^   r]   r   rB   r   r   idxlayerr   cross_attention_outputr   encoder_cross_statess                        r7   r   zBltLocalEncoder.forward  s      --i8M"((+
		-4;;3F3FQUQ^Q^_]003M<P<PQ[[\]^eefprtu  #oom\J		-4;;3F3FQUQ^Q^_#DKK0JC!$7- /- M c$++&**dkk.O.O#00YW#>>|L+33 2 21 58P8P PRVR]R]RiRi  $(;;#D#DC!	,MD,B,B9,M -".+8#9- 	-)&  ,.DD- 1.  ,222r9   c                 F   |j                   d   }|j                   d   }|j                  d      j                  dd|j                   d         }t        j                  |||f|j
                  |j                        }|j                  |d|dd      }|ddd|ddf   }|S )	a  
        Reduce variable length patches to single embedding per patch
        Note: this works with variable number of patches for different sequences in the batch
        It handles variable length patches by assuming that patch_lengths will be 0 for any
        extra patches on the *right*. Since there can be a variable number of patches
        this function also return the number of patches for each sequence in the batch.
        Any embeddings on the right that are not allocated to a patch
        (i.e. if the sum(patch_lengths[i]) < seq_len for any i)
        will be sent to a dummy patch, which is trimmed before returning.
        r   r)   r&   r   amaxF)srcr,   r   reduceinclude_selfN)r1   rc   rd   r-   r?   r'   r(   scatter_reduce)r   r   max_num_patchesr]   rB   embedding_dimreduced_embeddingss          r7   r!  zBltLocalEncoder.patch_reduce  s     #((+
%++B/''+222r=;N;Nr;RS	"[[-8@S@S\i\p\p
 0>> ? 
 03CO3CQ0FG!!r9   
NNNNNNNNNN)r   r   r   r"   r   r   r   r   r   r   r-   r   r   r   r   r   r   r   r!  r   r   s   @r7   r   r     s!   !!n-=QSbc4 2 1504/31537+/599=%),043E,,-43  -43 u||,	43
 !.43 u//043 "%43 !!1!1243 !) 643 c]43 ELL)43 +,43l"r9   r   c                   :    e Zd ZU eed<   def fdZe	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     dee   d	eej                     d
eej                     dee   fd       Z xZS )BltLocalDecoderr   c           	         t         |   |       d| _        || _        d| _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _
        t        |      | _        t        j                  |j                  |j                  |j                   z  d      | _        t%        |j                  |j&                        | _        t        j                         | _        |j,                  r|j                  nd}t        |      D ]3  }| j*                  j/                  t1        |||j                               5 | j3                          y c c}w )NFTr  r  r   r   r  )r   r   r  r   cross_attn_decoderr	  r
  rS   r  r   r   r   r  r  hidden_size_globalr   ra   r  r   r   normr  r  r   r   r  r  s       r7   r   zBltLocalDecoder.__init__  s%    &+#"&mmEJ6KcKcEdeEd	 3Ede
 -F;*,))11++f.A.AA+
'
 v11v7J7JK	!#4:4P4P00VW}-I""))!9RXRdRde .
 	! fs   E%r  r  r  r   r  r   r   r  r   c	           	      $   |j                   d   }
|}| j                  |      }|j                  |
|j                   d   | j                  j                  z  | j                  j
                        }|| j                  s||z   }|Mt        j                  |j                   d   |j                        j                  d      j                  |
d      }| j                  ||      }t        j                  || j                  j                  | j                        }t!        | j"                        D ]O  \  }}|dk(  s| j                  j$                  r! | j&                  |   d|||d|	\  }}||z   } ||f||||d|	}Q | j)                  |      }|S )	Nr   r   r*   r)   r  r  r  r   )r1   r  r   r   ra   r   r3  r-   r0   r(   rc   rd   r  r   r   r   r   r   r  r  r5  )r   r  r  r  r   r  r   r   r  r   rB   r   r   r   r#  r$  r   logitss                     r7   r   zBltLocalDecoder.forward  s    #((+
%66|D#++**1-0H0HH$++JaJa
 #D,C,C)L8M]003M<P<PQ[[\]^eefprtu  #oom\J		-4;;3F3FQUQ^Q^_!$++.HAuAv::,ED,B,B1,E -"/+7#9- 	-)& !.0F F!$7- /- M /" =)r9   NNNNNNNN)r   r   r   r!   r   r   r   r   r-   r   r   r   r   r   r   r   r   s   @r7   r1  r1     s    !!4 0  1504/31537+/599=0E,,-0  -0 u||,	0
 !.0 u//00 "%0 !!1!120 !) 60 +,0 0r9   r1  c                        e Zd ZU eed<   d eedd      iZdef fdZ	 	 	 	 dde	j                  dee	j                     d	ee	j                     d
ee   dee	j                     dee   fdZ xZS )BltGlobalTransformerr   global_attentionsr   global_transformerr   c                    t         |   |       || _        t        j                         | _        t        |j                        D ]'  }| j
                  j                  t        ||             ) t        |      | _        t        |dd       2t        j                  |j                  |j                  d      | _        nt        j"                         | _        | j%                          y )Nr  encoder_cross_output_sizeFr  )r   r   r   r	  r
  r   rS   r  r   r   r   r  getattrr  r>  r   token_embedding_projectionIdentityr  r   s      r7   r   zBltGlobalTransformer.__init__U  s     mmov778IKK269EF 9,F; 66=I.0ii00&2D2D5/D+ /1kkmD+r9   input_embedsr   r  r   r   r   c           	         |j                   \  }}}	| j                  |      }
t        j                  |
| j                  j                  | j
                        }
|Mt        j                  |j                   d   |j                        j                  d      j                  |d      }| j                  |
|      }t        | j                        D ]  \  }} ||
f||||d|}
 |
S )Nr  r   r*   r   r)   r  )r1   rA  r   r   r   r   r-   r0   r(   rc   rd   r  r   r   )r   rC  r   r  r   r   r   rB   rC   r   r   r   r   r#  s                 r7   r   zBltGlobalTransformer.forwardg  s     ".!3!3
GQ77E		-4;;3F3FQUQ^Q^_\//2<;N;NOYYZ[\ccdnprs  #oom\J!$++.HAu!$7- /- M / r9   r   )r   r   r   r    r   r   r   r   r   r-   r   r   r   r   r   r   r   r   r   s   @r7   r:  r:  O  s    &&^,<ARfg9 * 2637+/59ll !. u//0	
 "% !!1!12 +,r9   r:  c                   6    e Zd ZU eed<   def fdZ	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee
   deej                     dee   d	eej                     d
ee   dee   dee   dee   fdZe	 	 dd       Z xZS )
BltPatcherr   c                    t         |   |       t        | j                        | _        t        j                         | _        t        | j                  j                        D ]1  }| j                  j                  t        | j                  |             3 t        j                  | j                  j                  | j                  j                        | _        t!        | j                  j                  | j                  j"                        | _        t        j&                  | j                  j                  | j                  j                  d      | _        y )Nr  r   Fr?  )r   r   r   r   r  r	  r
  r   rS   r  r   r   r  r  r   rR   r   r   r5  r  lm_headr   s      r7   r   zBltPatcher.__init__  s     ,DKK@mmot{{<<=IKK24;;	JK >LL)?)?AXAXYt{{66DKK<T<TU	yyKK##KK""
r9   r  r   r  r   r  r   r   
patch_size	thresholdrv   r   c                 &   |d u |d uz  rt        d      || j                  |      }|r|
t               }|F||j                         nd}t	        j
                  |||j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }|}| j                  ||      }| j                  D ]  } ||||      } | j                  | j                  |            }t        j                  j!                  |      j#                         }|j                  d d \  }}|| j%                  ||||		      }n.t	        j&                  ||f|j(                  |j                  
      }t+        ||
      }|||fS )N:You must specify exactly one of input_ids or inputs_embedsr   r   r*   r   rC  r   r   r   r  )r   r   )r7  r   )	entropiesr_   rI  rJ  r&   )rf   rR   r   get_seq_lengthr-   r0   r1   r(   rc   r	   r   r  r   rH  r5  distributionsCategoricalentropypatch_lengths_from_entropiesonesr'   r   )r   r  r   r  r   r  r   r   rI  rJ  rv   r   past_seen_tokenscausal_maskr   r   r#  r7  prediction_entropiesrB   r_   ru   s                         r7   r   zBltPatcher.forward  s    -t";<YZZ  --i8M0*nO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;&))+%
 &"oom\J[[E!-EXituM ! dii67$22>>f>MUUW&3&9&9"1&=#
O! ==. /%#	 > M "JJ_-]5H5HQ^QeQeM .m=MN#]F::r9   c                    | j                   d   }t        j                  ddgt        j                  | j                        j                  d      j                  |d      }|j                   d   }| ddddf   } | |kD  }|j                   d   }t        j                  || j                        j                  d      j                  |d      }	t        j                  |	|      }
t        j                  |	|
gd      }t        j                  || gd      }||   j                  ||      }|j                  d      j                         }|ddd|f   }t        j                  |||z   fd      }t        j                  |ddddf   |dz
        }t        j                  |ddddf   dz
  |fd      }||z
  dz   }|S )z
        Computes patch lengths from token entropies.

        Depending on whether a threshold is provided, the function uses either:
        - Thresholding the entropy values (when `threshold` is set).
        r   r   r&   Nr*   r)   r+   )r1   r-   r.   longr(   rc   repeatr0   rd   	full_liker@   r   r2   r   )rN  r_   rI  rJ  rB   init_tokensoffset
patch_maskrC   token_indicessentinelpadded_indicespadded_maskpatch_startsmax_valid_patchespatch_start_ids
last_token
patch_endsru   s                      r7   rS  z'BltPatcher.patch_lengths_from_entropies  s    __Q'
 LL!Quzz):J:JKUUVWX__`jlmn 	 ""1% ae$	 *
""1% WY5E5EFPPQRSZZ[egij??=':M8#<!D iij[ 9qA &k2:::wO&NNqN1557#A'9(9'9$9:  ))[,2G$HaP ___QU%;_q=PQ
YY12 6 :JGQO
"_4q8r9   r/  )NN)r   r   r   r#   r   r   r   r-   r   r   r   FloatTensorri   r   floatr   r   r   staticmethodrS  r   r   s   @r7   rF  rF    s   
/ 
  151537+/59$(59$(%)*.?;E,,-?; !.?; u//0	?;
 "%?;   1 12?; D>?; !!1!12?; SM?; E??; #3-?; +,?;B  	3 3r9   rF  c                   f    e Zd Zdef fdZe	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     dee
   deej                     d	ee   d
eej                     dee   defd       Zd Zd Zdej                  dedej                  fdZ xZS )BltModelr   c                    t         |   |       d| _        || _        t	        |j
                        | _        t        |j                        | _	        t        |j                        | _        |j                  t        |j                        z  }|j                   |z  }t#        j$                  ||j
                  j&                        | _        | j                  j*                  r[t-        |j.                        | _        | j0                  j3                          | j0                  j5                         D ]	  }d|_         nd | _        | j9                          y )NF)r   r   r  r   r   encoder_configrU   r:  global_configr<  r1  decoder_configr   rL   rT   rM   rN   r	  r  r   rK   patch_in_forwardrF  patcher_configpatchereval
parametersrequires_gradr  )r   r   num_embeddingstotal_vocab_sizeparamr   s        r7   r   zBltModel.__init__  s    &+#,V-B-BC"6v7K7K"L,V-B-BCDDs6KnKnGoo!??.P*,,,7GI^I^IjIj*k';;''%f&;&;<DLLL002&+# 3  DLr9   r  ru   r   r  r   r  r   r   r   rO   c	                 b   |d u |d uz  rt        d      ||}
|j                  \  }}}no|j                  \  }}t        || j                  | j                  | j
                  j                  | j
                  j                  | j
                  j                        }
|| j
                  j                  dk(  r| j                  |t        d      | j                  || j
                  j                  | j
                  j                  | j
                  j                  | j
                  j                  |j                        \  }}}no||j                  n|j                  }||j                   n|j                   }t#        t%        j&                  ||dz   f||      | j
                  j                        }| j)                  ||      }|F||j+                         nd}t%        j,                  |||
j                  d   z   |
j                        }||j/                  d      }t1        | j
                  |
||||	      }t3        ||j                  d   |d
| j
                  j4                  |
j                         } | j                  d||
||||j                  d   |d|	\  }}|j7                  ||j                  d   d      }t%        j,                  d|j                  d   |j                        }|j/                  d      }t1        | j
                  |d |d d 	      } | j8                  d|||d|	}| j)                  |d d dd f   |      }t3        ||j                  d   |d| j
                  j4                  |
j                         } | j:                  d||||||||d|	}t=        ||      S )NrL  rR  z0input_ids is required for entropy-based patching)rI  rJ  rv   patching_batch_sizer(   r   r&   r   r*   rM  T)r]   r^   r_   r`   ra   r'   )r  r  r   r  r  r^   r]   r)   )rC  r   r  F)r  r  r  r   r  r   r   r  )last_hidden_stater   r   )rf   r1   r\   rU   rK   r   rL   rM   rN   patching_moders  rI  patching_thresholdrv   r{  r(   r'   r   r-   rT  _patch_ids_from_lengthsrO  r0   rc   r	   rt   ra   r   r<  r   r
   )r   r  ru   r   r  r   r  r   r   r   encoder_embedsrB   r_   r   r(   r'   r]   rU  rV  cross_attn_mask_encencoder_hidden_statesr%  global_cache_positionglobal_position_idsglobal_causal_maskglobal_hidden_statesdecoder_patch_idscross_attn_mask_decoutputs                                r7   r   zBltModel.forward"  s    -t";<YZZ $*N-:-@-@*J*3//'J4""//@@8899N  {{((I5$,,:R$$%WXX&*ll#{{55"kk<<%)[[%A%A(,(G(G$++ '3 '#=! .7-B))H\H\+4+@	mFYFY 5JJ
Oa,?@V\]KK00! 00P	!CRC^==?de"\\ "2^5I5I!5L"LUcUjUjN )33A6L(;;'))+%
 B%++A.+#11 &&
 7Id6H6H 	7
(&%#6%++A.	7
 	7
33  488]EXEXYZE[]_` %Q0D0J0J10MVjVqVq r3==a@/;;-0 
  7t66  
--, 
 	 
 !88q!"u9M_A'%++A.+$11 &&
 $## 

/-&%+)#6

 

 '$+
 	
r9   c                 .    | j                   j                  S ry   rU   rR   )r   s    r7   get_input_embeddingszBltModel.get_input_embeddings  s    !!...r9   c                 &    || j                   _        y ry   r  )r   values     r7   set_input_embeddingszBltModel.set_input_embeddings  s    */'r9   rC   c                    |j                   d   }t        j                  t        j                  |d|j                  |j
                        |j                  d      d d d df   gd      }t        j                  ||j
                        }|j                  d      |j                  d      j                  d      k  j                  d      dz
  S )Nr   r   r&   r)   r+   r*   )
r1   r-   r@   r?   r'   r(   cumsumr0   rc   r2   )r   ru   rC   rB   rc  token_positionss         r7   r  z BltModel._patch_ids_from_lengths  s    "((+
yyJ1D1D]MaMab$$$,QV4 
  ,,w}7K7KL&&q)_-F-Fq-I-S-STV-WW\\ac\dghhhr9   r8  )r   r   r   r   r   r   r   r-   r   r   r   rh  ri   r   r   r
   r   r  r  r   r  r   r   s   @r7   rl  rl    s+   y (  15041537+/59$(59~
E,,-~
  -~
 !.	~

 u//0~
 "%~
   1 12~
 D>~
 !!1!12~
 +,~
 
!~
 ~
@/0
iU\\ 
iC 
iTYT`T` 
ir9   rl  c                       e Zd ZU eed<   dZdZdgZdef fdZ	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     d	e	e
j                     d
e	e
j                     de	ee
j                  e
j                  f      de	eeee
j"                     f      de	e
j"                     de	e
j                     de	e   de	e
j                     deee
j                  f   dee   deeef   fdZ xZS )BltForCausalLMr   Fmodelzlm_head.weightc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  j                  |j                  d      | _	        | j                          y )NFr?  )r   r   r  rl  r  r	  r  rp  r   rH  r  )r   r   r   s     r7   r   zBltForCausalLM.__init__  sZ      ++f%
yy!6!6!B!BFDUDU\abr9   r  r   r  r   rp   full_text_row_masked_out_maskr   r  labelsr   r   logits_to_keepr   rO   c                     | j                   d||||||||
|d	|}|j                  }t        |t              rt	        | d       n|}| j                  |d d |d d f         j                         }d }|	 | j                  ||	| j                  fi |}t        |||j                  |j                  |j                        S )N)	r  r   r  rp   r  r   r  r   r   )lossr7  r   r   r   r   )r  r|  
isinstancer   slicerH  ri  loss_functionr  r   r   r   r   )r   r  r   r  r   rp   r  r   r  r  r   r   r  r   outputsr   slice_indicesr7  r  s                      r7   r   zBltForCausalLM.forward  s    " $** 
)%!5*G+')
 
  118B>SV8W~ot4]kmA}a,?@AGGI%4%%ffdooPPD%#33!//))
 	
r9   )NNNNNNNNNNNr   )r   r   r   r   r   _can_compile_fullgraphbase_model_prefix_tied_weights_keysr   r   r-   r   r   tupler   r   listrh  ri   r   r   r   r   r   r   r   s   @r7   r  r    s   "*+y  151537=A;?UYKO59-1$(5934,
E,,-,
 !.,
 u//0	,

 !))9)9 :,
 'u'7'78,
 (0ellELL6P0Q'R,
 "%tE4E4E/F(F"GH,
   1 12,
 ))*,
 D>,
 !!1!12,
 c5<</0,
 +,,
 
u,,	-,
r9   r  )r   rl  rF  r  )rQ   )r   rQ   i0u  )Or   typingr   r   r   r-   torch.distributionstorch.nnr	  torch.nn.functional
functionalr   cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   r   cohere2.modeling_cohere2r   r   mllama.modeling_mllamar   r   r   r   r   r   r   r   configuration_bltr   r    r!   r"   r#   
get_loggerr   loggerr   r8   r   rI   r  r  r\   float32ri   r'   r  rt   r   r   r   r   r   r   r   r   r   r1  r:  rF  rl  r  __all__r   r9   r7   <module>r     sg   C , ,      . / O 5 & @ @ ?	 	 	  
		H	%: :< \a||),9<UX$#,,# !## +.	#
 #'# $'# \\#T  %K ||K K  K  	K 
 K  ;;K  5<<%&K \) )RU )[`[g[g )X	] 		" 		/ 	`9 `
. 
4<)0 <)~ 7. 7 7*p"( p"fL( L^2- 2jF# FRfi! fiR9
& 9
xr9   