
    <h>                        S SK r S SKJr  S SKJrJrJr  S SKrSSK	J
r
Jr  \" 5       (       a  S SKr\
" 5       (       a  S SKrSSKJrJr  SSKJr  SSKJrJrJrJr  SS	KJrJr   " S
 S\SS9r " S S\SS9r " S S\5      rS/rg)    N)Path)AnyOptionalUnion   )is_soundfile_availableis_torch_available)
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   0    \ rS rSr% \\\\4      \S'   Sr	g)CsmAudioKwargs%   encoded_length_kwargs N)
__name__
__module____qualname____firstlineno__r   dictstrr   __annotations____static_attributes__r       ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/csm/processing_csm.pyr   r   %   s    #DcN33r    r   F)totalc                   T    \ rS rSr% \\S'   SSSS./ SQ/ SQ/ S	QSS
.SS.SS0S.rSrg)CsmProcessorKwargs)   audio_kwargsTleftF)paddingpadding_sideadd_special_tokens)   r         r   r,   
   r   r,      r   r,      r      )r,   r,   r,   r1   r,   r,      r,   r,      r,   r,   r-   r,      )r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   r,   )kernel_sizesstrides	dilationsuse_causal_convi]  )r   sampling_ratereturn_tensorspt)text_kwargsr&   common_kwargsr   N)r   r   r   r   r   r   	_defaultsr   r   r    r!   r$   r$   )   sG       ""'
 !QHJ#'	& #
 +D1Ir    r$   c                      ^  \ rS rSrSrSS/rSrSr SU 4S jjr\	SS j5       r
S	\S
\\\\\\\4      4   S\\   4S jr   SS\\\\\\   \\   4      S	\\   S\\   S\\   S\\   4
S jjrSrU =r$ )CsmProcessor>   a  
Constructs a Csm processor which wraps [`EncodecFeatureExtractor`] and
[`PretrainedTokenizerFast`] into a single processor that inherits both the audio feature extraction and
tokenizer functionalities. See the [`~CsmProcessor.__call__`] for more
information.
The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
    ```python
    from transformers import CsmProcessor
    from datasets import load_dataset

    ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
    audio = ds[0]["audio"]["array"]

    processor = CsmProcessor.from_pretrained("sesame/csm-1b")

    processor(
        text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"],
        audio=audio,
        text_kwargs = {"padding": False},
        audio_kwargs = {"sampling_rate": 16000},
        common_kwargs = {"return_tensors": "pt"},
    )
    # this should error out because EncodecFeatureExtractor expects a 24kHz audio :)
    ```

Args:
    feature_extractor ([`EncodecFeatureExtractor`]):
        The feature extractor is a required input.
    tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
        The tokenizer is a required input.
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.

feature_extractor	tokenizerEncodecFeatureExtractorPreTrainedTokenizerFastc                   > [        US5      (       d(  SU l        UR                  U R                  5      U l        O"UR                  U l        UR                  U l        [        US5      (       d(  SU l        UR                  U R                  5      U l        O"UR                  U l        UR
                  U l        [        TU ]  XUS9  g )Naudio_tokenz	<|AUDIO|>audio_eos_tokenz<|audio_eos|>)chat_template)hasattrrG   convert_tokens_to_idsaudio_token_idrH   audio_eos_token_idsuper__init__)selfrB   rC   rI   	__class__s       r!   rO   CsmProcessor.__init__f   s     y-00*D"+"A"A$BRBR"SD(44D"+":":Dy"344#2D &/&E&EdFZFZ&[D##,#<#<D &/&B&BD#*]Sr    c                 B   U nUb	  Ub  Ub  Uc  U$ [        XU5       H~  u  pgnUS-
  U-  S-   n	Xg-
  n
U
S-  nX-
  nXY-
  U
-   U-  S-   n[        R                  " U5      S-
  nX-  U-   U
-
  nX-
  nU(       a  U
nUnOUnX-   nX\-   U-   nXXUS-
  -  -
  S-
  U-  S-   nM     U$ )aD  
Compute the length of the encoded audio sequence.

Args:
    audio_length (int): The length of the audio sequence.
    kernel_sizes (list[int]): The kernel sizes for the convolutional layers.
    strides (list[int]): The strides for the convolutional layers.
    use_causal_conv (bool): Whether to use causal convolutions.
r,   r4   )zipmathceil)audio_lengthr5   r6   r7   r8   
cur_lengthkernel_sizestridedilationeffective_kernel_sizepadding_totalpadding_rightpadding_leftn_framesideal_lengthextra_paddings                   r!   _get_encoded_length CsmProcessor._get_encoded_length|   s     "
7?i6G?Kb-0	-R)K%01_$@1$D!'0M)Q.M(8L":]JfTWXXHyy*Q.H#,{:]JL(5M, -+ - =#2]BJ$;?'CCaGFRUVVJ' .S* r    audiosaving_pathkwargsc                 z   [        5       (       d  [        S5      e[        U5      n[        U[        [
        45      (       a  U/nO=[        U[        [        45      (       a  [        S U 5       5      (       d  [        S5      e[        U5      [        U5      :w  a  [        S5      eU R                  " [        40 UD6nUS   nUS   n[        X5       Hg  u  px[        U[        R                  5      (       a,  UR!                  5       R#                  5       R%                  5       n[&        R(                  " XU5        Mi     g )Nz/Please install `soundfile` to save audio files.c              3   L   #    U H  n[        U[        [        45      v   M     g 7fN)
isinstancer   r   ).0ps     r!   	<genexpr>*CsmProcessor.save_audio.<locals>.<genexpr>   s#     @qep`aAPSUY{A[A[eps   "$zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer&   r9   )r   ImportErrorr   rk   r   r   listtupleall
ValueErrorlen_merge_kwargsr$   rT   torchTensorcpufloatnumpysfwrite)	rP   re   rf   rg   output_kwargsr&   r9   audio_valuerm   s	            r!   
save_audioCsmProcessor.save_audio   s	    &''OPP #5) kC;//&-K[4-88S@qep@q=q=q`aau:[))TUU**

 %^4$_5!%5NK+u||44)oo/557==?HHQ]3 6r    textoutput_labelsdepth_decoder_labels_ratioc                 
   U R                   " [        4SU R                  R                  0UD6nUS   nUS   nUS   n	U	R	                  SS5      n
U
S:w  a"  [        U R                  R                   S35      e[        U[        5      (       a  U/nO=[        U[        [        45      (       a  [        S	 U 5       5      (       d  [        S
5      eU Vs/ sH  oR                  U R                  5      PM     nnSnUb  [        U5      n[!        U5      n[#        U5      S:  a/  U[#        U5      :w  a   Uc  [        S5      e[        SU SU S35      eUGb  UR	                  S0 5      nU Vs/ sH"  oR$                  " UR&                  S   40 UD6PM$     nnUR)                  5       n/ nU H  n/ nU R                  U;   a`  UR	                  S5      nU R                  U-  nUR+                  U5        UR-                  U R                  SS5      nU R                  U;   a  M`  SU;   a*  UR-                  SUR	                  S5      S5      nSU;   a  M*  UR+                  U5        M     UnU R                  " U40 UD6n0 nUR/                  U5        UGb  UR	                  SS5        / / nnSnU GH(  nUS:X  aM  UR+                  [0        R2                  " S5      5        UR+                  [4        R6                  " S/5      5        MW  UR+                  [0        R8                  " UUUU-     Vs/ sHB  n[        U[4        R:                  5      (       a  UR=                  5       R?                  5       OUPMD     snSS95        UR+                  [4        R6                  " UUUU-     Vs/ sH  nUR&                  S   PM     sn5      RA                  SS95        UU-  nGM+     U RB                  " U40 UD6nUR	                  SS5        UR/                  U5        [E        S U 5       5      nU Vs/ sH>  n[4        RF                  RH                  RK                  USUUR&                  S   -
  4SS9PM@     nn[4        RL                  " USS9US'   U(       a  US   U RN                  :H  RQ                  5       n U R&                  S   n!US::  a.  [4        RR                  " U!5      S[U        U!SU-
  -  5       n"U U"   n#OU n#[4        RV                  " US   U RN                  :H  US   U RX                  :H  -  US   S5      n$SU$U#SS2S4   U#SS2S4   4'   U$US'   [[        UU
S 9$ s  snf s  snf s  snf s  snf s  snf )!a	  
Main method to prepare text(s) and audio to be fed as input to the model. This method forwards the `text`
arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode
the text. To prepare the audio, this method forwards the `audio` arguments to
EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`]. Please refer
to the docstring of the above two methods for more information.

Args:
    audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
        tensor.
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    output_labels (bool, *optional*, default=False):
        Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`.
        - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
        - `-100` will be ignored in the loss computation
        - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)
    depth_decoder_labels_ratio (float, *optional*, default=1.0):
        The ratio of audio frames to keep for the depth decoder labels.
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:
            - `'tf'`: Return TensorFlow `tf.constant` objects.
            - `'pt'`: Return PyTorch `torch.Tensor` objects.
            - `'np'`: Return NumPy `np.ndarray` objects.
            - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **labels** -- List of labels for the audio frames. Returned when `output_labels=True`.
tokenizer_init_kwargsr<   r&   r=   r:   Nr;   z% only supports `return_tensors='pt'`.c              3   @   #    U H  n[        U[        5      v   M     g 7frj   )rk   r   )rl   ts     r!   rn   (CsmProcessor.__call__.<locals>.<genexpr>  s     9[VZQR*Q:L:LVZs   zAInvalid input text. Please provide a string, or a list of stringsr   z@No audio were provided, but there are audio tokens in the promptz)The number of audio tokens in each text (z7) should be the same as the number of provided audios (z).r   z<placeholder>r,   return_attention_mask)axis)dimpadding_maskc              3   <   #    U H  oR                   S    v   M     g7f)r   N)shape)rl   cut_idxss     r!   rn   r   O  s     R=Q..,=Qs   )valueinput_values_cutoffs	input_ids      ?iilabels)datatensor_type).rv   r$   rC   init_kwargspoprt   rQ   r   rk   r   rq   rr   rs   countrG   r   ru   sumrc   r   copyappendreplaceupdatenpzerosrw   tensorconcatenaterx   ry   r{   cumsumrB   maxnn
functionalpadstackrL   nonzerorandpermintwhererM   r   )%rP   r   re   r   r   rg   r~   r<   r&   r=   r:   r   n_audio_in_textn_audior   audio_arraynum_audio_tokens_listnum_audio_tokens_list_copyexpanded_textsamplereplace_strnum_audio_tokensexpanded_audio_tokenencodingr   concatenated_audior   offsetelaudio_inputsmax_lenr   audio_frame_idxsn_audio_frames	rand_idxsskip_frames_idxsr   s%                                        r!   __call__CsmProcessor.__call__   s   ^ **
"&.."<"<
 
 $M2$^4%o6&**+;TBT! 7 788]^__dC  6DTD%=11c9[VZ9[6[6[`aa>BCd774#3#34dC&u-E%jG!#33G(G} !cdd ??P Q229"> 
 $0$4$45Lb$Q!lq%lq]h(():):2)>XBWXlq " % *?)C)C)E& M &&&0'A'E'Ea'H$+/+;+;>N+N(&&';<#^^D,<,<oqQF &&&0 &/#^^O[__Q=OQRSF &/$$V,  !D>>$6+6H4d;792 4F*a<&--bhhqk:(//bT0BC&-- +0'9I*J*JB 5?r5<<4P4P 0VX X*J "$ )//U6FU\L\=]%^=]rbhhrl=]%^_ffkmfn g%F# +&  112DUUL^T2KK% R=QRRG !5$ 4H ##''1gr@R6R2S[]'^ 4 ! $ ,1;;7KQR+SD'( $[ 1T5H5H HQQS-33A6N)S0!NN>:;sSSTWqSqAr=st	#3I#> #3 [[k"d&9&99d;>OSWSjSj>jk[!F
 FJF#AqD)+;AqD+AAB#DN>BBG D$%L &_$s    #U)(U#AUU AU )rH   rM   rG   rL   rj   )NNNN)NFr   )r   r   r   r   __doc__
attributesfeature_extractor_classtokenizer_classrO   staticmethodrc   r
   r   r   r   rq   r   r$   r   r   r   r   boolrz   r   r   __classcell__)rQ   s   @r!   r@   r@   >   s   !F &{3J7/O 	T, $ $L 4 4 3d5d+;&<<= 4 +,	 4J '+(-69dCuY(94	?DQbLccdedC 
#dC  ~	dC
 %-UOdC +,dC dCr    r@   ) rU   pathlibr   typingr   r   r   r{   r   utilsr   r	   rw   	soundfiler|   audio_utilsr
   r   feature_extraction_utilsr   processing_utilsr   r   r   r   tokenization_utils_baser   r   r   r$   r@   __all__r   r    r!   <module>r      sz       ' '  ?  9 4 U U C4[ 4) *kC> kC\	 
r    