o
    T©Zhû  ã                   @   s   d dl Zdd„ Zddd„ZdS )é    Nc                 C   s¼   ||  |   ¡  }t |¡}t||  ¡  ƒ}|dkrX|| }t t |¡¡ddd… }|D ]+}t ||k¡\}	tt|	ƒ|ƒ}
|j	|	|
dd}	||	  d7  < ||
8 }|dkrW nq,| 
tj¡S )aÌ  Computes approximate mode of multivariate hypergeometric.
    This is an approximation to the mode of the multivariate
    hypergeometric given by class_counts and n_draws.
    It shouldn't be off by more than one.
    It is the mostly likely outcome of drawing n_draws many
    samples from the population given by class_counts.
    Args
    ----------
    class_counts : ndarray of int
        Population per class.
    n_draws : int
        Number of draws (samples to draw) from the overall population.
    rng : random state
        Used to break ties.
    Returns
    -------
    sampled_classes : ndarray of int
        Number of samples drawn from each class.
        np.sum(sampled_classes) == n_draws

    r   NéÿÿÿÿF)ÚsizeÚreplaceé   )ÚsumÚnpÚfloorÚintÚsortÚuniqueÚwhereÚminÚlenÚchoiceZastypeZint64)Úclass_countsZn_drawsÚrngZ
continuousZflooredZneed_to_addÚ	remainderÚvaluesÚvalueZindsZadd_now© r   úN/var/www/html/lang_env/lib/python3.10/site-packages/datasets/utils/stratify.pyÚapproximate_mode   s    
ÿr   é
   c              	   c   sP   t j| dd\}}|jd }t  |¡}t  |¡dk rtdƒ‚||k r+td||f ƒ‚||k r7td||f ƒ‚t  t j|dd	t  |¡d
d… ¡}	t	|ƒD ]X}
t
|||ƒ}|| }t
|||ƒ}g }g }t	|ƒD ].}| || ¡}|	| j|dd}| |d
|| … ¡ | ||| || ||  … ¡ qg| |¡}| |¡}||fV  qMd
S )aŒ  

    Provides train/test indices to split data in train/test sets.
    It's reference is taken from StratifiedShuffleSplit implementation
    of scikit-learn library.

    Args
    ----------

    n_train : int,
        represents the absolute number of train samples.

    n_test : int,
        represents the absolute number of test samples.

    random_state : int or RandomState instance, default=None
        Controls the randomness of the training and testing indices produced.
        Pass an int for reproducible output across multiple function calls.

    n_splits : int, default=10
        Number of re-shuffling & splitting iterations.
    T)Zreturn_inverser   é   zMinimum class count errorzLThe train_size = %d should be greater or equal to the number of classes = %dzKThe test_size = %d should be greater or equal to the number of classes = %dZ	mergesort)ÚkindNr   Zclip)Úmode)r   r   ÚshapeZbincountr   Ú
ValueErrorÚsplitZargsortZcumsumÚranger   ÚpermutationZtakeÚextend)ÚyZn_trainZn_testr   Zn_splitsÚclassesZ	y_indicesZ	n_classesr   Zclass_indicesÚ_Zn_iZclass_counts_remainingZt_iÚtrainÚtestÚir    Zperm_indices_class_ir   r   r   Ú)stratified_shuffle_split_generate_indices6   s<   €


ÿ
ÿ$$

ðr(   )r   )Únumpyr   r   r(   r   r   r   r   Ú<module>   s    2