o
    +if                      @   sj   d Z ddlmZ ddlmZ deeeef defddZG dd	 d	e	Z
G d
d de	ZG dd dZdS )a  
Hashing function for dataset keys using `hashlib.md5`

Requirements for the hash function:

- Provides a uniformly distributed hash from random space
- Adequately fast speed
- Working with multiple input types (in this case, `str`, `int` or `bytes`)
- Should be platform independent (generates same hash on different OS and systems)

The hashing function provides a unique 128-bit integer hash of the key provided.

The split name is being used here as the hash salt to avoid having same hashes
in different splits due to same keys
    )Union)insecure_hashlib	hash_datareturnc                 C   sL   t | tr| S t | tr| dd} nt | trt| } nt| | dS )z|
    Returns the input hash_data in its bytes form

    Args:
    hash_data: the hash salt/key to be converted to bytes
    \/zutf-8)
isinstancebytesstrreplaceintInvalidKeyErrorencode)r    r   I/var/www/html/corbot_env/lib/python3.10/site-packages/datasets/keyhash.py	_as_bytes&   s   




r   c                       s    e Zd ZdZ fddZ  ZS )r   z6Raises an error when given key is of invalid datatype.c                    sD   d| _ d| dt| | _d| _t | j  | j | j  d S )Nz7
FAILURE TO GENERATE DATASET: Invalid key type detectedz
Found Key z	 of type z-
Keys should be either str, int or bytes type)prefixtypeerr_msgsuffixsuper__init__)selfr   	__class__r   r   r   @   s   "zInvalidKeyError.__init____name__
__module____qualname____doc__r   __classcell__r   r   r   r   r   =   s    r   c                       s"   e Zd ZdZd fdd	Z  ZS )DuplicatedKeysErrorz(Raise an error when duplicate key found. c                    s   || _ || _|| _d| _t|dkrdd| d| | _ndd|d d  dt|d  d| | _|r<d| nd	| _t 	| j | j | j  d S )
Nz3Found multiple examples generated with the same key   z
The examples at index z, z have the key z... (z more) have the key 
r"   )
keyduplicate_key_indicesfix_msgr   lenjoinr   r   r   r   )r   r%   r&   r'   r   r   r   r   J   s   ."zDuplicatedKeysError.__init__)r"   r   r   r   r   r   r!   G   s    r!   c                   @   s:   e Zd ZdZdefddZdeeeef defddZ	d	S )
	KeyHasherz,KeyHasher class for providing hash using md5	hash_saltc                 C   s   t t|| _d S )N)r   md5r   
_split_md5)r   r+   r   r   r   r   Z   s   zKeyHasher.__init__r%   r   c                 C   s*   | j  }t|}|| t| dS )zReturns 128-bits unique hash of input key

        Args:
        key: the input key to be hashed (should be str, int or bytes)

        Returns: 128-bit int hash key   )r-   copyr   updater   	hexdigest)r   r%   r,   byte_keyr   r   r   hash]   s   

zKeyHasher.hashN)
r   r   r   r   r
   r   r   r   r	   r3   r   r   r   r   r*   W   s     r*   N)r   typingr   huggingface_hub.utilsr   r
   r   r	   r   	Exceptionr   r!   r*   r   r   r   r   <module>   s   
