o
    h"                 
   @   sb  d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	Z	d dl	m
Z
mZ d dlmZ d dlmZ d dlmZ dd	lmZ g d
Ze
je
jdZd&dedededefddZededd&dedededefddZG dd deZG dd deZG dd deZG dd deZG dd deZ G d d! d!e Z!G d"d# d#e Z"G d$d% d%e Z#dS )'    N)Optionaloverload)
deprecated)_VFTensor)init)	Parameter)PackedSequence   )Module)RNNBaseRNNLSTMGRURNNCellBaseRNNCellLSTMCellGRUCell)RNN_TANHRNN_RELUtensorpermutationdimreturnc                 C   s   |  ||S N)index_selectr   r   r    r   h/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/torch/nn/modules/rnn.py_apply_permutation$   s   r   z]`apply_permutation` is deprecated, please use `tensor.index_select(dim, permutation)` instead)categoryc                 C   s   t | ||S r   r   r   r   r   r   apply_permutation(   s   r"   c                       s  e Zd ZU dZg dZdgZeed< eed< eed< eed< e	ed< e	ed	< e
ed
< e	ed< eed< 								d@dedededede	d	e	d
e
de	deddf fddZdd Z fddZdAddZdB fdd	ZdAddZd ed!ee ddfd"d#Zd ed!ee deeeef fd$d%Z	&dCd'ed(eeeef d)eddfd*d+Zd,d- Zd ed.ed!ee fd/d0Zd'ed1ee fd2d3Zdefd4d5Zd6d7 Zd8d9 Z fd:d;Zedeee   fd<d=Z! fd>d?Z"  Z#S )Dr   a  Base class for RNN modules (RNN, LSTM, GRU).

    Implements aspects of RNNs shared by the RNN, LSTM, and GRU classes, such as module initialization
    and utility methods for parameter storage management.

    .. note::
        The forward method is not implemented by the RNNBase class.

    .. note::
        LSTM and GRU classes override some methods implemented by RNNBase.
    )	mode
input_sizehidden_size
num_layersbiasbatch_firstdropoutbidirectional	proj_sizeall_weightsr#   r$   r%   r&   r'   r(   r)   r*   r+   r
   TF        r   Nr   c                    s$  |
|d}t    || _|| _|| _|| _|| _|| _t|| _	|| _
|	| _g | _|r.dnd}t|tjrJd|  kr@dkrJn tdt|trNtd|dkra|dkratd| d|  t|tsptdt|j |dkrxtd	|dkrtd
|	dk rtd|	|krtd|dkrd| }n|dkrd| }n|dkr|}n|dkr|}ntd| g | _g | _t|D ]ǉ t|D ]}|	dkr|	n|} dkr|n|| }ttj||ffi |}ttj||ffi |}ttj|fi |}ttj|fi |}d}| jdkr!|r||||f}n"||f}nttj|	|ffi |}|r9|||||f}n|||f}|dkrEdndddg}|rT|ddg7 }| jdkr_|dg7 } fdd|D }t||D ]\}}t| || qn| j | | j!| qq| "  | #  d S )Ndevicedtype   r
   r   zbdropout should be a number in range [0, 1] representing the probability of an element being zeroedzdropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=z and num_layers=z(hidden_size should be of type int, got: z%hidden_size must be greater than zeroz$num_layers must be greater than zerozEproj_size should be a positive integer or zero to disable projectionsz,proj_size has to be smaller than hidden_sizer      r      r   r   zUnrecognized RNN mode: r   _reverse weight_ih_l{}{}weight_hh_l{}{}bias_ih_l{}{}bias_hh_l{}{}weight_hr_l{}{}c                       g | ]}|  qS r   format.0xlayersuffixr   r   
<listcomp>       z$RNNBase.__init__.<locals>.<listcomp>)$super__init__r#   r$   r%   r&   r'   r(   floatr)   r*   r+   _flat_weight_refs
isinstancenumbersNumberbool
ValueErrorwarningswarnint	TypeErrortype__name___flat_weights_names_all_weightsranger   torchemptyzipsetattrextendappend_init_flat_weightsreset_parameters)selfr#   r$   r%   r&   r'   r(   r)   r*   r+   r/   r0   factory_kwargsnum_directions	gate_size	directionreal_hidden_sizelayer_input_sizew_ihw_hhb_ihb_hhlayer_paramsw_hrparam_namesnameparam	__class__rA   r   rG   T   s   









,zRNNBase.__init__c                    s4    fdd j D  _dd  jD  _   d S )Nc                    $   g | ]}t  |rt |nd qS r   hasattrgetattrr?   wnr`   r   r   rD          z.RNNBase._init_flat_weights.<locals>.<listcomp>c                 S   "   g | ]}|d urt |nd qS r   weakrefrefr?   wr   r   r   rD          )rU   _flat_weightsrI   flatten_parametersrx   r   rx   r   r^      s   
zRNNBase._init_flat_weightsc                    s<   t | dr|| jv r| j|}|| j|< t || d S )NrU   )rt   rU   indexr   rF   __setattr__)r`   attrvalueidxrp   r   r   r      s   
zRNNBase.__setattr__c                 C   s  t | jt | jkrdS | jD ]
}t|ts dS q| jd }|j}| jD ]}t|tr;|j|kr;|jr;tjj	
|s> dS q%dd | jD }t |t | jkrRdS tj|_ ddlm  m	  m} t 6 t r| jrsdnd}| jdkr~|d7 }t| j|| j|| j| j| j| j| jt| j	 W d   n1 sw   Y  W d   dS W d   dS 1 sw   Y  dS )zReset parameter data pointer so that they can use faster code paths.

        Right now, this works only if the module is on the GPU and cuDNN is enabled.
        Otherwise, it's a no-op.
        Nr   c                 S   s   h | ]}|  qS r   )data_ptr)r?   pr   r   r   	<setcomp>   s    z-RNNBase.flatten_parameters.<locals>.<setcomp>r2   r1   r
   )lenr   rU   rJ   r   r0   is_cudarX   backendscudnnis_acceptablecuda	device_oftorch.backends.cudnn.rnnrnnno_grad_use_cudnn_rnn_flatten_weightr'   r+   _cudnn_rnn_flatten_weightr$   get_cudnn_moder#   r%   r&   r(   rM   r*   )r`   r   first_fwr0   fwunique_data_ptrsr   num_weightsr   r   r   r      s\   








"zRNNBase.flatten_parametersc                    s    g | _ t ||}|   |S r   )rI   rF   _applyr^   )r`   fnrecurseretrp   r   r   r     s   zRNNBase._applyc                 C   @   | j dkrdt| j  nd}|  D ]
}t|| | qd S Nr   g      ?r%   mathsqrt
parametersr   uniform_r`   stdvweightr   r   r   r_   &     zRNNBase.reset_parametersinputbatch_sizesc                 C   s   t j s"|j| jd jkr"t j s"td| jd j d|j |d ur(dnd}| |kr<t	d| d|  | j
|dkrRt	d	| j
 d
|d d S )Nr   zinput must have the type z, got type r1   r3   zinput must have z dimensions, got z5input.size(-1) must be equal to input_size. Expected z, got )rX   jitis_scriptingr0   r   _C_is_any_autocast_enabledrN   r   RuntimeErrorr$   size)r`   r   r   expected_input_dimr   r   r   check_input+  s"   
zRNNBase.check_inputc                 C   st   |d urt |d }n| jr|dn|d}| jrdnd}| jdkr/| j| || jf}|S | j| || jf}|S Nr   r
   r1   )rQ   r(   r   r*   r+   r&   r%   r`   r   r   
mini_batchrb   expected_hidden_sizer   r   r   get_expected_hidden_size>  s   
z RNNBase.get_expected_hidden_sizeExpected hidden size {}, got {}hxr   msgc                 C   s(   |  |krt||t|  d S r   )r   r   r=   list)r`   r   r   r   r   r   r   check_hidden_sizeT  s   zRNNBase.check_hidden_sizec                 C   s\   d}t | j| jD ]"\}}t| |rt| |nd }|d ur+|d ur+| |ur+d} |S q	|S )NFT)rZ   rI   rU   rt   ru   )r`   weights_changedr}   rn   r   r   r   r   _weights_have_changed]  s    zRNNBase._weights_have_changedhiddenc                 C   s(   |  || | ||}| || d S r   )r   r   r   )r`   r   r   r   r   r   r   r   check_forward_argsh  s   zRNNBase.check_forward_argsr   c                 C   s   |d u r|S t ||S r   r!   r`   r   r   r   r   r   permute_hiddenp  s   
zRNNBase.permute_hiddenc                 C   s   d}| j dkr|d7 }| jdkr|d7 }| jdur|d7 }| jdur&|d	7 }| jdkr/|d
7 }| jdur8|d7 }|jdi | jS )N{input_size}, {hidden_size}r   z, proj_size={proj_size}r
   z, num_layers={num_layers}T, bias={bias}Fz, batch_first={batch_first}z, dropout={dropout}z, bidirectional={bidirectional}r   )r+   r&   r'   r(   r)   r*   r=   __dict__r`   sr   r   r   
extra_repru  s   





zRNNBase.extra_reprc                 C   s&   t j s|  r|   d S d S d S r   )rX   r   r   r   r^   rx   r   r   r   _update_flat_weights  s
   
zRNNBase._update_flat_weightsc                 C   s   |    | j }|d= |S )NrI   )r   r   copy)r`   stater   r   r   __getstate__  s   
zRNNBase.__getstate__c                    s  t  | d|v r|d _d|vrd_tjd d ts͈j}jr(dnd}g _g _t	|D ] t	|D ]}|dkrBdndg d} fd	d
|D }j
rjdkri j|g7  _j| q: j|d d g7  _j|d d  q:jdkr j|d d g|dd  g 7  _j|d d |dd  g  q: j|d d g7  _j|d d  q:q4fdd
jD _dd
 jD _d S )Nr,   r+   r   r1   r
   r4   r5   )r6   r7   r8   r9   r:   c                    r;   r   r<   r>   rA   r   r   rD     rE   z(RNNBase.__setstate__.<locals>.<listcomp>r2   r   c                    rr   r   rs   rv   rx   r   r   rD     ry   c                 S   rz   r   r{   r~   r   r   r   rD     r   )rF   __setstate__rV   r+   rJ   strr&   r*   rU   rW   r'   r\   r   rI   )r`   dr&   rb   rd   weightsrp   )rB   r`   rC   r   r     sF   


&
zRNNBase.__setstate__c                    s    fdd j D S )Nc                    s   g | ]} fd d|D qS )c                    s   g | ]}t  |qS r   )ru   )r?   r   rx   r   r   rD     s    z2RNNBase.all_weights.<locals>.<listcomp>.<listcomp>r   )r?   r   rx   r   r   rD     s    z'RNNBase.all_weights.<locals>.<listcomp>)rV   rx   r   rx   r   r,     s   
zRNNBase.all_weightsc                    s.   t   }|jd d  |_|jd d  |_|S r   )rF   _replicate_for_data_parallelr   rU   )r`   replicarp   r   r   r     s   
z$RNNBase._replicate_for_data_parallelr
   TFr-   Fr   NNr   N)T)r   )$rT   
__module____qualname____doc____constants____jit_unused_properties__r   __annotations__rQ   rM   rH   rG   r^   r   r   r   r_   r   r   r   tupler   r   r   r   r   r   r   r   r   propertyr   r   r,   r   __classcell__r   r   rp   r   r   0   s   
 	
{

;


	
3r   c                       s   e Zd ZdZe								dded	ed
ededededededdfddZ	edd Z	 fddZ	ee
jj	ddedee deeef fddZee
jj	ddedee deeef fddZdddZ  ZS )r   a  __init__(input_size,hidden_size,num_layers=1,nonlinearity='tanh',bias=True,batch_first=False,dropout=0.0,bidirectional=False,device=None,dtype=None)

    Apply a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}`
    non-linearity to an input sequence. For each element in the input sequence,
    each layer computes the following function:

    .. math::
        h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})

    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
    the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the
    previous layer at time `t-1` or the initial hidden state at time `0`.
    If :attr:`nonlinearity` is ``'relu'``, then :math:`\text{ReLU}` is used instead of :math:`\tanh`.

    .. code-block:: python

        # Efficient implementation equivalent to the following with bidirectional=False
        def forward(x, hx=None):
            if batch_first:
                x = x.transpose(0, 1)
            seq_len, batch_size, _ = x.size()
            if hx is None:
                hx = torch.zeros(num_layers, batch_size, hidden_size)
            h_t_minus_1 = hx
            h_t = hx
            output = []
            for t in range(seq_len):
                for layer in range(num_layers):
                    h_t[layer] = torch.tanh(
                        x[t] @ weight_ih[layer].T
                        + bias_ih[layer]
                        + h_t_minus_1[layer] @ weight_hh[layer].T
                        + bias_hh[layer]
                    )
                output.append(h_t[-1])
                h_t_minus_1 = h_t
            output = torch.stack(output)
            if batch_first:
                output = output.transpose(0, 1)
            return output, h_t

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
            would mean stacking two RNNs together to form a `stacked RNN`,
            with the second RNN taking in outputs of the first RNN and
            computing the final results. Default: 1
        nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        batch_first: If ``True``, then the input and output tensors are provided
            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
            Note that this does not apply to hidden or cell states. See the
            Inputs/Outputs sections below for details.  Default: ``False``
        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
            RNN layer except the last layer, with dropout probability equal to
            :attr:`dropout`. Default: 0
        bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False``

    Inputs: input, hx
        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
          :math:`(L, N, H_{in})` when ``batch_first=False`` or
          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
          the input sequence.  The input can also be a packed variable length sequence.
          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
          :func:`torch.nn.utils.rnn.pack_sequence` for details.
        * **hx**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the initial hidden
          state for the input sequence batch. Defaults to zeros if not provided.

        where:

        .. math::
            \begin{aligned}
                N ={} & \text{batch size} \\
                L ={} & \text{sequence length} \\
                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
                H_{in} ={} & \text{input\_size} \\
                H_{out} ={} & \text{hidden\_size}
            \end{aligned}

    Outputs: output, h_n
        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
          `(h_t)` from the last layer of the RNN, for each `t`. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
          will also be a packed sequence.
        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state
          for each element in the batch.

    Attributes:
        weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
            of shape `(hidden_size, input_size)` for `k = 0`. Otherwise, the shape is
            `(hidden_size, num_directions * hidden_size)`
        weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer,
            of shape `(hidden_size, hidden_size)`
        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer,
            of shape `(hidden_size)`
        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer,
            of shape `(hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    .. note::
        For bidirectional RNNs, forward and backward are directions 0 and 1 respectively.
        Example of splitting the output layers when ``batch_first=False``:
        ``output.view(seq_len, batch, num_directions, hidden_size)``.

    .. note::
        ``batch_first`` argument is ignored for unbatched inputs.

    .. include:: ../cudnn_rnn_determinism.rst

    .. include:: ../cudnn_persistent_rnn.rst

    Examples::

        >>> rnn = nn.RNN(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> output, hn = rnn(input, h0)
    r
   tanhTFr-   Nr$   r%   r&   nonlinearityr'   r(   r)   r*   r   c                 C      d S r   r   )r`   r$   r%   r&   r   r'   r(   r)   r*   r/   r0   r   r   r   rG   V     zRNN.__init__c                 O   r   r   r   r`   argskwargsr   r   r   rG   f     c                    s   d|v rt dt|dkr |d | _|d d |dd   }n|dd| _| jdkr/d}n| jdkr7d	}n	t d
| j dt j|g|R i | d S )Nr+   =proj_size argument is only supported for LSTM, not RNN or GRUr3   r2   r   r   r   relur   zUnknown nonlinearity 'z '. Select from 'tanh' or 'relu'.)rN   r   r   poprF   rG   )r`   r   r   r#   rp   r   r   rG   j  s    


r   r   c                 C   r   r   r   r`   r   r   r   r   r   forward~     zRNN.forwardc                 C   r   r   r   r   r   r   r   r     r   c                 C   s  |    | jr	dnd}|}t|tr8|\}}}}|d }|d u r1tj| j| || j|j|j	d}n| 
||}nd }| dvrJtd|  d| dk}	| jrUdnd}
|	sx||
}|d urw| dkrrtd	|  d
|d}n|d ur| dkrtd|  d
| jr|dn|d}d }d }|d u rtj| j| || j|j|j	d}n| 
||}|d usJ | ||| | jdks| jdksJ |d u r| jdkrt||| j| j| j| j| j| j| j	}nBt||| j| j| j| j| j| j| j	}n-| jdkrt|||| j| j| j| j| j| j	}nt|||| j| j| j| j| j| j	}|d }|d }t|trOt||||}|| 
||fS |	s\||
}|d}|| 
||fS )Nr1   r
   r   r0   r/   r1   r3   z(RNN: Expected input to be 2D or 3D, got zD tensor insteadr3   7For unbatched 2-D input, hx should also be 2-D but got 	-D tensor5For batched 3-D input, hx should also be 3-D but got r   r   )r   r*   rJ   r	   rX   zerosr&   r%   r0   r/   r   r   rN   r(   	unsqueezer   r   r   r#   r   rnn_tanhr   r'   r)   trainingrnn_relusqueeze)r`   r   r   rb   
orig_inputr   sorted_indicesunsorted_indicesmax_batch_size
is_batched	batch_dimresultoutputr   output_packedr   r   r   r     s   








)r
   r   TFr-   FNNr   )rT   r   r   r   r   rQ   r   rM   rH   rG   rX   _jit_internal_overload_methodr   r   r   r   r	   r   r   r   rp   r   r     sn     	


r   c                       sn  e Zd ZdZe								d$ded	ed
ededededededdfddZedd Z fddZde	de
e	 deeeef fddZde	dee	e	f de
e	 fddZdee	e	f de
e	 dee	e	f fddZeejj	d%de	de
ee	e	f  dee	ee	e	f f fd d!Zeejj	d%dede
ee	e	f  deeee	e	f f fd"d!Zd%d#d!Z  ZS )&r   a/%  __init__(input_size,hidden_size,num_layers=1,bias=True,batch_first=False,dropout=0.0,bidirectional=False,proj_size=0,device=None,dtype=None)

    Apply a multi-layer long short-term memory (LSTM) RNN to an input sequence.
    For each element in the input sequence, each layer computes the following
    function:

    .. math::
        \begin{array}{ll} \\
            i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
            f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
            o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
            c_t = f_t \odot c_{t-1} + i_t \odot g_t \\
            h_t = o_t \odot \tanh(c_t) \\
        \end{array}

    where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
    state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{t-1}`
    is the hidden state of the layer at time `t-1` or the initial hidden
    state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`,
    :math:`o_t` are the input, forget, cell, and output gates, respectively.
    :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.

    In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
    (:math:`l \ge 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
    variable which is :math:`0` with probability :attr:`dropout`.

    If ``proj_size > 0`` is specified, LSTM with projections will be used. This changes
    the LSTM cell in the following way. First, the dimension of :math:`h_t` will be changed from
    ``hidden_size`` to ``proj_size`` (dimensions of :math:`W_{hi}` will be changed accordingly).
    Second, the output hidden state of each layer will be multiplied by a learnable projection
    matrix: :math:`h_t = W_{hr}h_t`. Note that as a consequence of this, the output
    of LSTM network will be of different shape as well. See Inputs/Outputs sections below for exact
    dimensions of all variables. You can find more details in https://arxiv.org/abs/1402.1128.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
            would mean stacking two LSTMs together to form a `stacked LSTM`,
            with the second LSTM taking in outputs of the first LSTM and
            computing the final results. Default: 1
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        batch_first: If ``True``, then the input and output tensors are provided
            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
            Note that this does not apply to hidden or cell states. See the
            Inputs/Outputs sections below for details.  Default: ``False``
        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
            LSTM layer except the last layer, with dropout probability equal to
            :attr:`dropout`. Default: 0
        bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False``
        proj_size: If ``> 0``, will use LSTM with projections of corresponding size. Default: 0

    Inputs: input, (h_0, c_0)
        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
          :math:`(L, N, H_{in})` when ``batch_first=False`` or
          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
          the input sequence.  The input can also be a packed variable length sequence.
          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
          :func:`torch.nn.utils.rnn.pack_sequence` for details.
        * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the
          initial hidden state for each element in the input sequence.
          Defaults to zeros if (h_0, c_0) is not provided.
        * **c_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{cell})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{cell})` containing the
          initial cell state for each element in the input sequence.
          Defaults to zeros if (h_0, c_0) is not provided.

        where:

        .. math::
            \begin{aligned}
                N ={} & \text{batch size} \\
                L ={} & \text{sequence length} \\
                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
                H_{in} ={} & \text{input\_size} \\
                H_{cell} ={} & \text{hidden\_size} \\
                H_{out} ={} & \text{proj\_size if } \text{proj\_size}>0 \text{ otherwise hidden\_size} \\
            \end{aligned}

    Outputs: output, (h_n, c_n)
        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
          `(h_t)` from the last layer of the LSTM, for each `t`. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
          will also be a packed sequence. When ``bidirectional=True``, `output` will contain
          a concatenation of the forward and reverse hidden states at each time step in the sequence.
        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the
          final hidden state for each element in the sequence. When ``bidirectional=True``,
          `h_n` will contain a concatenation of the final forward and reverse hidden states, respectively.
        * **c_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{cell})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{cell})` containing the
          final cell state for each element in the sequence. When ``bidirectional=True``,
          `c_n` will contain a concatenation of the final forward and reverse cell states, respectively.

    Attributes:
        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
            `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size, input_size)` for `k = 0`.
            Otherwise, the shape is `(4*hidden_size, num_directions * hidden_size)`. If
            ``proj_size > 0`` was specified, the shape will be
            `(4*hidden_size, num_directions * proj_size)` for `k > 0`
        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
            `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)`. If ``proj_size > 0``
            was specified, the shape will be `(4*hidden_size, proj_size)`.
        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
            `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)`
        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
            `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)`
        weight_hr_l[k] : the learnable projection weights of the :math:`\text{k}^{th}` layer
            of shape `(proj_size, hidden_size)`. Only present when ``proj_size > 0`` was
            specified.
        weight_ih_l[k]_reverse: Analogous to `weight_ih_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        weight_hh_l[k]_reverse:  Analogous to `weight_hh_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        bias_ih_l[k]_reverse:  Analogous to `bias_ih_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        bias_hh_l[k]_reverse:  Analogous to `bias_hh_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        weight_hr_l[k]_reverse:  Analogous to `weight_hr_l[k]` for the reverse direction.
            Only present when ``bidirectional=True`` and ``proj_size > 0`` was specified.

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    .. note::
        For bidirectional LSTMs, forward and backward are directions 0 and 1 respectively.
        Example of splitting the output layers when ``batch_first=False``:
        ``output.view(seq_len, batch, num_directions, hidden_size)``.

    .. note::
        For bidirectional LSTMs, `h_n` is not equivalent to the last element of `output`; the
        former contains the final forward and reverse hidden states, while the latter contains the
        final forward hidden state and the initial reverse hidden state.

    .. note::
        ``batch_first`` argument is ignored for unbatched inputs.

    .. note::
        ``proj_size`` should be smaller than ``hidden_size``.

    .. include:: ../cudnn_rnn_determinism.rst

    .. include:: ../cudnn_persistent_rnn.rst

    Examples::

        >>> rnn = nn.LSTM(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> c0 = torch.randn(2, 3, 20)
        >>> output, (hn, cn) = rnn(input, (h0, c0))
    r
   TFr-   r   Nr$   r%   r&   r'   r(   r)   r*   r+   r   c                 C   r   r   r   )r`   r$   r%   r&   r'   r(   r)   r*   r+   r/   r0   r   r   r   rG     r   zLSTM.__init__c                 O   r   r   r   r   r   r   r   rG     r   c                    s   t  jdg|R i | d S )Nr   rF   rG   r   rp   r   r   rG     s   r   r   c                 C   sT   |d urt |d }n| jr|dn|d}| jrdnd}| j| || jf}|S r   )rQ   r(   r   r*   r&   r%   r   r   r   r   get_expected_cell_size  s   zLSTM.get_expected_cell_sizer   c                 C   sD   |  || | |d | ||d | |d | ||d d S )Nr   z"Expected hidden[0] size {}, got {}r
   z"Expected hidden[1] size {}, got {})r   r   r   r  )r`   r   r   r   r   r   r   r     s   

zLSTM.check_forward_argsr   r   c                 C   s(   |d u r|S t |d |t |d |fS )Nr   r
   r!   r   r   r   r   r     s
   zLSTM.permute_hiddenc                 C   r   r   r   r   r   r   r   r     r   zLSTM.forwardc                 C   r   r   r   r   r   r   r   r     r   c                 C   s  |    |}d }| jrdnd}| jdkr| jn| j}t|trX|\}}}}|d }	|d u rQtj| j| |	||j	|j
d}
tj| j| |	| j|j	|j
d}|
|f}n| ||}n| dvrhtd|  d| dk}| jrsdnd}|s|||}| jr|dn|d}	d }d }|d u rtj| j| |	||j	|j
d}
tj| j| |	| j|j	|j
d}|
|f}| ||| ni|r|d  dks|d  dkrd	|d   d
|d   d}t|n4|d  dks|d  dkr
d|d   d
|d   d}t||d d|d df}| ||| | ||}|d u r?t||| j| j| j| j| j| j| j	}nt|||| j| j| j| j| j| j	}|d }|dd  }t|trqt||||}|| ||fS |s||}|d d|d df}|| ||fS )Nr1   r
   r   r   r   z)LSTM: Expected input to be 2D or 3D, got 	D insteadr3   z=For batched 3-D input, hx and cx should also be 3-D but got (z-D, z-D) tensorsz?For unbatched 2-D input, hx and cx should also be 2-D but got ()r   r*   r+   r%   rJ   r	   rX   r   r&   r0   r/   r   r   rN   r(   r   r   r   r   r   lstmr   r'   r)   r   r   )r`   r   r   r   r   rb   re   r   r   r   h_zerosc_zerosr   r   r   r   r  r   r  r   r   r   r     s   


 

"



r   r   )rT   r   r   r   r   rQ   rM   rH   rG   r   r   r   r  r   r   rX   r  r  r   r	   r   r   r   rp   r   r     s     !	






r   c                       s   e Zd ZdZe							ddeded	ed
ededededdfddZedd Z fddZee	j
j	ddedee deeef fddZee	j
j	ddedee deeef fddZdddZ  ZS )r   a  __init__(input_size,hidden_size,num_layers=1,bias=True,batch_first=False,dropout=0.0,bidirectional=False,device=None,dtype=None)

    Apply a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
    For each element in the input sequence, each layer computes the following
    function:

    .. math::
        \begin{array}{ll}
            r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
            z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
            n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) \\
            h_t = (1 - z_t) \odot n_t + z_t \odot h_{(t-1)}
        \end{array}

    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input
    at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer
    at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`,
    :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively.
    :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.

    In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
    (:math:`l \ge 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
    variable which is :math:`0` with probability :attr:`dropout`.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
            would mean stacking two GRUs together to form a `stacked GRU`,
            with the second GRU taking in outputs of the first GRU and
            computing the final results. Default: 1
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        batch_first: If ``True``, then the input and output tensors are provided
            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
            Note that this does not apply to hidden or cell states. See the
            Inputs/Outputs sections below for details.  Default: ``False``
        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
            GRU layer except the last layer, with dropout probability equal to
            :attr:`dropout`. Default: 0
        bidirectional: If ``True``, becomes a bidirectional GRU. Default: ``False``

    Inputs: input, h_0
        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
          :math:`(L, N, H_{in})` when ``batch_first=False`` or
          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
          the input sequence.  The input can also be a packed variable length sequence.
          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
          :func:`torch.nn.utils.rnn.pack_sequence` for details.
        * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or
          :math:`(D * \text{num\_layers}, N, H_{out})`
          containing the initial hidden state for the input sequence. Defaults to zeros if not provided.

        where:

        .. math::
            \begin{aligned}
                N ={} & \text{batch size} \\
                L ={} & \text{sequence length} \\
                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
                H_{in} ={} & \text{input\_size} \\
                H_{out} ={} & \text{hidden\_size}
            \end{aligned}

    Outputs: output, h_n
        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
          `(h_t)` from the last layer of the GRU, for each `t`. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
          will also be a packed sequence.
        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state
          for the input sequence.

    Attributes:
        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
            (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`.
            Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)`
        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
            (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)`
        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
            (b_ir|b_iz|b_in), of shape `(3*hidden_size)`
        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
            (b_hr|b_hz|b_hn), of shape `(3*hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    .. note::
        For bidirectional GRUs, forward and backward are directions 0 and 1 respectively.
        Example of splitting the output layers when ``batch_first=False``:
        ``output.view(seq_len, batch, num_directions, hidden_size)``.

    .. note::
        ``batch_first`` argument is ignored for unbatched inputs.

    .. note::
        The calculation of new gate :math:`n_t` subtly differs from the original paper and other frameworks.
        In the original implementation, the Hadamard product :math:`(\odot)` between :math:`r_t` and the
        previous hidden state :math:`h_{(t-1)}` is done before the multiplication with the weight matrix
        `W` and addition of bias:

        .. math::
            \begin{aligned}
                n_t = \tanh(W_{in} x_t + b_{in} + W_{hn} ( r_t \odot h_{(t-1)} ) + b_{hn})
            \end{aligned}

        This is in contrast to PyTorch implementation, which is done after :math:`W_{hn} h_{(t-1)}`

        .. math::
            \begin{aligned}
                n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn}))
            \end{aligned}

        This implementation differs on purpose for efficiency.

    .. include:: ../cudnn_persistent_rnn.rst

    Examples::

        >>> rnn = nn.GRU(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> output, hn = rnn(input, h0)
    r
   TFr-   Nr$   r%   r&   r'   r(   r)   r*   r   c
           
      C   r   r   r   )
r`   r$   r%   r&   r'   r(   r)   r*   r/   r0   r   r   r   rG     s   zGRU.__init__c                 O   r   r   r   r   r   r   r   rG     r   c                    s.   d|v rt dt jdg|R i | d S )Nr+   r   r   )rN   rF   rG   r   rp   r   r   rG     s
   r   r   c                 C   r   r   r   r   r   r   r   r   &  r   zGRU.forwardc                 C   r   r   r   r   r   r   r   r   -  r   c                 C   sH  |    |}t|tr8|\}}}}|d }|d u r1| jrdnd}tj| j| || j|j|j	d}n| 
||}nd }| dvrJtd|  d| dk}	| jrUdnd}
|	sx||
}|d urw| dkrrtd	|  d
|d}n|d ur| dkrtd|  d
| jr|dn|d}d }d }|d u r| jrdnd}tj| j| || j|j|j	d}n| 
||}| ||| |d u rt||| j| j| j| j| j| j| j	}nt|||| j| j| j| j| j| j	}|d }|d }t|trt||||}|| 
||fS |	s||
}|d}|| 
||fS )Nr   r1   r
   r   r   z(GRU: Expected input to be 2D or 3D, got r  r3   r   r   r   )r   rJ   r	   r*   rX   r   r&   r%   r0   r/   r   r   rN   r(   r   r   r   r   r   grur   r'   r)   r   r   )r`   r   r   r   r   r   r   r   rb   r   r   r   r  r   r  r   r   r   r   4  s   






)r
   TFr-   FNNr   )rT   r   r   r   r   rQ   rM   rH   rG   rX   r  r  r   r   r   r   r	   r   r   r   rp   r   r     sh     


r   c                       s   e Zd ZU g dZeed< eed< eed< eed< eed< 		ddedededed	df
 fd
dZd	e	fddZ
dddZ  ZS )r   )r$   r%   r'   r$   r%   r'   	weight_ih	weight_hhN
num_chunksr   c                    s   ||d}t    || _|| _|| _ttj|| |ffi || _ttj|| |ffi || _	|rRttj|| fi || _
ttj|| fi || _n| dd  | dd  |   d S )Nr.   bias_ihbias_hh)rF   rG   r$   r%   r'   r   rX   rY   r  r  r  r  register_parameterr_   )r`   r$   r%   r'   r  r/   r0   ra   rp   r   r   rG     s*   
	
zRNNCellBase.__init__c                 C   sN   d}d| j v r| jdur|d7 }d| j v r| jdkr|d7 }|jdi | j S )	Nr   r'   Tr   r   r   z, nonlinearity={nonlinearity}r   )r   r'   r   r=   r   r   r   r   r     s   zRNNCellBase.extra_reprc                 C   r   r   r   r   r   r   r   r_     r   zRNNCellBase.reset_parameters)NNr   )rT   r   r   r   rQ   r   rM   r   rG   r   r   r_   r   r   r   rp   r   r     s,   
 
!r   c                       sn   e Zd ZU dZg dZeed< 				ddeded	eded
df
 fddZ	dde
dee
 d
e
fddZ  ZS )r   ar  An Elman RNN cell with tanh or ReLU non-linearity.

    .. math::

        h' = \tanh(W_{ih} x + b_{ih}  +  W_{hh} h + b_{hh})

    If :attr:`nonlinearity` is `'relu'`, then ReLU is used in place of tanh.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``

    Inputs: input, hidden
        - **input**: tensor containing input features
        - **hidden**: tensor containing the initial hidden state
          Defaults to zero if not provided.

    Outputs: h'
        - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state
          for each element in the batch

    Shape:
        - input: :math:`(N, H_{in})` or :math:`(H_{in})` tensor containing input features where
          :math:`H_{in}` = `input_size`.
        - hidden: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the initial hidden
          state where :math:`H_{out}` = `hidden_size`. Defaults to zero if not provided.
        - output: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the next hidden state.

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(hidden_size, input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(hidden_size, hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    Examples::

        >>> rnn = nn.RNNCell(10, 20)
        >>> input = torch.randn(6, 3, 10)
        >>> hx = torch.randn(3, 20)
        >>> output = []
        >>> for i in range(6):
        ...     hx = rnn(input[i], hx)
        ...     output.append(hx)
    )r$   r%   r'   r   r   Tr   Nr$   r%   r'   r   c                    s0   ||d}t  j|||fddi| || _d S )Nr.   r  r
   )rF   rG   r   )r`   r$   r%   r'   r   r/   r0   ra   rp   r   r   rG     s   
	
zRNNCell.__init__r   r   c                 C   s  |  dvrtd|   d|d ur$|  dvr$td|   d|  dk}|s1|d}|d u rEtj|d| j|j|jd}n	|sL|dn|}| j	dkrbt
||| j| j| j| j}n| j	d	krvt
||| j| j| j| j}n
|}td
| j	 |s|d}|S )Nr
   r1   z,RNNCell: Expected input to be 1D or 2D, got r  z-RNNCell: Expected hidden to be 1D or 2D, got r1   r   r   r   r   zUnknown nonlinearity: )r   rN   r   rX   r   r   r%   r0   r/   r   r   rnn_tanh_cellr  r  r  r  rnn_relu_cellr   r   r`   r   r   r   r   r   r   r   r     sN   


	
zRNNCell.forward)Tr   NNr   )rT   r   r   r   r   r   r   rQ   rM   rG   r   r   r   r   r   r   rp   r   r     s(   
 6$r   c                	       sh   e Zd ZdZ			ddedededdf fdd	Z	dd
edee	eef  de	eef fddZ
  ZS )r   a2
  A long short-term memory (LSTM) cell.

    .. math::

        \begin{array}{ll}
        i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
        f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
        g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\
        o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
        c' = f \odot c + i \odot g \\
        h' = o \odot \tanh(c') \\
        \end{array}

    where :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If ``False``, then the layer does not use bias weights `b_ih` and
            `b_hh`. Default: ``True``

    Inputs: input, (h_0, c_0)
        - **input** of shape `(batch, input_size)` or `(input_size)`: tensor containing input features
        - **h_0** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the initial hidden state
        - **c_0** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the initial cell state

          If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero.

    Outputs: (h_1, c_1)
        - **h_1** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the next hidden state
        - **c_1** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the next cell state

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(4*hidden_size, input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(4*hidden_size, hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Examples::

        >>> rnn = nn.LSTMCell(10, 20)  # (input_size, hidden_size)
        >>> input = torch.randn(2, 3, 10)  # (time_steps, batch, input_size)
        >>> hx = torch.randn(3, 20)  # (batch, hidden_size)
        >>> cx = torch.randn(3, 20)
        >>> output = []
        >>> for i in range(input.size()[0]):
        ...     hx, cx = rnn(input[i], (hx, cx))
        ...     output.append(hx)
        >>> output = torch.stack(output, dim=0)
    TNr$   r%   r'   r   c                    *   ||d}t  j|||fddi| d S )Nr.   r  r2   r  r`   r$   r%   r'   r/   r0   ra   rp   r   r   rG        
 zLSTMCell.__init__r   r   c                 C   s  |  dvrtd|   d|d ur0t|D ]\}}|  dvr/td| d|   dq|  dk}|s=|d}|d u rUtj|d| j|j|j	d}||f}n|se|d d|d	 dfn|}t
||| j| j| j| j}|s|d d|d	 df}|S )
Nr  z-LSTMCell: Expected input to be 1D or 2D, got r  zLSTMCell: Expected hx[z] to be 1D or 2D, got r1   r   r   r
   )r   rN   	enumerater   rX   r   r   r%   r0   r/   r   	lstm_cellr  r  r  r  r   )r`   r   r   r   r   r   r   r   r   r   r   r     s>   

$	zLSTMCell.forwardTNNr   )rT   r   r   r   rQ   rM   rG   r   r   r   r   r   r   r   rp   r   r   J  s,    ?
r   c                	       sV   e Zd ZdZ			ddedededdf fdd	Zdd
edee defddZ	  Z
S )r   az	  A gated recurrent unit (GRU) cell.

    .. math::

        \begin{array}{ll}
        r = \sigma(W_{ir} x + b_{ir} + W_{hr} h + b_{hr}) \\
        z = \sigma(W_{iz} x + b_{iz} + W_{hz} h + b_{hz}) \\
        n = \tanh(W_{in} x + b_{in} + r \odot (W_{hn} h + b_{hn})) \\
        h' = (1 - z) \odot n + z \odot h
        \end{array}

    where :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If ``False``, then the layer does not use bias weights `b_ih` and
            `b_hh`. Default: ``True``

    Inputs: input, hidden
        - **input** : tensor containing input features
        - **hidden** : tensor containing the initial hidden
          state for each element in the batch.
          Defaults to zero if not provided.

    Outputs: h'
        - **h'** : tensor containing the next hidden state
          for each element in the batch

    Shape:
        - input: :math:`(N, H_{in})` or :math:`(H_{in})` tensor containing input features where
          :math:`H_{in}` = `input_size`.
        - hidden: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the initial hidden
          state where :math:`H_{out}` = `hidden_size`. Defaults to zero if not provided.
        - output: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the next hidden state.

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(3*hidden_size, input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(3*hidden_size, hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(3*hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(3*hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Examples::

        >>> rnn = nn.GRUCell(10, 20)
        >>> input = torch.randn(6, 3, 10)
        >>> hx = torch.randn(3, 20)
        >>> output = []
        >>> for i in range(6):
        ...     hx = rnn(input[i], hx)
        ...     output.append(hx)
    TNr$   r%   r'   r   c                    r  )Nr.   r  r3   r  r  rp   r   r   rG     r  zGRUCell.__init__r   r   c                 C   s   |  dvrtd|   d|d ur$|  dvr$td|   d|  dk}|s1|d}|d u rEtj|d| j|j|jd}n	|sL|dn|}t	
||| j| j| j| j}|sc|d}|S )Nr  z,GRUCell: Expected input to be 1D or 2D, got r  z-GRUCell: Expected hidden to be 1D or 2D, got r1   r   r   )r   rN   r   rX   r   r   r%   r0   r/   r   gru_cellr  r  r  r  r   r  r   r   r   r     s6   
	
zGRUCell.forwardr  r   )rT   r   r   r   rQ   rM   rG   r   r   r   r   r   r   rp   r   r     s    A$r   )r
   )$r   rK   rO   r|   typingr   r   typing_extensionsr   rX   r   r   torch.nnr   torch.nn.parameterr   torch.nn.utils.rnnr	   moduler   __all__r   r   
_rnn_implsrQ   r   FutureWarningr"   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sN      (  H  q  :wn