diff --git a/AdaptiveAvgPool2d.patch b/AdaptiveAvgPool2d.patch
deleted file mode 100644
index e7dc4ac..0000000
--- a/AdaptiveAvgPool2d.patch
+++ /dev/null
@@ -1,29 +0,0 @@
---- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/pooling.py
-+++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/pooling.py
-@@ -6,7 +6,7 @@
- 
-     Args:
-         output_size: the target output size of the image of the form H x W.
--            Can be a tuple (H, W) or a single H for a square image H x H.
-+            Can be a tuple (H, W) or a single H for a square image H x H
-             H and W can be either a ``int``, or ``None`` which means the size will
-             be the same as that of the input.
- 
-@@ -20,14 +20,13 @@
-         >>> input = torch.randn(1, 64, 10, 9)
-         >>> output = m(input)
-         >>> # target output size of 10x7
--        >>> m = nn.AdaptiveAvgPool2d((None, 7))
-+        >>> m = nn.AdaptiveMaxPool2d((None, 7))
-         >>> input = torch.randn(1, 64, 10, 9)
-         >>> output = m(input)
- 
-     """
- 
--    output_size: _size_2_t
--
--    def forward(self, input: Tensor) -> Tensor:
-+    @weak_script_method
-+    def forward(self, input):
-         return F.adaptive_avg_pool2d(input, self.output_size)
- 
\ No newline at end of file
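Both target-size forms documented in the hunk above are easy to sanity-check. A minimal doctest-style sketch (not part of any patch; assumes only a standard torch install):

    >>> import torch
    >>> import torch.nn as nn
    >>> m = nn.AdaptiveAvgPool2d((5, 7))             # explicit (H, W) target
    >>> m(torch.randn(1, 64, 8, 9)).shape
    torch.Size([1, 64, 5, 7])
    >>> m = nn.AdaptiveAvgPool2d((None, 7))          # None keeps the input height
    >>> m(torch.randn(1, 64, 10, 9)).shape
    torch.Size([1, 64, 10, 7])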
diff --git a/BatchNorm1d.patch b/BatchNorm1d.patch
deleted file mode 100644
index f16cb73..0000000
--- a/BatchNorm1d.patch
+++ /dev/null
@@ -1,59 +0,0 @@
---- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/batchnorm.py
-+++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/batchnorm.py
-@@ -1,8 +1,7 @@
- class BatchNorm1d(_BatchNorm):
-     r"""Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D
-     inputs with optional additional channel dimension) as described in the paper
--    `Batch Normalization: Accelerating Deep Network Training by Reducing
--    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
-+    `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift`_ .
- 
-     .. math::
- 
-@@ -10,9 +9,8 @@
- 
-     The mean and standard-deviation are calculated per-dimension over
-     the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
--    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
--    to 1 and the elements of :math:`\beta` are set to 0. The standard-deviation is calculated
--    via the biased estimator, equivalent to `torch.var(input, unbiased=False)`.
-+    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are sampled
-+    from :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
- 
-     Also by default, during training this layer keeps running estimates of its
-     computed mean and variance, which are then used for normalization during
-@@ -27,7 +25,7 @@
-         This :attr:`momentum` argument is different from one used in optimizer
-         classes and the conventional notion of momentum. Mathematically, the
-         update rule for running statistics here is
--        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
-+        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momemtum} \times x_t`,
-         where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
-         new observed value.
- 
-@@ -46,10 +44,8 @@
-             learnable affine parameters. Default: ``True``
-         track_running_stats: a boolean value that when set to ``True``, this
-             module tracks the running mean and variance, and when set to ``False``,
--            this module does not track such statistics, and initializes statistics
--            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
--            When these buffers are ``None``, this module always uses batch statistics.
--            in both training and eval modes. Default: ``True``
-+            this module does not track such statistics and always uses batch
-+            statistics in both training and eval modes. Default: ``True``
- 
-     Shape:
-         - Input: :math:`(N, C)` or :math:`(N, C, L)`
-@@ -63,8 +59,12 @@
-         >>> m = nn.BatchNorm1d(100, affine=False)
-         >>> input = torch.randn(20, 100)
-         >>> output = m(input)
-+
-+    .. _`Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift`:
-+        https://arxiv.org/abs/1502.03167
-     """
- 
-+    @weak_script_method
-     def _check_input_dim(self, input):
-         if input.dim() != 2 and input.dim() != 3:
-             raise ValueError('expected 2D or 3D input (got {}D input)'
\ No newline at end of file
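The running-statistics update rule quoted in the momentum hunk can be checked numerically: starting from a fresh running_mean of zero, one training-mode forward pass should leave exactly momentum * batch_mean behind. A doctest-style sketch (not part of the patch):

    >>> import torch
    >>> import torch.nn as nn
    >>> bn = nn.BatchNorm1d(3, momentum=0.1)         # running_mean starts at 0
    >>> x = torch.randn(20, 3)
    >>> _ = bn(x)                                    # one update of the running stats
    >>> torch.allclose(bn.running_mean, 0.1 * x.mean(dim=0))
    True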
diff --git a/BatchNorm2d.patch b/BatchNorm2d.patch
deleted file mode 100644
index c280325..0000000
--- a/BatchNorm2d.patch
+++ /dev/null
@@ -1,59 +0,0 @@
---- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/batchnorm.py
-+++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/batchnorm.py
-@@ -1,8 +1,7 @@
- class BatchNorm2d(_BatchNorm):
-     r"""Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs
-     with additional channel dimension) as described in the paper
--    `Batch Normalization: Accelerating Deep Network Training by Reducing
--    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
-+    `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift`_ .
- 
-     .. math::
- 
-@@ -10,9 +9,8 @@
- 
-     The mean and standard-deviation are calculated per-dimension over
-     the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
--    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
--    to 1 and the elements of :math:`\beta` are set to 0. The standard-deviation is calculated
--    via the biased estimator, equivalent to `torch.var(input, unbiased=False)`.
-+    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are sampled
-+    from :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
- 
-     Also by default, during training this layer keeps running estimates of its
-     computed mean and variance, which are then used for normalization during
-@@ -27,7 +25,7 @@
-         This :attr:`momentum` argument is different from one used in optimizer
-         classes and the conventional notion of momentum. Mathematically, the
-         update rule for running statistics here is
--        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
-+        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momemtum} \times x_t`,
-         where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
-         new observed value.
- 
-@@ -46,10 +44,8 @@
-             learnable affine parameters. Default: ``True``
-         track_running_stats: a boolean value that when set to ``True``, this
-             module tracks the running mean and variance, and when set to ``False``,
--            this module does not track such statistics, and initializes statistics
--            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
--            When these buffers are ``None``, this module always uses batch statistics.
--            in both training and eval modes. Default: ``True``
-+            this module does not track such statistics and always uses batch
-+            statistics in both training and eval modes. Default: ``True``
- 
-     Shape:
-         - Input: :math:`(N, C, H, W)`
-@@ -63,8 +59,12 @@
-         >>> m = nn.BatchNorm2d(100, affine=False)
-         >>> input = torch.randn(20, 100, 35, 45)
-         >>> output = m(input)
-+
-+    .. _`Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift`:
-+        https://arxiv.org/abs/1502.03167
-     """
- 
-+    @weak_script_method
-     def _check_input_dim(self, input):
-         if input.dim() != 4:
-             raise ValueError('expected 4D input (got {}D input)'
\ No newline at end of file
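The deleted sentence about the biased variance estimator describes behavior that can be confirmed directly; a sketch with affine=False so that only the normalization itself is exercised:

    >>> import torch
    >>> import torch.nn as nn
    >>> bn = nn.BatchNorm2d(2, affine=False)
    >>> x = torch.randn(8, 2, 4, 4)
    >>> mean = x.mean(dim=(0, 2, 3), keepdim=True)
    >>> var = x.var(dim=(0, 2, 3), unbiased=False, keepdim=True)   # biased estimator
    >>> torch.allclose(bn(x), (x - mean) / torch.sqrt(var + bn.eps))
    True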
diff --git a/Conv2d.patch b/Conv2d.patch
deleted file mode 100644
index a2228e4..0000000
--- a/Conv2d.patch
+++ /dev/null
@@ -1,140 +0,0 @@
---- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/conv.py
-+++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/conv.py
-@@ -15,8 +15,6 @@
-     :math:`N` is a batch size, :math:`C` denotes a number of channels,
-     :math:`H` is a height of input planes in pixels, and :math:`W` is
-     width in pixels.
--
--    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
- 
-     * :attr:`stride` controls the stride for the cross-correlation, a single
-       number or a tuple.
-@@ -39,7 +37,7 @@
-       concatenated.
-     * At groups= :attr:`in_channels`, each input channel is convolved with
-       its own set of filters, of size:
--      :math:`\left\lfloor\frac{out\_channels}{in\_channels}\right\rfloor`.
-+      :math:`\left\lfloor\frac{C_\text{out}}{C_\text{in}}\right\rfloor`.
- 
-     The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
- 
-@@ -47,14 +45,14 @@
-     - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
-       and the second `int` for the width dimension
- 
--    Note:
-+    .. note::
- 
-         Depending of the size of your kernel, several (of the last)
-         columns of the input might be lost, because it is a valid `cross-correlation`_,
-         and not a full `cross-correlation`_.
-         It is up to the user to add proper padding.
- 
--    Note:
-+    .. note::
- 
-         When `groups == in_channels` and `out_channels == K * in_channels`,
-         where `K` is a positive integer, this operation is also termed in
-@@ -64,29 +62,17 @@
-     a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments
-     :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`.
- 
--    Note:
--        In some circumstances when using the CUDA backend with CuDNN, this operator
--        may select a nondeterministic algorithm to increase performance. If this is
--        undesirable, you can try to make the operation deterministic (potentially at
--        a performance cost) by setting ``torch.backends.cudnn.deterministic =
--        True``.
--        Please see the notes on :doc:`/notes/randomness` for background.
--
-+    .. include:: cudnn_deterministic.rst
- 
-     Args:
-         in_channels (int): Number of channels in the input image
-         out_channels (int): Number of channels produced by the convolution
-         kernel_size (int or tuple): Size of the convolving kernel
-         stride (int or tuple, optional): Stride of the convolution. Default: 1
--        padding (int or tuple, optional): Zero-padding added to both sides of
--            the input. Default: 0
--        padding_mode (string, optional): ``'zeros'``, ``'reflect'``,
--            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
-+        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
-         dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
--        groups (int, optional): Number of blocked connections from input
--            channels to output channels. Default: 1
--        bias (bool, optional): If ``True``, adds a learnable bias to the
--            output. Default: ``True``
-+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
-+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
- 
-     Shape:
-         - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
-@@ -102,18 +88,16 @@
- 
-     Attributes:
-         weight (Tensor): the learnable weights of the module of shape
--            :math:`(\text{out\_channels}, \frac{\text{in\_channels}}{\text{groups}},`
--            :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]})`.
--            The values of these weights are sampled from
--            :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
--            :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
--        bias (Tensor): the learnable bias of the module of shape
--            (out_channels). If :attr:`bias` is ``True``,
--            then the values of these weights are
--            sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
--            :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
-+            (out_channels, in_channels, kernel_size[0], kernel_size[1]).
-+            The values of these weights are sampled from
-+            :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
-+            :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
-+        bias (Tensor): the learnable bias of the module of shape (out_channels). If :attr:`bias` is ``True``,
-+            then the values of these weights are
-+            sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
-+            :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
- 
--    Examples:
-+    Examples::
- 
-         >>> # With square kernels and equal stride
-         >>> m = nn.Conv2d(16, 33, 3, stride=2)
-@@ -130,34 +114,18 @@
-     .. _link:
-         https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
-     """
--    def __init__(
--        self,
--        in_channels: int,
--        out_channels: int,
--        kernel_size: _size_2_t,
--        stride: _size_2_t = 1,
--        padding: _size_2_t = 0,
--        dilation: _size_2_t = 1,
--        groups: int = 1,
--        bias: bool = True,
--        padding_mode: str = 'zeros'  # TODO: refine this type
--    ):
-+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
-+                 padding=0, dilation=1, groups=1, bias=True):
-         kernel_size = _pair(kernel_size)
-         stride = _pair(stride)
-         padding = _pair(padding)
-         dilation = _pair(dilation)
-         super(Conv2d, self).__init__(
-             in_channels, out_channels, kernel_size, stride, padding, dilation,
--            False, _pair(0), groups, bias, padding_mode)
-+            False, _pair(0), groups, bias)
- 
--    def _conv_forward(self, input, weight):
--        if self.padding_mode != 'zeros':
--            return F.conv2d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
--                            weight, self.bias, self.stride,
--                            _pair(0), self.dilation, self.groups)
--        return F.conv2d(input, weight, self.bias, self.stride,
-+    @weak_script_method
-+    def forward(self, input):
-+        return F.conv2d(input, self.weight, self.bias, self.stride,
-                         self.padding, self.dilation, self.groups)
- 
--    def forward(self, input: Tensor) -> Tensor:
--        return self._conv_forward(input, self.weight)
--
\ No newline at end of file
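The docstring's non-square example agrees with the output-size formula in the Shape section, and the weight shape matches the Attributes entry on the old side of the hunk; a quick doctest-style check:

    >>> import torch
    >>> import torch.nn as nn
    >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
    >>> m(torch.randn(20, 16, 50, 100)).shape   # H_out = (50 + 2*4 - 3*(3-1) - 1)//2 + 1 = 26
    torch.Size([20, 33, 26, 100])
    >>> m.weight.shape                          # (out_channels, in_channels/groups, kH, kW)
    torch.Size([33, 16, 3, 5])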
diff --git a/DataParallel.patch b/DataParallel.patch
deleted file mode 100644
index 8fddc8e..0000000
--- a/DataParallel.patch
+++ /dev/null
@@ -1,97 +0,0 @@
---- /usr/local/lib/python3.5/dist-packages/torch/nn/parallel/data_parallel.py
-+++ /usr/local/lib/python3.5/dist-packages/torch/nn/parallel/data_parallel.py
-@@ -10,16 +10,13 @@
- 
-     The batch size should be larger than the number of GPUs used.
- 
--    .. warning::
--        It is recommended to use :class:`~torch.nn.parallel.DistributedDataParallel`,
--        instead of this class, to do multi-GPU training, even if there is only a single
--        node. See: :ref:`cuda-nn-ddp-instead` and :ref:`ddp`.
-+    See also: :ref:`cuda-nn-dataparallel-instead`
- 
-     Arbitrary positional and keyword inputs are allowed to be passed into
--    DataParallel but some types are specially handled. tensors will be
--    **scattered** on dim specified (default 0). tuple, list and dict types will
--    be shallow copied. The other types will be shared among different threads
--    and can be corrupted if written to in the model's forward pass.
-+    DataParallel EXCEPT Tensors. All tensors will be scattered on dim
-+    specified (default 0). Primitive types will be broadcasted, but all
-+    other types will be a shallow copy and can be corrupted if written to in
-+    the model's forward pass.
- 
-     The parallelized :attr:`module` must have its parameters and buffers on
-     ``device_ids[0]`` before running this :class:`~torch.nn.DataParallel`
-@@ -27,9 +24,9 @@
- 
-     .. warning::
-         In each forward, :attr:`module` is **replicated** on each device, so any
--        updates to the running module in ``forward`` will be lost. For example,
-+        updates to the runing module in ``forward`` will be lost. For example,
-         if :attr:`module` has a counter attribute that is incremented in each
--        ``forward``, it will always stay at the initial value because the update
-+        ``forward``, it will always stay at the initial value becasue the update
-         is done on the replicas which are destroyed after ``forward``. However,
-         :class:`~torch.nn.DataParallel` guarantees that the replica on
-         ``device[0]`` will have its parameters and buffers sharing storage with
-@@ -74,7 +71,7 @@
-     Example::
- 
-         >>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2])
--        >>> output = net(input_var)  # input_var can be on any device, including CPU
-+        >>> output = net(input_var)
-     """
- 
-     # TODO: update notes/cuda.rst when this class handles 8+ GPUs well
-@@ -82,15 +79,13 @@
-     def __init__(self, module, device_ids=None, output_device=None, dim=0):
-         super(DataParallel, self).__init__()
- 
--        device_type = _get_available_device_type()
--        if device_type is None:
-+        if not torch.cuda.is_available():
-             self.module = module
-             self.device_ids = []
-             return
- 
-         if device_ids is None:
--            device_ids = _get_all_device_indices()
--
-+            device_ids = list(range(torch.cuda.device_count()))
-         if output_device is None:
-             output_device = device_ids[0]
- 
-@@ -98,23 +93,15 @@
-         self.module = module
-         self.device_ids = list(map(lambda x: _get_device_index(x, True), device_ids))
-         self.output_device = _get_device_index(output_device, True)
--        self.src_device_obj = torch.device(device_type, self.device_ids[0])
- 
-         _check_balance(self.device_ids)
- 
-         if len(self.device_ids) == 1:
--            self.module.to(self.src_device_obj)
-+            self.module.cuda(device_ids[0])
- 
-     def forward(self, *inputs, **kwargs):
-         if not self.device_ids:
-             return self.module(*inputs, **kwargs)
--
--        for t in chain(self.module.parameters(), self.module.buffers()):
--            if t.device != self.src_device_obj:
--                raise RuntimeError("module must have its parameters and buffers "
--                                   "on device {} (device_ids[0]) but found one of "
--                                   "them on device: {}".format(self.src_device_obj, t.device))
--
-         inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
-         if len(self.device_ids) == 1:
-             return self.module(*inputs[0], **kwargs[0])
-@@ -123,7 +110,7 @@
-         return self.gather(outputs, self.output_device)
- 
-     def replicate(self, module, device_ids):
--        return replicate(module, device_ids, not torch.is_grad_enabled())
-+        return replicate(module, device_ids)
- 
-     def scatter(self, inputs, kwargs, device_ids):
-         return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
\ No newline at end of file
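Both versions of ``__init__`` shown above degrade to a plain wrapper when no GPU is visible: ``device_ids`` ends up empty and ``forward`` simply calls the wrapped module. A minimal sketch of that CPU-only fallback (toy model chosen purely for illustration):

    >>> import torch
    >>> import torch.nn as nn
    >>> net = nn.DataParallel(nn.Linear(10, 5))   # on a CPU-only host: no scatter/gather
    >>> net(torch.randn(20, 10)).shape
    torch.Size([20, 5])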
diff --git a/Dropout.patch b/Dropout.patch
deleted file mode 100644
index e4345bf..0000000
--- a/Dropout.patch
+++ /dev/null
@@ -1,22 +0,0 @@
---- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/dropout.py
-+++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/dropout.py
-@@ -18,8 +18,8 @@
-         inplace: If set to ``True``, will do this operation in-place. Default: ``False``
- 
-     Shape:
--        - Input: :math:`(*)`. Input can be of any shape
--        - Output: :math:`(*)`. Output is of the same shape as input
-+        - Input: `Any`. Input can be of any shape
-+        - Output: `Same`. Output is of the same shape as input
- 
-     Examples::
- 
-@@ -31,6 +31,7 @@
-         detectors: https://arxiv.org/abs/1207.0580
-     """
- 
--    def forward(self, input: Tensor) -> Tensor:
-+    @weak_script_method
-+    def forward(self, input):
-         return F.dropout(input, self.p, self.training, self.inplace)
- 
\ No newline at end of file
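The Shape contract above (output mirrors the input) and the role of ``self.training`` in the forward shown in the hunk can both be seen in a few lines; a doctest-style sketch:

    >>> import torch
    >>> import torch.nn as nn
    >>> m = nn.Dropout(p=0.2)
    >>> x = torch.randn(20, 16)
    >>> m(x).shape                       # same shape as the input
    torch.Size([20, 16])
    >>> m.eval()(x).equal(x)             # in eval mode dropout is the identity
    True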
diff --git a/Linear.patch b/Linear.patch
deleted file mode 100644
index ef25bbe..0000000
--- a/Linear.patch
+++ /dev/null
@@ -1,64 +0,0 @@
---- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/linear.py
-+++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/linear.py
-@@ -1,19 +1,17 @@
- class Linear(Module):
-     r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
--
--    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
- 
-     Args:
-         in_features: size of each input sample
-         out_features: size of each output sample
--        bias: If set to ``False``, the layer will not learn an additive bias.
-+        bias: If set to False, the layer will not learn an additive bias.
-             Default: ``True``
- 
-     Shape:
--        - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of
--          additional dimensions and :math:`H_{in} = \text{in\_features}`
--        - Output: :math:`(N, *, H_{out})` where all but the last dimension
--          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.
-+        - Input: :math:`(N, *, \text{in\_features})` where :math:`*` means any number of
-+          additional dimensions
-+        - Output: :math:`(N, *, \text{out\_features})` where all but the last dimension
-+          are the same shape as the input.
- 
-     Attributes:
-         weight: the learnable weights of the module of shape
-@@ -33,12 +31,9 @@
-         >>> print(output.size())
-         torch.Size([128, 30])
-     """
--    __constants__ = ['in_features', 'out_features']
--    in_features: int
--    out_features: int
--    weight: Tensor
-+    __constants__ = ['bias']
- 
--    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
-+    def __init__(self, in_features, out_features, bias=True):
-         super(Linear, self).__init__()
-         self.in_features = in_features
-         self.out_features = out_features
-@@ -49,17 +44,18 @@
-         self.register_parameter('bias', None)
-         self.reset_parameters()
- 
--    def reset_parameters(self) -> None:
-+    def reset_parameters(self):
-         init.kaiming_uniform_(self.weight, a=math.sqrt(5))
-         if self.bias is not None:
-             fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
-             bound = 1 / math.sqrt(fan_in)
-             init.uniform_(self.bias, -bound, bound)
- 
--    def forward(self, input: Tensor) -> Tensor:
-+    @weak_script_method
-+    def forward(self, input):
-         return F.linear(input, self.weight, self.bias)
- 
--    def extra_repr(self) -> str:
-+    def extra_repr(self):
-         return 'in_features={}, out_features={}, bias={}'.format(
-             self.in_features, self.out_features, self.bias is not None
-         )
\ No newline at end of file
diff --git a/MaxPool2d.patch b/MaxPool2d.patch
deleted file mode 100644
index 5a991b0..0000000
--- a/MaxPool2d.patch
+++ /dev/null
@@ -1,17 +0,0 @@
---- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/pooling.py
-+++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/pooling.py
-@@ -57,12 +57,8 @@
-         https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
-     """
- 
--    kernel_size: _size_2_t
--    stride: _size_2_t
--    padding: _size_2_t
--    dilation: _size_2_t
--
--    def forward(self, input: Tensor) -> Tensor:
-+    @weak_script_method
-+    def forward(self, input):
-         return F.max_pool2d(input, self.kernel_size, self.stride,
-                             self.padding, self.dilation, self.ceil_mode,
-                             self.return_indices)
\ No newline at end of file
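The two patches above document :math:`y = xA^T + b` and the pooled output shape; both are quick to confirm (the weight is stored as (out_features, in_features), hence the transpose in the product):

    >>> import torch
    >>> import torch.nn as nn
    >>> m = nn.Linear(20, 30)
    >>> x = torch.randn(128, 20)
    >>> torch.allclose(m(x), x @ m.weight.t() + m.bias, atol=1e-6)
    True
    >>> nn.MaxPool2d(2)(torch.randn(1, 3, 8, 8)).shape
    torch.Size([1, 3, 4, 4])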
diff --git a/PReLU.patch b/PReLU.patch
deleted file mode 100644
index d74cce1..0000000
--- a/PReLU.patch
+++ /dev/null
@@ -1,37 +0,0 @@
---- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/activation.py
-+++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/activation.py
-@@ -37,9 +37,10 @@
-         - Output: :math:`(N, *)`, same shape as the input
- 
-     Attributes:
--        weight (Tensor): the learnable weights of shape (:attr:`num_parameters`).
-+        weight (Tensor): the learnable weights of shape (attr:`num_parameters`).
-+            The attr:`dtype` is default to
- 
--    .. image:: ../scripts/activation_images/PReLU.png
-+    .. image:: scripts/activation_images/PReLU.png
- 
-     Examples::
- 
-@@ -47,17 +48,16 @@
-         >>> input = torch.randn(2)
-         >>> output = m(input)
-     """
--    __constants__ = ['num_parameters']
--    num_parameters: int
- 
--    def __init__(self, num_parameters: int = 1, init: float = 0.25) -> None:
-+    def __init__(self, num_parameters=1, init=0.25):
-         self.num_parameters = num_parameters
-         super(PReLU, self).__init__()
-         self.weight = Parameter(torch.Tensor(num_parameters).fill_(init))
- 
--    def forward(self, input: Tensor) -> Tensor:
-+    @weak_script_method
-+    def forward(self, input):
-         return F.prelu(input, self.weight)
- 
--    def extra_repr(self) -> str:
-+    def extra_repr(self):
-         return 'num_parameters={}'.format(self.num_parameters)
- 
\ No newline at end of file
diff --git a/Sequential.patch b/Sequential.patch
deleted file mode 100644
index 6c7f6ac..0000000
--- a/Sequential.patch
+++ /dev/null
@@ -1,70 +0,0 @@
---- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/container.py
-+++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/container.py
-@@ -22,15 +22,7 @@
-         ]))
-     """
- 
--    @overload
--    def __init__(self, *args: Module) -> None:
--        ...
--
--    @overload
--    def __init__(self, arg: 'OrderedDict[str, Module]') -> None:
--        ...
--
--    def __init__(self, *args: Any):
-+    def __init__(self, *args):
-         super(Sequential, self).__init__()
-         if len(args) == 1 and isinstance(args[0], OrderedDict):
-             for key, module in args[0].items():
-@@ -48,18 +40,17 @@
-             idx %= size
-         return next(islice(iterator, idx, None))
- 
--    @_copy_to_script_wrapper
--    def __getitem__(self: T, idx) -> T:
-+    def __getitem__(self, idx):
-         if isinstance(idx, slice):
-             return self.__class__(OrderedDict(list(self._modules.items())[idx]))
-         else:
-             return self._get_item_by_idx(self._modules.values(), idx)
- 
--    def __setitem__(self, idx: int, module: Module) -> None:
-+    def __setitem__(self, idx, module):
-         key = self._get_item_by_idx(self._modules.keys(), idx)
-         return setattr(self, key, module)
- 
--    def __delitem__(self, idx: Union[slice, int]) -> None:
-+    def __delitem__(self, idx):
-         if isinstance(idx, slice):
-             for key in list(self._modules.keys())[idx]:
-                 delattr(self, key)
-@@ -67,26 +58,16 @@
-             key = self._get_item_by_idx(self._modules.keys(), idx)
-             delattr(self, key)
- 
--    @_copy_to_script_wrapper
--    def __len__(self) -> int:
-+    def __len__(self):
-         return len(self._modules)
- 
--    @_copy_to_script_wrapper
-     def __dir__(self):
-         keys = super(Sequential, self).__dir__()
-         keys = [key for key in keys if not key.isdigit()]
-         return keys
- 
--    @_copy_to_script_wrapper
--    def __iter__(self) -> Iterator[Module]:
--        return iter(self._modules.values())
--
--    # NB: We can't really type check this function as the type of input
--    # may change dynamically (as is tested in
--    # TestScript.test_sequential_intermediary_types). Cannot annotate
--    # with Any as TorchScript expects a more precise type
-     def forward(self, input):
--        for module in self:
-+        for module in self._modules.values():
-             input = module(input)
-         return input
- 
\ No newline at end of file
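The container behavior patched above (``__getitem__`` and the iteration inside ``forward``) composes directly with PReLU from the previous patch; a small end-to-end sketch (layer sizes are arbitrary):

    >>> import torch
    >>> import torch.nn as nn
    >>> model = nn.Sequential(nn.Linear(10, 4), nn.PReLU(), nn.Linear(4, 2))
    >>> model[1]                         # indexing resolves to the registered submodule
    PReLU(num_parameters=1)
    >>> model(torch.randn(3, 10)).shape
    torch.Size([3, 2])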
diff --git a/Sigmoid.patch b/Sigmoid.patch
deleted file mode 100644
index 9ad9766..0000000
--- a/Sigmoid.patch
+++ /dev/null
@@ -1,29 +0,0 @@
---- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/activation.py
-+++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/activation.py
-@@ -2,7 +2,7 @@
-     r"""Applies the element-wise function:
- 
-     .. math::
--        \text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)}
-+        \text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}
- 
- 
-     Shape:
-@@ -10,7 +10,7 @@
-         dimensions
-         - Output: :math:`(N, *)`, same shape as the input
- 
--    .. image:: ../scripts/activation_images/Sigmoid.png
-+    .. image:: scripts/activation_images/Sigmoid.png
- 
-     Examples::
- 
-@@ -19,6 +19,7 @@
-         >>> output = m(input)
-     """
- 
--    def forward(self, input: Tensor) -> Tensor:
-+    @weak_script_method
-+    def forward(self, input):
-         return torch.sigmoid(input)
- 
\ No newline at end of file
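Both sides of the first hunk define the same function (the :math:`\sigma(x)` on the old side is just alternative notation), and the formula checks out numerically:

    >>> import torch
    >>> import torch.nn as nn
    >>> x = torch.randn(2)
    >>> torch.allclose(nn.Sigmoid()(x), 1 / (1 + torch.exp(-x)))
    True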