diff --git a/.gitignore b/.gitignore
index b6e4761..c38a594 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,7 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+checkpoints/
+*.tar
+*.zip
diff --git a/AdaptiveAvgPool2d.patch b/AdaptiveAvgPool2d.patch
new file mode 100644
index 0000000..e7dc4ac
--- /dev/null
+++ b/AdaptiveAvgPool2d.patch
@@ -0,0 +1,29 @@
+--- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/pooling.py
++++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/pooling.py
+@@ -6,7 +6,7 @@
+ 
+     Args:
+         output_size: the target output size of the image of the form H x W.
+-                     Can be a tuple (H, W) or a single H for a square image H x H.
++                     Can be a tuple (H, W) or a single H for a square image H x H
+                      H and W can be either a ``int``, or ``None`` which means the size will
+                      be the same as that of the input.
+ 
+@@ -20,14 +20,13 @@
+         >>> input = torch.randn(1, 64, 10, 9)
+         >>> output = m(input)
+         >>> # target output size of 10x7
+-        >>> m = nn.AdaptiveAvgPool2d((None, 7))
++        >>> m = nn.AdaptiveMaxPool2d((None, 7))
+         >>> input = torch.randn(1, 64, 10, 9)
+         >>> output = m(input)
+ 
+     """
+ 
+-    output_size: _size_2_t
+-
+-    def forward(self, input: Tensor) -> Tensor:
++    @weak_script_method
++    def forward(self, input):
+         return F.adaptive_avg_pool2d(input, self.output_size)
+ 
\ No newline at end of file
diff --git a/BatchNorm1d.patch b/BatchNorm1d.patch
new file mode 100644
index 0000000..f16cb73
--- /dev/null
+++ b/BatchNorm1d.patch
@@ -0,0 +1,59 @@
+--- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/batchnorm.py
++++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/batchnorm.py
+@@ -1,8 +1,7 @@
+ class BatchNorm1d(_BatchNorm):
+     r"""Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D
+     inputs with optional additional channel dimension) as described in the paper
+-    `Batch Normalization: Accelerating Deep Network Training by Reducing
+-    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
++    `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift`_ .
+ 
+     .. math::
+ 
+@@ -10,9 +9,8 @@
+ 
+     The mean and standard-deviation are calculated per-dimension over
+     the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
+-    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
+-    to 1 and the elements of :math:`\beta` are set to 0. The standard-deviation is calculated
+-    via the biased estimator, equivalent to `torch.var(input, unbiased=False)`.
++    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are sampled
++    from :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
+ 
+     Also by default, during training this layer keeps running estimates of its
+     computed mean and variance, which are then used for normalization during
+@@ -27,7 +25,7 @@
+         This :attr:`momentum` argument is different from one used in optimizer
+         classes and the conventional notion of momentum. Mathematically, the
+         update rule for running statistics here is
+-        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
++        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momemtum} \times x_t`,
+         where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
+         new observed value.
+ 
+@@ -46,10 +44,8 @@
+             learnable affine parameters. Default: ``True``
+         track_running_stats: a boolean value that when set to ``True``, this
+             module tracks the running mean and variance, and when set to ``False``,
+-            this module does not track such statistics, and initializes statistics
+-            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+-            When these buffers are ``None``, this module always uses batch statistics.
+-            in both training and eval modes. Default: ``True``
++            this module does not track such statistics and always uses batch
++            statistics in both training and eval modes. Default: ``True``
+ 
+     Shape:
+         - Input: :math:`(N, C)` or :math:`(N, C, L)`
+@@ -63,8 +59,12 @@
+         >>> m = nn.BatchNorm1d(100, affine=False)
+         >>> input = torch.randn(20, 100)
+         >>> output = m(input)
++
++    .. _`Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift`:
++        https://arxiv.org/abs/1502.03167
+     """
+ 
++    @weak_script_method
+     def _check_input_dim(self, input):
+         if input.dim() != 2 and input.dim() != 3:
+             raise ValueError('expected 2D or 3D input (got {}D input)'
\ No newline at end of file
diff --git a/BatchNorm2d.patch b/BatchNorm2d.patch
new file mode 100644
index 0000000..c280325
--- /dev/null
+++ b/BatchNorm2d.patch
@@ -0,0 +1,59 @@
+--- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/batchnorm.py
++++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/batchnorm.py
+@@ -1,8 +1,7 @@
+ class BatchNorm2d(_BatchNorm):
+     r"""Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs
+     with additional channel dimension) as described in the paper
+-    `Batch Normalization: Accelerating Deep Network Training by Reducing
+-    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .
++    `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift`_ .
+ 
+     .. math::
+ 
+@@ -10,9 +9,8 @@
+ 
+     The mean and standard-deviation are calculated per-dimension over
+     the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
+-    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
+-    to 1 and the elements of :math:`\beta` are set to 0. The standard-deviation is calculated
+-    via the biased estimator, equivalent to `torch.var(input, unbiased=False)`.
++    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are sampled
++    from :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
+ 
+     Also by default, during training this layer keeps running estimates of its
+     computed mean and variance, which are then used for normalization during
+@@ -27,7 +25,7 @@
+         This :attr:`momentum` argument is different from one used in optimizer
+         classes and the conventional notion of momentum. Mathematically, the
+         update rule for running statistics here is
+-        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
++        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momemtum} \times x_t`,
+         where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
+         new observed value.
+ 
+@@ -46,10 +44,8 @@
+             learnable affine parameters. Default: ``True``
+         track_running_stats: a boolean value that when set to ``True``, this
+             module tracks the running mean and variance, and when set to ``False``,
+-            this module does not track such statistics, and initializes statistics
+-            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+-            When these buffers are ``None``, this module always uses batch statistics.
+-            in both training and eval modes. Default: ``True``
++            this module does not track such statistics and always uses batch
++            statistics in both training and eval modes. Default: ``True``
+ 
+     Shape:
+         - Input: :math:`(N, C, H, W)`
+@@ -63,8 +59,12 @@
+         >>> m = nn.BatchNorm2d(100, affine=False)
+         >>> input = torch.randn(20, 100, 35, 45)
+         >>> output = m(input)
++
++    .. _`Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift`:
++        https://arxiv.org/abs/1502.03167
+     """
+ 
++    @weak_script_method
+     def _check_input_dim(self, input):
+         if input.dim() != 4:
+             raise ValueError('expected 4D input (got {}D input)'
\ No newline at end of file
diff --git a/Conv2d.patch b/Conv2d.patch
new file mode 100644
index 0000000..a2228e4
--- /dev/null
+++ b/Conv2d.patch
@@ -0,0 +1,140 @@
+--- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/conv.py
++++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/conv.py
+@@ -15,8 +15,6 @@
+     :math:`N` is a batch size, :math:`C` denotes a number of channels,
+     :math:`H` is a height of input planes in pixels, and :math:`W` is
+     width in pixels.
+-
+-    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
+ 
+     * :attr:`stride` controls the stride for the cross-correlation, a single
+       number or a tuple.
+@@ -39,7 +37,7 @@
+           concatenated.
+         * At groups= :attr:`in_channels`, each input channel is convolved with
+           its own set of filters, of size:
+-          :math:`\left\lfloor\frac{out\_channels}{in\_channels}\right\rfloor`.
++          :math:`\left\lfloor\frac{C_\text{out}}{C_\text{in}}\right\rfloor`.
+ 
+     The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:
+ 
+@@ -47,14 +45,14 @@
+         - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
+           and the second `int` for the width dimension
+ 
+-    Note:
++    .. note::
+ 
+          Depending of the size of your kernel, several (of the last)
+          columns of the input might be lost, because it is a valid `cross-correlation`_,
+          and not a full `cross-correlation`_.
+          It is up to the user to add proper padding.
+ 
+-    Note:
++    .. note::
+ 
+         When `groups == in_channels` and `out_channels == K * in_channels`,
+         where `K` is a positive integer, this operation is also termed in
+@@ -64,29 +62,17 @@
+         a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments
+         :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`.
+ 
+-    Note:
+-        In some circumstances when using the CUDA backend with CuDNN, this operator
+-        may select a nondeterministic algorithm to increase performance. If this is
+-        undesirable, you can try to make the operation deterministic (potentially at
+-        a performance cost) by setting ``torch.backends.cudnn.deterministic =
+-        True``.
+-        Please see the notes on :doc:`/notes/randomness` for background.
+-
++    .. include:: cudnn_deterministic.rst
+ 
+     Args:
+         in_channels (int): Number of channels in the input image
+         out_channels (int): Number of channels produced by the convolution
+         kernel_size (int or tuple): Size of the convolving kernel
+         stride (int or tuple, optional): Stride of the convolution. Default: 1
+-        padding (int or tuple, optional): Zero-padding added to both sides of
+-            the input. Default: 0
+-        padding_mode (string, optional): ``'zeros'``, ``'reflect'``,
+-            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
++        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
+         dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+-        groups (int, optional): Number of blocked connections from input
+-            channels to output channels. Default: 1
+-        bias (bool, optional): If ``True``, adds a learnable bias to the
+-            output. Default: ``True``
++        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
++        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+ 
+     Shape:
+         - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
+@@ -102,18 +88,16 @@
+ 
+     Attributes:
+         weight (Tensor): the learnable weights of the module of shape
+-            :math:`(\text{out\_channels}, \frac{\text{in\_channels}}{\text{groups}},`
+-            :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]})`.
+-            The values of these weights are sampled from
+-            :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+-            :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
+-        bias (Tensor):   the learnable bias of the module of shape
+-            (out_channels). If :attr:`bias` is ``True``,
+-            then the values of these weights are
+-            sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
+-            :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
++                         (out_channels, in_channels, kernel_size[0], kernel_size[1]).
++                         The values of these weights are sampled from
++                         :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
++                         :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
++        bias (Tensor):   the learnable bias of the module of shape (out_channels). If :attr:`bias` is ``True``,
++                         then the values of these weights are
++                         sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
++                         :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
+ 
+-    Examples:
++    Examples::
+ 
+         >>> # With square kernels and equal stride
+         >>> m = nn.Conv2d(16, 33, 3, stride=2)
+@@ -130,34 +114,18 @@
+     .. _link:
+         https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+     """
+-    def __init__(
+-        self,
+-        in_channels: int,
+-        out_channels: int,
+-        kernel_size: _size_2_t,
+-        stride: _size_2_t = 1,
+-        padding: _size_2_t = 0,
+-        dilation: _size_2_t = 1,
+-        groups: int = 1,
+-        bias: bool = True,
+-        padding_mode: str = 'zeros'  # TODO: refine this type
+-    ):
++    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
++                 padding=0, dilation=1, groups=1, bias=True):
+         kernel_size = _pair(kernel_size)
+         stride = _pair(stride)
+         padding = _pair(padding)
+         dilation = _pair(dilation)
+         super(Conv2d, self).__init__(
+             in_channels, out_channels, kernel_size, stride, padding, dilation,
+-            False, _pair(0), groups, bias, padding_mode)
++            False, _pair(0), groups, bias)
+ 
+-    def _conv_forward(self, input, weight):
+-        if self.padding_mode != 'zeros':
+-            return F.conv2d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
+-                            weight, self.bias, self.stride,
+-                            _pair(0), self.dilation, self.groups)
+-        return F.conv2d(input, weight, self.bias, self.stride,
++    @weak_script_method
++    def forward(self, input):
++        return F.conv2d(input, self.weight, self.bias, self.stride,
+                         self.padding, self.dilation, self.groups)
+ 
+-    def forward(self, input: Tensor) -> Tensor:
+-        return self._conv_forward(input, self.weight)
+-
\ No newline at end of file
diff --git a/DataParallel.patch b/DataParallel.patch
new file mode 100644
index 0000000..8fddc8e
--- /dev/null
+++ b/DataParallel.patch
@@ -0,0 +1,97 @@
+--- /usr/local/lib/python3.5/dist-packages/torch/nn/parallel/data_parallel.py
++++ /usr/local/lib/python3.5/dist-packages/torch/nn/parallel/data_parallel.py
+@@ -10,16 +10,13 @@
+ 
+     The batch size should be larger than the number of GPUs used.
+ 
+-    .. warning::
+-        It is recommended to use :class:`~torch.nn.parallel.DistributedDataParallel`,
+-        instead of this class, to do multi-GPU training, even if there is only a single
+-        node. See: :ref:`cuda-nn-ddp-instead` and :ref:`ddp`.
++    See also: :ref:`cuda-nn-dataparallel-instead`
+ 
+     Arbitrary positional and keyword inputs are allowed to be passed into
+-    DataParallel but some types are specially handled. tensors will be
+-    **scattered** on dim specified (default 0). tuple, list and dict types will
+-    be shallow copied. The other types will be shared among different threads
+-    and can be corrupted if written to in the model's forward pass.
++    DataParallel EXCEPT Tensors. All tensors will be scattered on dim
++    specified (default 0). Primitive types will be broadcasted, but all
++    other types will be a shallow copy and can be corrupted if written to in
++    the model's forward pass.
+ 
+     The parallelized :attr:`module` must have its parameters and buffers on
+     ``device_ids[0]`` before running this :class:`~torch.nn.DataParallel`
+@@ -27,9 +24,9 @@
+ 
+     .. warning::
+         In each forward, :attr:`module` is **replicated** on each device, so any
+-        updates to the running module in ``forward`` will be lost. For example,
++        updates to the runing module in ``forward`` will be lost. For example,
+         if :attr:`module` has a counter attribute that is incremented in each
+-        ``forward``, it will always stay at the initial value because the update
++        ``forward``, it will always stay at the initial value becasue the update
+         is done on the replicas which are destroyed after ``forward``. However,
+         :class:`~torch.nn.DataParallel` guarantees that the replica on
+         ``device[0]`` will have its parameters and buffers sharing storage with
+@@ -74,7 +71,7 @@
+     Example::
+ 
+         >>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2])
+-        >>> output = net(input_var)  # input_var can be on any device, including CPU
++        >>> output = net(input_var)
+     """
+ 
+     # TODO: update notes/cuda.rst when this class handles 8+ GPUs well
+@@ -82,15 +79,13 @@
+     def __init__(self, module, device_ids=None, output_device=None, dim=0):
+         super(DataParallel, self).__init__()
+ 
+-        device_type = _get_available_device_type()
+-        if device_type is None:
++        if not torch.cuda.is_available():
+             self.module = module
+             self.device_ids = []
+             return
+ 
+         if device_ids is None:
+-            device_ids = _get_all_device_indices()
+-
++            device_ids = list(range(torch.cuda.device_count()))
+         if output_device is None:
+             output_device = device_ids[0]
+ 
+@@ -98,23 +93,15 @@
+         self.module = module
+         self.device_ids = list(map(lambda x: _get_device_index(x, True), device_ids))
+         self.output_device = _get_device_index(output_device, True)
+-        self.src_device_obj = torch.device(device_type, self.device_ids[0])
+ 
+         _check_balance(self.device_ids)
+ 
+         if len(self.device_ids) == 1:
+-            self.module.to(self.src_device_obj)
++            self.module.cuda(device_ids[0])
+ 
+     def forward(self, *inputs, **kwargs):
+         if not self.device_ids:
+             return self.module(*inputs, **kwargs)
+-
+-        for t in chain(self.module.parameters(), self.module.buffers()):
+-            if t.device != self.src_device_obj:
+-                raise RuntimeError("module must have its parameters and buffers "
+-                                   "on device {} (device_ids[0]) but found one of "
+-                                   "them on device: {}".format(self.src_device_obj, t.device))
+-
+         inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
+         if len(self.device_ids) == 1:
+             return self.module(*inputs[0], **kwargs[0])
+@@ -123,7 +110,7 @@
+         return self.gather(outputs, self.output_device)
+ 
+     def replicate(self, module, device_ids):
+-        return replicate(module, device_ids, not torch.is_grad_enabled())
++        return replicate(module, device_ids)
+ 
+     def scatter(self, inputs, kwargs, device_ids):
+         return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
\ No newline at end of file
diff --git a/Dropout.patch b/Dropout.patch
new file mode 100644
index 0000000..e4345bf
--- /dev/null
+++ b/Dropout.patch
@@ -0,0 +1,22 @@
+--- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/dropout.py
++++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/dropout.py
+@@ -18,8 +18,8 @@
+         inplace: If set to ``True``, will do this operation in-place. Default: ``False``
+ 
+     Shape:
+-        - Input: :math:`(*)`. Input can be of any shape
+-        - Output: :math:`(*)`. Output is of the same shape as input
++        - Input: `Any`. Input can be of any shape
++        - Output: `Same`. Output is of the same shape as input
+ 
+     Examples::
+ 
+@@ -31,6 +31,7 @@
+         detectors: https://arxiv.org/abs/1207.0580
+     """
+ 
+-    def forward(self, input: Tensor) -> Tensor:
++    @weak_script_method
++    def forward(self, input):
+         return F.dropout(input, self.p, self.training, self.inplace)
+ 
\ No newline at end of file
diff --git a/Linear.patch b/Linear.patch
new file mode 100644
index 0000000..ef25bbe
--- /dev/null
+++ b/Linear.patch
@@ -0,0 +1,64 @@
+--- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/linear.py
++++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/linear.py
+@@ -1,19 +1,17 @@
+ class Linear(Module):
+     r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
+-
+-    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
+ 
+     Args:
+         in_features: size of each input sample
+         out_features: size of each output sample
+-        bias: If set to ``False``, the layer will not learn an additive bias.
++        bias: If set to False, the layer will not learn an additive bias.
+             Default: ``True``
+ 
+     Shape:
+-        - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of
+-          additional dimensions and :math:`H_{in} = \text{in\_features}`
+-        - Output: :math:`(N, *, H_{out})` where all but the last dimension
+-          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.
++        - Input: :math:`(N, *, \text{in\_features})` where :math:`*` means any number of
++          additional dimensions
++        - Output: :math:`(N, *, \text{out\_features})` where all but the last dimension
++          are the same shape as the input.
+ 
+     Attributes:
+         weight: the learnable weights of the module of shape
+@@ -33,12 +31,9 @@
+         >>> print(output.size())
+         torch.Size([128, 30])
+     """
+-    __constants__ = ['in_features', 'out_features']
+-    in_features: int
+-    out_features: int
+-    weight: Tensor
++    __constants__ = ['bias']
+ 
+-    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
++    def __init__(self, in_features, out_features, bias=True):
+         super(Linear, self).__init__()
+         self.in_features = in_features
+         self.out_features = out_features
+@@ -49,17 +44,18 @@
+             self.register_parameter('bias', None)
+         self.reset_parameters()
+ 
+-    def reset_parameters(self) -> None:
++    def reset_parameters(self):
+         init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+         if self.bias is not None:
+             fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
+             bound = 1 / math.sqrt(fan_in)
+             init.uniform_(self.bias, -bound, bound)
+ 
+-    def forward(self, input: Tensor) -> Tensor:
++    @weak_script_method
++    def forward(self, input):
+         return F.linear(input, self.weight, self.bias)
+ 
+-    def extra_repr(self) -> str:
++    def extra_repr(self):
+         return 'in_features={}, out_features={}, bias={}'.format(
+             self.in_features, self.out_features, self.bias is not None
+         )
\ No newline at end of file
diff --git a/MaxPool2d.patch b/MaxPool2d.patch
new file mode 100644
index 0000000..5a991b0
--- /dev/null
+++ b/MaxPool2d.patch
@@ -0,0 +1,17 @@
+--- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/pooling.py
++++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/pooling.py
+@@ -57,12 +57,8 @@
+         https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
+     """
+ 
+-    kernel_size: _size_2_t
+-    stride: _size_2_t
+-    padding: _size_2_t
+-    dilation: _size_2_t
+-
+-    def forward(self, input: Tensor) -> Tensor:
++    @weak_script_method
++    def forward(self, input):
+         return F.max_pool2d(input, self.kernel_size, self.stride,
+                             self.padding, self.dilation, self.ceil_mode,
+                             self.return_indices)
\ No newline at end of file
diff --git a/PReLU.patch b/PReLU.patch
new file mode 100644
index 0000000..d74cce1
--- /dev/null
+++ b/PReLU.patch
@@ -0,0 +1,37 @@
+--- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/activation.py
++++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/activation.py
+@@ -37,9 +37,10 @@
+         - Output: :math:`(N, *)`, same shape as the input
+ 
+     Attributes:
+-        weight (Tensor): the learnable weights of shape (:attr:`num_parameters`).
++        weight (Tensor): the learnable weights of shape (attr:`num_parameters`).
++            The attr:`dtype` is default to
+ 
+-    .. image:: ../scripts/activation_images/PReLU.png
++    .. image:: scripts/activation_images/PReLU.png
+ 
+     Examples::
+ 
+@@ -47,17 +48,16 @@
+         >>> input = torch.randn(2)
+         >>> output = m(input)
+     """
+-    __constants__ = ['num_parameters']
+-    num_parameters: int
+ 
+-    def __init__(self, num_parameters: int = 1, init: float = 0.25) -> None:
++    def __init__(self, num_parameters=1, init=0.25):
+         self.num_parameters = num_parameters
+         super(PReLU, self).__init__()
+         self.weight = Parameter(torch.Tensor(num_parameters).fill_(init))
+ 
+-    def forward(self, input: Tensor) -> Tensor:
++    @weak_script_method
++    def forward(self, input):
+         return F.prelu(input, self.weight)
+ 
+-    def extra_repr(self) -> str:
++    def extra_repr(self):
+         return 'num_parameters={}'.format(self.num_parameters)
+ 
\ No newline at end of file
diff --git a/README.md b/README.md
index d714539..639fb56 100644
--- a/README.md
+++ b/README.md
@@ -32,12 +32,19 @@ Currently, only the test code is available, and training scripts are coming soon
 ## Usage
 ### To test the pretrained model
 ```
-python test_one_image.py --isTrain false  --name people --Arc_path models/BEST_checkpoint.tar --pic_a_path crop_224/mars.jpg --pic_b_path crop_224/ds.jpg --output_path output/
+python test_one_image.py --isTrain false  --name people --Arc_path arcface_model/arcface_checkpoint.tar --pic_a_path crop_224/6.jpg --pic_b_path crop_224/ds.jpg --output_path output/
 ```
 
---name refers to the checkpoint name.
+`--name` refers to the name of the SimSwap training logs.
 
 ## Pretrained model
+
+### Usage
+The drive contains two archive files: **checkpoints.zip** and **arcface_checkpoint.tar**.
+
+- **Copy arcface_checkpoint.tar into ./arcface_model**
+- **Unzip checkpoints.zip and place the extracted contents in the root dir ./**
+
 [[Google Drive]](https://dl.acm.org/doi/10.1145/3394171.3413630)
 
 [[Baidu Drive]](https://pan.baidu.com/s/1wFV11RVZMHqd-ky4YpLdcA) Password: ```jd2v```
diff --git a/Sequential.patch b/Sequential.patch
new file mode 100644
index 0000000..6c7f6ac
--- /dev/null
+++ b/Sequential.patch
@@ -0,0 +1,70 @@
+--- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/container.py
++++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/container.py
+@@ -22,15 +22,7 @@
+                 ]))
+     """
+ 
+-    @overload
+-    def __init__(self, *args: Module) -> None:
+-        ...
+-
+-    @overload
+-    def __init__(self, arg: 'OrderedDict[str, Module]') -> None:
+-        ...
+-
+-    def __init__(self, *args: Any):
++    def __init__(self, *args):
+         super(Sequential, self).__init__()
+         if len(args) == 1 and isinstance(args[0], OrderedDict):
+             for key, module in args[0].items():
+@@ -48,18 +40,17 @@
+         idx %= size
+         return next(islice(iterator, idx, None))
+ 
+-    @_copy_to_script_wrapper
+-    def __getitem__(self: T, idx) -> T:
++    def __getitem__(self, idx):
+         if isinstance(idx, slice):
+             return self.__class__(OrderedDict(list(self._modules.items())[idx]))
+         else:
+             return self._get_item_by_idx(self._modules.values(), idx)
+ 
+-    def __setitem__(self, idx: int, module: Module) -> None:
++    def __setitem__(self, idx, module):
+         key = self._get_item_by_idx(self._modules.keys(), idx)
+         return setattr(self, key, module)
+ 
+-    def __delitem__(self, idx: Union[slice, int]) -> None:
++    def __delitem__(self, idx):
+         if isinstance(idx, slice):
+             for key in list(self._modules.keys())[idx]:
+                 delattr(self, key)
+@@ -67,26 +58,16 @@
+             key = self._get_item_by_idx(self._modules.keys(), idx)
+             delattr(self, key)
+ 
+-    @_copy_to_script_wrapper
+-    def __len__(self) -> int:
++    def __len__(self):
+         return len(self._modules)
+ 
+-    @_copy_to_script_wrapper
+     def __dir__(self):
+         keys = super(Sequential, self).__dir__()
+         keys = [key for key in keys if not key.isdigit()]
+         return keys
+ 
+-    @_copy_to_script_wrapper
+-    def __iter__(self) -> Iterator[Module]:
+-        return iter(self._modules.values())
+-
+-    # NB: We can't really type check this function as the type of input
+-    # may change dynamically (as is tested in
+-    # TestScript.test_sequential_intermediary_types).  Cannot annotate
+-    # with Any as TorchScript expects a more precise type
+     def forward(self, input):
+-        for module in self:
++        for module in self._modules.values():
+             input = module(input)
+         return input
+ 
\ No newline at end of file
diff --git a/Sigmoid.patch b/Sigmoid.patch
new file mode 100644
index 0000000..9ad9766
--- /dev/null
+++ b/Sigmoid.patch
@@ -0,0 +1,29 @@
+--- /usr/local/lib/python3.5/dist-packages/torch/nn/modules/activation.py
++++ /usr/local/lib/python3.5/dist-packages/torch/nn/modules/activation.py
+@@ -2,7 +2,7 @@
+     r"""Applies the element-wise function:
+ 
+     .. math::
+-        \text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)}
++        \text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}
+ 
+ 
+     Shape:
+@@ -10,7 +10,7 @@
+           dimensions
+         - Output: :math:`(N, *)`, same shape as the input
+ 
+-    .. image:: ../scripts/activation_images/Sigmoid.png
++    .. image:: scripts/activation_images/Sigmoid.png
+ 
+     Examples::
+ 
+@@ -19,6 +19,7 @@
+         >>> output = m(input)
+     """
+ 
+-    def forward(self, input: Tensor) -> Tensor:
++    @weak_script_method
++    def forward(self, input):
+         return torch.sigmoid(input)
+ 
\ No newline at end of file
diff --git a/crop_224/1_source.jpg b/crop_224/1_source.jpg
new file mode 100644
index 0000000..de06097
Binary files /dev/null and b/crop_224/1_source.jpg differ
diff --git a/crop_224/2.jpg b/crop_224/2.jpg
new file mode 100644
index 0000000..1a5fbf1
Binary files /dev/null and b/crop_224/2.jpg differ
diff --git a/crop_224/6.jpg b/crop_224/6.jpg
new file mode 100644
index 0000000..958c554
Binary files /dev/null and b/crop_224/6.jpg differ
diff --git a/crop_224/cage.jpg b/crop_224/cage.jpg
new file mode 100644
index 0000000..5ee0f57
Binary files /dev/null and b/crop_224/cage.jpg differ
diff --git a/crop_224/dnl.jpg b/crop_224/dnl.jpg
new file mode 100644
index 0000000..e502d08
Binary files /dev/null and b/crop_224/dnl.jpg differ
diff --git a/crop_224/ds.jpg b/crop_224/ds.jpg
new file mode 100644
index 0000000..ba3b965
Binary files /dev/null and b/crop_224/ds.jpg differ
diff --git a/crop_224/gdg.jpg b/crop_224/gdg.jpg
new file mode 100644
index 0000000..8e15a7b
Binary files /dev/null and b/crop_224/gdg.jpg differ
diff --git a/crop_224/gy.jpg b/crop_224/gy.jpg
new file mode 100644
index 0000000..0be6964
Binary files /dev/null and b/crop_224/gy.jpg differ
diff --git a/crop_224/hzc.jpg b/crop_224/hzc.jpg
new file mode 100644
index 0000000..b76e509
Binary files /dev/null and b/crop_224/hzc.jpg differ
diff --git a/crop_224/hzxc.jpg b/crop_224/hzxc.jpg
new file mode 100644
index 0000000..abc5581
Binary files /dev/null and b/crop_224/hzxc.jpg differ
diff --git a/crop_224/james.jpg b/crop_224/james.jpg
new file mode 100644
index 0000000..a36e305
Binary files /dev/null and b/crop_224/james.jpg differ
diff --git a/crop_224/jl.jpg b/crop_224/jl.jpg
new file mode 100644
index 0000000..dbe942f
Binary files /dev/null and b/crop_224/jl.jpg differ
diff --git a/crop_224/lcw.jpg b/crop_224/lcw.jpg
new file mode 100644
index 0000000..30955c1
Binary files /dev/null and b/crop_224/lcw.jpg differ
diff --git a/crop_224/ljm.jpg b/crop_224/ljm.jpg
new file mode 100644
index 0000000..d452733
Binary files /dev/null and b/crop_224/ljm.jpg differ
diff --git a/crop_224/ljm2.jpg b/crop_224/ljm2.jpg
new file mode 100644
index 0000000..cdab34d
Binary files /dev/null and b/crop_224/ljm2.jpg differ
diff --git a/crop_224/ljm3.jpg b/crop_224/ljm3.jpg
new file mode 100644
index 0000000..e977708
Binary files /dev/null and b/crop_224/ljm3.jpg differ
diff --git a/crop_224/mars2.jpg b/crop_224/mars2.jpg
new file mode 100644
index 0000000..19088a5
Binary files /dev/null and b/crop_224/mars2.jpg differ
diff --git a/crop_224/mouth_open.jpg b/crop_224/mouth_open.jpg
new file mode 100644
index 0000000..241eac1
Binary files /dev/null and b/crop_224/mouth_open.jpg differ
diff --git a/crop_224/mtdm.jpg b/crop_224/mtdm.jpg
new file mode 100644
index 0000000..dca6be5
Binary files /dev/null and b/crop_224/mtdm.jpg differ
diff --git a/crop_224/trump.jpg b/crop_224/trump.jpg
new file mode 100644
index 0000000..442fe0e
Binary files /dev/null and b/crop_224/trump.jpg differ
diff --git a/crop_224/wlh.jpg b/crop_224/wlh.jpg
new file mode 100644
index 0000000..a22ab0b
Binary files /dev/null and b/crop_224/wlh.jpg differ
diff --git a/crop_224/zjl.jpg b/crop_224/zjl.jpg
new file mode 100644
index 0000000..e3719dc
Binary files /dev/null and b/crop_224/zjl.jpg differ
diff --git a/crop_224/zrf.jpg b/crop_224/zrf.jpg
new file mode 100644
index 0000000..0166ae1
Binary files /dev/null and b/crop_224/zrf.jpg differ
diff --git a/crop_224/zxy.jpg b/crop_224/zxy.jpg
new file mode 100644
index 0000000..c696347
Binary files /dev/null and b/crop_224/zxy.jpg differ
diff --git a/output/result.jpg b/output/result.jpg
new file mode 100644
index 0000000..edd5322
Binary files /dev/null and b/output/result.jpg differ
diff --git a/train.py b/train.py
deleted file mode 100644
index 691d8c4..0000000
--- a/train.py
+++ /dev/null
@@ -1,148 +0,0 @@
-import time
-import os
-import numpy as np
-import torch
-from torch.autograd import Variable
-from collections import OrderedDict
-from subprocess import call
-import fractions
-from options.train_options import TrainOptions
-from data.data_loader import CreateDataLoader
-from data.dataset_class import FaceDataSet
-from torch.utils.data import DataLoader
-from models.models import create_model
-import util.util as util
-from util.visualizer import Visualizer
-import cv2
-from torchvision import transforms
-
-def lcm(a,b): return abs(a * b)/fractions.gcd(a,b) if a and b else 0
-
-
-detransformer = transforms.Compose([
-        transforms.Normalize([0, 0, 0], [1/0.229, 1/0.224, 1/0.225]),
-        transforms.Normalize([-0.485, -0.456, -0.406], [1, 1, 1])
-    ])
-
-opt = TrainOptions().parse()
-iter_path = os.path.join(opt.checkpoints_dir, opt.name, 'iter.txt')
-
-if opt.continue_train:
-    try:
-        start_epoch, epoch_iter = np.loadtxt(iter_path , delimiter=',', dtype=int)
-    except:
-        start_epoch, epoch_iter = 1, 0
-    print('Resuming from epoch %d at iteration %d' % (start_epoch, epoch_iter))        
-else:    
-    start_epoch, epoch_iter = 1, 0
-
-opt.print_freq = lcm(opt.print_freq, opt.batchSize)    
-if opt.debug:
-    opt.display_freq = 1
-    opt.print_freq = 1
-    opt.niter = 1
-    opt.niter_decay = 0
-    opt.max_dataset_size = 10
-
-
-dataset = FaceDataSet('people_list.txt', opt.batchSize)
-data_loader = DataLoader(dataset, batch_size = opt.batchSize, shuffle=True)
-dataset_size = len(data_loader)
-
-device = torch.device("cuda:0")
-
-
-model = create_model(opt)
-visualizer = Visualizer(opt)
-
-optimizer_G, optimizer_D = model.module.optimizer_G, model.module.optimizer_D
-
-total_steps = (start_epoch-1) * 8608 + epoch_iter
-
-display_delta = total_steps % opt.display_freq
-print_delta = total_steps % opt.print_freq
-save_delta = total_steps % opt.save_latest_freq
-
-loss_avg = 0
-refresh_count = 0
-
-for epoch in range(start_epoch, opt.niter + opt.niter_decay + 1):
-    epoch_start_time = time.time()
-    if epoch != start_epoch:
-        epoch_iter = epoch_iter % dataset_size
-    for i, (img_id, img_att, latent_id, latent_att, data_type) in enumerate(data_loader):
-        if total_steps % opt.print_freq == print_delta:
-            iter_start_time = time.time()
-        total_steps += opt.batchSize
-        epoch_iter += opt.batchSize
-
-        # convert numpy to tensor
-        img_id = img_id.to(device)
-        img_att = img_att.to(device)
-        latent_id = latent_id.to(device)
-        latent_att = latent_att.to(device)
-
-
-        # whether to collect output images
-        save_fake = total_steps % opt.display_freq == display_delta
-
-        ############## Forward Pass ######################
-
-        losses, img_fake = model(img_id, img_att, latent_id, latent_att, for_G=True)
-
-        # update Generator weights
-        losses = [ torch.mean(x) if not isinstance(x, int) else x for x in losses ]
-        loss_dict = dict(zip(model.module.loss_names, losses))
-
-        loss_G = loss_dict['G_GAN'] + loss_dict.get('G_GAN_Feat', 0) + loss_dict['G_ID'] * opt.lambda_id
-        if data_type[0] == 0:
-            loss_G += loss_dict['G_Rec']
-
-        optimizer_G.zero_grad()
-        loss_G.backward(retain_graph=True)
-        optimizer_G.step()
-
-        loss_D = (loss_dict['D_fake'] + loss_dict['D_real']) * 0.5 + loss_dict['D_GP']
-        optimizer_D.zero_grad()
-        loss_D.backward()
-        optimizer_D.step()
-
-        ############## Display results and errors ##########
-        ### print out errors
-        if total_steps % opt.print_freq == print_delta:
-            errors = {k: v.data.item() if not isinstance(v, int) else v for k, v in loss_dict.items()}
-            t = (time.time() - iter_start_time) / opt.print_freq
-            visualizer.print_current_errors(epoch, epoch_iter, errors, t)
-            visualizer.plot_current_errors(errors, total_steps)
-
-        ### display output images
-        if save_fake:
-            '''visuals = OrderedDict([('input_label', util.tensor2label(data['label'][0], opt.label_nc)),
-                                   ('synthesized_image', util.tensor2im(generated.data[0])),
-                                   ('real_image', util.tensor2im(data['image'][0]))])'''
-            for i in range(img_id.shape[0]):
-                if i == 0:
-                    row1 = img_id[i]
-                    row2 = img_att[i]
-                    row3 = img_fake[i]
-                else:
-                    row1 = torch.cat([row1, img_id[i]], dim=2)
-                    row2 = torch.cat([row2, img_att[i]], dim=2)
-                    row3 = torch.cat([row3, img_fake[i]], dim=2)
-            full = torch.cat([row1, row2, row3], dim=1).detach()
-            full = full.permute(1, 2, 0)
-            output = full.to('cpu')
-            output = np.array(output)*255
-            output = output[..., ::-1]
-            cv2.imwrite('samples/step_'+str(total_steps)+'.jpg', output)
-
-        ### save latest model
-        if total_steps % opt.save_latest_freq == save_delta:
-            print('saving the latest model (epoch %d, total_steps %d)' % (epoch, total_steps))
-            model.module.save('latest')            
-            np.savetxt(iter_path, (epoch, epoch_iter), delimiter=',', fmt='%d')
-       
-    # end of epoch 
-    iter_end_time = time.time()
-    print('End of epoch %d / %d \t Time Taken: %d sec' %
-          (epoch, opt.niter + opt.niter_decay, time.time() - epoch_start_time))
\ No newline at end of file