# mdz/pytorch/ETTrack/1_scripts/1_save.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import sys
import time
from collections import OrderedDict
import types
from functools import partial
import math
import numpy as np
prj_path = os.path.join(os.path.dirname(__file__), '..')
if prj_path not in sys.path:
sys.path.append(prj_path)
from pytracking.evaluation import get_dataset
from pytracking.evaluation.running import _save_tracker_output
from pytracking.evaluation import Tracker
from tracking.basic_model.et_tracker import ET_Tracker
from tracking.basic_model.exemplar_transformer import ExemplarTransformer, AveragePooler, SqueezeExcite, _pair
from tracking.basic_model.exemplar_transformer import resolve_se_args,_get_activation_fn,get_initializer
from pytracking.utils import TrackerParams
from pytracking.tracker.et_tracker.et_tracker import TransconverTracker
from lib.models.super_model_DP import Super_model_DP
from lib.models.model_parts import *
import lib.models.models as lighttrack_model
from lib.utils.utils import load_lighttrack_model
# ICRAFT NOTE:
# To eliminate the floor_divide operator, four ExemplarTransformer variants with different hard-coded parameters are built.
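# For reference, a sketch (illustrative) of where the floor_divide node comes from:
# under torch.jit.trace, integer division on values derived from tensor shapes is
# recorded as aten::floor_divide, e.g. in the original weight-shape computation
#   dw_weight_shape = (B * out_channels, in_channels // dw_groups) + dw_kernel_size
# Hard-coding the shape instead, e.g. view(256, 1, 5, 5), keeps the op out of the graph.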
class ExemplarTransformer256_5(nn.Module):
def __init__(self, in_channels=256,
out_channels=256,
dw_padding=2,
pw_padding=0,
dw_stride=1,
pw_stride=1,
e_exemplars=4,
temperature=2,
hidden_dim=256,
dw_kernel_size=5,
pw_kernel_size=1,
layer_norm_eps = 1e-05,
dim_feedforward = 1024, # 2048,
ff_dropout = 0.1,
ff_activation = "relu",
num_heads = 8,
seq_red = 1,
se_ratio = 0.5,
se_kwargs = None,
se_act_layer = "relu",
norm_layer = nn.BatchNorm2d,
norm_kwargs = None,
sm_normalization = True,
dropout = False,
dropout_rate = 0.1) -> None:
super(ExemplarTransformer256_5, self).__init__()
        '''
        Sub Models:
        - average_pooler: attention module
        - K (keys): represents the last layer of the average pooler.
                    K is used for the computation of the mixing weights.
                    The mixing weights are used for both the spatial and the
                    pointwise convolution.
        - V (values): represents the different kernels.
                    There have to be two sets of values, one for the spatial and one for the
                    pointwise convolution. The shapes of the kernels differ.
        Args:
        - in_channels: number of input channels
        - out_channels: number of output channels
        - padding: input padding used when applying the kernel
        - stride: stride for kernel application
        - e_exemplars: number of expert kernels
        - temperature: temperature for the softmax
        - hidden_dim: hidden dimension used in the average pooler
        - kernel_size: kernel size used for the weight shape computation
        - layer_norm_eps: epsilon for the layer norm after the convolution operation
        - dim_feedforward: dimension of the FF network after the attention module
        - ff_dropout: dropout rate for the FF network after the attention module
        - ff_activation: activation function for the FF network after the attention module
        - num_heads: number of heads
        - seq_red: sequence reduction dimension for the global average pooling operation
        '''
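        # A minimal usage sketch (illustrative; shapes follow the hard-coded views in forward):
        #   m = ExemplarTransformer256_5().eval()
        #   m.K_T = m.K.T.contiguous().detach()   # must be set before forward, see ET_Tracker.template
        #   y = m(torch.randn(1, 256, 18, 18))    # -> [1, 256, 18, 18]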
## general parameters
self.in_channels = in_channels
self.out_channels = out_channels
self.e_exemplars = e_exemplars
norm_kwargs = norm_kwargs or {}
self.hidden_dim = hidden_dim
self.sm_norm = sm_normalization
self.K = nn.Parameter(torch.randn(e_exemplars, hidden_dim)) # could be an embedding / a mapping from X to K instead of pre-learned
self.K_T = None
self.dropout = dropout
self.do = nn.Dropout(dropout_rate)
## average pool
self.temperature = temperature
self.average_pooler = AveragePooler(seq_red=seq_red, c_dim=in_channels, hidden_dim=hidden_dim) #.cuda()
self.softmax = nn.Softmax(dim=-1)
## multihead setting
self.H = num_heads
self.head_dim = self.hidden_dim // self.H
## depthwise convolution parameters
self.dw_groups = self.out_channels
self.dw_kernel_size = _pair(dw_kernel_size)
self.dw_padding = dw_padding
self.dw_stride = dw_stride
self.dw_weight_shape = (self.out_channels, self.in_channels // self.dw_groups) + self.dw_kernel_size
dw_weight_num_param = 1
for wd in self.dw_weight_shape:
dw_weight_num_param *= wd
self.V_dw = nn.Parameter(torch.Tensor(e_exemplars, dw_weight_num_param))
self.dw_bn = norm_layer(self.in_channels, **norm_kwargs)
self.dw_act = nn.ReLU(inplace=True)
## pointwise convolution parameters
self.pw_groups = 1
self.pw_kernel_size = _pair(pw_kernel_size)
self.pw_padding = pw_padding
self.pw_stride = pw_stride
self.pw_weight_shape = (self.out_channels, self.in_channels // self.pw_groups) + self.pw_kernel_size
pw_weight_num_param = 1
for wd in self.pw_weight_shape:
pw_weight_num_param *= wd
self.V_pw = nn.Parameter(torch.Tensor(e_exemplars, pw_weight_num_param))
self.pw_bn = norm_layer(self.out_channels, **norm_kwargs)
self.pw_act = nn.ReLU(inplace=False)
## Squeeze-and-excitation
if se_ratio is not None and se_ratio > 0.:
se_kwargs = resolve_se_args(se_kwargs, self.in_channels, nn.ReLU) #_get_activation_fn(se_act_layer))
self.se = SqueezeExcite(self.in_channels, se_ratio=se_ratio, **se_kwargs)
## Implementation of Feedforward model after the QKV part
self.linear1 = nn.Linear(self.out_channels, dim_feedforward)
        self.ff_dropout = nn.Dropout(dropout)  # NB: 'dropout' is a bool here, so p=0; kept as in the original model code
self.linear2 = nn.Linear(dim_feedforward, self.out_channels)
self.norm1 = nn.LayerNorm(self.out_channels, eps=layer_norm_eps)
self.norm2 = nn.LayerNorm(self.out_channels, eps=layer_norm_eps)
self.ff_dropout1 = nn.Dropout(ff_dropout)
self.ff_dropout2 = nn.Dropout(ff_dropout)
self.ff_activation = _get_activation_fn(ff_activation)
# initialize the kernels
self.reset_parameters()
def reset_parameters(self):
init_weight_dw = get_initializer(
partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.e_exemplars, self.dw_weight_shape)
init_weight_dw(self.V_dw)
init_weight_pw = get_initializer(
partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.e_exemplars, self.pw_weight_shape)
init_weight_pw(self.V_pw)
def forward(self, x):
residual = x
# X: [B,C,H,W]
# apply average pooler
q = self.average_pooler(x)
d_k = q.shape[-1]
# Q: [B,S,C]
# ICRAFT NOTE:
        # The transpose of the K parameter used in the keys product must be precomputed, since transposition is not supported at runtime; this is done in the top-level ET_Tracker.template function.
# outer product with keys
#qk = einsum('b n c, k c -> b n k', q, self.K) # K^T: [C, K] QK^T: [B,S,K]
# qk = torch.matmul(q, self.K.T)
qk = torch.matmul(q, self.K_T)
# if self.sm_norm:
qk = 1/math.sqrt(d_k) * qk
# apply softmax
        attn = self.softmax(qk / 2)  # temperature = 2 hard-coded -> [batch_size, e_exemplars]
# multiply attention map with values
#dw_qkv_kernel = einsum('b s k, k e -> b s e', attn, self.V_dw) # V: [K, E_dw]
#pw_qkv_kernel = einsum('b s k, k e -> b s e', attn, self.V_pw) # V: [K, E_pw]
dw_qkv_kernel = torch.matmul(attn, self.V_dw) # V: [K, E_dw]
pw_qkv_kernel = torch.matmul(attn, self.V_pw) # V: [K, E_pw]
###########################################################################################
####### convolve input with the output instead of adding it to it in a residual way #######
###########################################################################################
## dw conv
        B, C, H, W = x.shape  # [1,256,18,18]
# ICRAFT NOTE:
        # To eliminate the floor_divide operator, dw_weight_shape is precomputed.
# dw conv
# dw_weight_shape = (B * self.out_channels, self.in_channels // self.dw_groups) + self.dw_kernel_size
# dw_weight = dw_qkv_kernel.view(dw_weight_shape)
dw_weight = dw_qkv_kernel.view(256,1,5,5)
# ICRAFT NOTE:
        # Remove the no-op reshape.
# reshape the input
# x = x.reshape(1, 256, 18, 18) #(1, B * C, H, W)
# apply convolution
x = F.conv2d(x, dw_weight, bias=None, stride=1, padding=2, groups=256)
# x, dw_weight, bias=None, stride=self.dw_stride, padding=self.dw_padding,
# groups=self.dw_groups * B)
# x = x.permute([1, 0, 2, 3]).view(B, self.out_channels, x.shape[-2], x.shape[-1])
x = x.permute([1, 0, 2, 3]).view(1, 256, 18, 18)
x = self.dw_bn(x)
x = self.dw_act(x)
## SE
x = self.se(x)
## pw conv
B, C, H, W = x.shape #[1,256,18,18]
# ICRAFT NOTE:
        # To eliminate the floor_divide operator, pw_weight_shape is precomputed.
        # pw conv
# pw_weight_shape = (B * self.out_channels, self.in_channels // self.pw_groups) + self.pw_kernel_size #[256,256,1,1]
# pw_weight = pw_qkv_kernel.view(pw_weight_shape)
pw_weight = pw_qkv_kernel.view(256,256,1,1)
# ICRAFT NOTE:
        # Remove the no-op view operator.
# reshape the input
# x = x.view(1, 256, 18, 18) #(1, B * C, H, W)
# apply convolution
x = F.conv2d(x, pw_weight, bias=None, stride=1, padding=0, groups=1)
# x, pw_weight, bias=None, stride=self.pw_stride, padding=self.pw_padding,
# groups=self.pw_groups * B)
# x = x.permute([1, 0, 2, 3]).view(B, self.out_channels, x.shape[-2], x.shape[-1])
x = x.permute([1, 0, 2, 3]).view(1, 256, 18, 18)
x = self.pw_bn(x)
x = self.pw_act(x)
# if self.dropout:
# x = x + self.do(residual)
# else:
x = x + residual
# reshape output of convolution operation
# out = x.view(B, self.out_channels, -1).permute(0,2,1)
out = x.view(1, 256, -1).permute(0,2,1)
# FF network
out = self.norm1(out) #[1,324,256]
out2 = self.linear2(self.ff_dropout(self.ff_activation(self.linear1(out))))
out = out + self.ff_dropout2(out2)
out = self.norm2(out)
# out = out.permute(0,2,1).view(B,C,H,W)
out = out.permute(0,2,1).view(1,256,18,18)
return out
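# The three classes below repeat ExemplarTransformer256_5 with different hard-coded
# channel counts, kernel sizes and paddings (256/k3, 192/k3, 192/k5); only the
# view(...) shapes, conv parameters and softmax scaling differ.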
class ExemplarTransformer256_3(nn.Module):
def __init__(self, in_channels=256,
out_channels=256,
dw_padding=1,
pw_padding=0,
dw_stride=1,
pw_stride=1,
e_exemplars=4,
temperature=2,
hidden_dim=256,
dw_kernel_size=3,
pw_kernel_size=1,
layer_norm_eps = 1e-05,
dim_feedforward = 1024, # 2048,
ff_dropout = 0.1,
ff_activation = "relu",
num_heads = 8,
seq_red = 1,
se_ratio = 0.5,
se_kwargs = None,
se_act_layer = "relu",
norm_layer = nn.BatchNorm2d,
norm_kwargs = None,
sm_normalization = True,
dropout = False,
dropout_rate = 0.1) -> None:
super(ExemplarTransformer256_3, self).__init__()
        '''
        Sub Models:
        - average_pooler: attention module
        - K (keys): represents the last layer of the average pooler.
                    K is used for the computation of the mixing weights.
                    The mixing weights are used for both the spatial and the
                    pointwise convolution.
        - V (values): represents the different kernels.
                    There have to be two sets of values, one for the spatial and one for the
                    pointwise convolution. The shapes of the kernels differ.
        Args:
        - in_channels: number of input channels
        - out_channels: number of output channels
        - padding: input padding used when applying the kernel
        - stride: stride for kernel application
        - e_exemplars: number of expert kernels
        - temperature: temperature for the softmax
        - hidden_dim: hidden dimension used in the average pooler
        - kernel_size: kernel size used for the weight shape computation
        - layer_norm_eps: epsilon for the layer norm after the convolution operation
        - dim_feedforward: dimension of the FF network after the attention module
        - ff_dropout: dropout rate for the FF network after the attention module
        - ff_activation: activation function for the FF network after the attention module
        - num_heads: number of heads
        - seq_red: sequence reduction dimension for the global average pooling operation
        '''
## general parameters
self.in_channels = in_channels
self.out_channels = out_channels
self.e_exemplars = e_exemplars
norm_kwargs = norm_kwargs or {}
self.hidden_dim = hidden_dim
self.sm_norm = sm_normalization
self.K = nn.Parameter(torch.randn(e_exemplars, hidden_dim)) # could be an embedding / a mapping from X to K instead of pre-learned
self.K_T = None
self.dropout = dropout
self.do = nn.Dropout(dropout_rate)
## average pool
self.temperature = temperature
self.average_pooler = AveragePooler(seq_red=seq_red, c_dim=in_channels, hidden_dim=hidden_dim) #.cuda()
self.softmax = nn.Softmax(dim=-1)
## multihead setting
self.H = num_heads
self.head_dim = self.hidden_dim // self.H
## depthwise convolution parameters
self.dw_groups = self.out_channels
self.dw_kernel_size = _pair(dw_kernel_size)
self.dw_padding = dw_padding
self.dw_stride = dw_stride
self.dw_weight_shape = (self.out_channels, self.in_channels // self.dw_groups) + self.dw_kernel_size
dw_weight_num_param = 1
for wd in self.dw_weight_shape:
dw_weight_num_param *= wd
self.V_dw = nn.Parameter(torch.Tensor(e_exemplars, dw_weight_num_param))
self.dw_bn = norm_layer(self.in_channels, **norm_kwargs)
self.dw_act = nn.ReLU(inplace=True)
## pointwise convolution parameters
self.pw_groups = 1
self.pw_kernel_size = _pair(pw_kernel_size)
self.pw_padding = pw_padding
self.pw_stride = pw_stride
self.pw_weight_shape = (self.out_channels, self.in_channels // self.pw_groups) + self.pw_kernel_size
pw_weight_num_param = 1
for wd in self.pw_weight_shape:
pw_weight_num_param *= wd
self.V_pw = nn.Parameter(torch.Tensor(e_exemplars, pw_weight_num_param))
self.pw_bn = norm_layer(self.out_channels, **norm_kwargs)
self.pw_act = nn.ReLU(inplace=False)
## Squeeze-and-excitation
if se_ratio is not None and se_ratio > 0.:
se_kwargs = resolve_se_args(se_kwargs, self.in_channels, nn.ReLU) #_get_activation_fn(se_act_layer))
self.se = SqueezeExcite(self.in_channels, se_ratio=se_ratio, **se_kwargs)
## Implementation of Feedforward model after the QKV part
self.linear1 = nn.Linear(self.out_channels, dim_feedforward)
        self.ff_dropout = nn.Dropout(dropout)  # NB: 'dropout' is a bool here, so p=0; kept as in the original model code
self.linear2 = nn.Linear(dim_feedforward, self.out_channels)
self.norm1 = nn.LayerNorm(self.out_channels, eps=layer_norm_eps)
self.norm2 = nn.LayerNorm(self.out_channels, eps=layer_norm_eps)
self.ff_dropout1 = nn.Dropout(ff_dropout)
self.ff_dropout2 = nn.Dropout(ff_dropout)
self.ff_activation = _get_activation_fn(ff_activation)
# initialize the kernels
self.reset_parameters()
def reset_parameters(self):
init_weight_dw = get_initializer(
partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.e_exemplars, self.dw_weight_shape)
init_weight_dw(self.V_dw)
init_weight_pw = get_initializer(
partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.e_exemplars, self.pw_weight_shape)
init_weight_pw(self.V_pw)
def forward(self, x):
residual = x
# X: [B,C,H,W]
# apply average pooler
q = self.average_pooler(x)
d_k = q.shape[-1]
# Q: [B,S,C]
# ICRAFT NOTE:
        # The transpose of the K parameter used in the keys product must be precomputed, since transposition is not supported at runtime; this is done in the top-level ET_Tracker.template function.
# outer product with keys
#qk = einsum('b n c, k c -> b n k', q, self.K) # K^T: [C, K] QK^T: [B,S,K]
# qk = torch.matmul(q, self.K.T)
qk = torch.matmul(q, self.K_T)
# if self.sm_norm:
qk = 1/math.sqrt(d_k) * qk
# apply softmax
        attn = self.softmax(qk / 2)  # temperature = 2 hard-coded (was: qk / self.temperature) -> [batch_size, e_exemplars]
# multiply attention map with values
#dw_qkv_kernel = einsum('b s k, k e -> b s e', attn, self.V_dw) # V: [K, E_dw]
#pw_qkv_kernel = einsum('b s k, k e -> b s e', attn, self.V_pw) # V: [K, E_pw]
dw_qkv_kernel = torch.matmul(attn, self.V_dw) # V: [K, E_dw]
pw_qkv_kernel = torch.matmul(attn, self.V_pw) # V: [K, E_pw]
###########################################################################################
####### convolve input with the output instead of adding it to it in a residual way #######
###########################################################################################
## dw conv
B, C, H, W = x.shape
# ICRAFT NOTE:
        # To eliminate the floor_divide operator, dw_weight_shape is precomputed.
# dw conv
# dw_weight_shape = (B * self.out_channels, self.in_channels // self.dw_groups) + self.dw_kernel_size
dw_weight = dw_qkv_kernel.view(256,1,3,3)
# reshape the input
# x = x.reshape(1,256,18,18) #(1, B * C, H, W)
# ICRAFT NOTE:
        # Remove the no-op reshape.
# apply convolution
x = F.conv2d(x, dw_weight, bias=None, stride=1, padding=1, groups=256)
# x, dw_weight, bias=None, stride=self.dw_stride, padding=self.dw_padding,
# groups=self.dw_groups * B)
x = x.permute([1, 0, 2, 3]).view(1,256,18,18) #(B, self.out_channels, x.shape[-2], x.shape[-1])
x = self.dw_bn(x)
x = self.dw_act(x)
## SE
x = self.se(x)
## pw conv
B, C, H, W = x.shape
# ICRAFT NOTE:
        # To eliminate the floor_divide operator, pw_weight_shape is precomputed.
        # pw conv
# pw_weight_shape = (B * self.out_channels, self.in_channels // self.pw_groups) + self.pw_kernel_size
pw_weight = pw_qkv_kernel.view(256,256,1,1)
# ICRAFT NOTE:
        # Remove the no-op view operator.
# reshape the input
# x = x.view(1,256,18,18)#(1, B * C, H, W)
# apply convolution
x = F.conv2d(x, pw_weight, bias=None, stride=1, padding=0, groups=1)
# x, pw_weight, bias=None, stride=self.pw_stride, padding=self.pw_padding,
# groups=self.pw_groups * B)
x = x.permute([1, 0, 2, 3]).view(1,256,18,18) #(B, self.out_channels, x.shape[-2], x.shape[-1])
x = self.pw_bn(x)
x = self.pw_act(x)
# if self.dropout:
# x = x + self.do(residual)
# else:
x = x + residual
# reshape output of convolution operation
# out = x.view(B, self.out_channels, -1).permute(0,2,1)
out = x.view(1, 256, -1).permute(0,2,1)
# FF network
out = self.norm1(out)
out2 = self.linear2(self.ff_dropout(self.ff_activation(self.linear1(out))))
out = out + self.ff_dropout2(out2)
out = self.norm2(out)
out = out.permute(0,2,1).view(1,256,18,18) #(B,C,H,W)
return out
class ExemplarTransformer192_3(nn.Module):
def __init__(self, in_channels=192,
out_channels=192,
dw_padding = 1,
pw_padding=0,
dw_stride=1,
pw_stride=1,
e_exemplars=4,
temperature=2,
hidden_dim=256,
dw_kernel_size=3,
pw_kernel_size=1,
layer_norm_eps = 1e-05,
dim_feedforward = 1024, # 2048,
ff_dropout = 0.1,
ff_activation = "relu",
num_heads = 8,
seq_red = 1,
se_ratio = 0.5,
se_kwargs = None,
se_act_layer = "relu",
norm_layer = nn.BatchNorm2d,
norm_kwargs = None,
sm_normalization = True,
dropout = False,
dropout_rate = 0.1) -> None:
super(ExemplarTransformer192_3, self).__init__()
        '''
        Sub Models:
        - average_pooler: attention module
        - K (keys): represents the last layer of the average pooler.
                    K is used for the computation of the mixing weights.
                    The mixing weights are used for both the spatial and the
                    pointwise convolution.
        - V (values): represents the different kernels.
                    There have to be two sets of values, one for the spatial and one for the
                    pointwise convolution. The shapes of the kernels differ.
        Args:
        - in_channels: number of input channels
        - out_channels: number of output channels
        - padding: input padding used when applying the kernel
        - stride: stride for kernel application
        - e_exemplars: number of expert kernels
        - temperature: temperature for the softmax
        - hidden_dim: hidden dimension used in the average pooler
        - kernel_size: kernel size used for the weight shape computation
        - layer_norm_eps: epsilon for the layer norm after the convolution operation
        - dim_feedforward: dimension of the FF network after the attention module
        - ff_dropout: dropout rate for the FF network after the attention module
        - ff_activation: activation function for the FF network after the attention module
        - num_heads: number of heads
        - seq_red: sequence reduction dimension for the global average pooling operation
        '''
## general parameters
self.in_channels = in_channels
self.out_channels = out_channels
self.e_exemplars = e_exemplars
norm_kwargs = norm_kwargs or {}
self.hidden_dim = hidden_dim
self.sm_norm = sm_normalization
self.K = nn.Parameter(torch.randn(e_exemplars, hidden_dim)) # could be an embedding / a mapping from X to K instead of pre-learned
self.K_T = None
self.dropout = dropout
self.do = nn.Dropout(dropout_rate)
## average pool
self.temperature = temperature
self.average_pooler = AveragePooler(seq_red=seq_red, c_dim=in_channels, hidden_dim=hidden_dim) #.cuda()
self.softmax = nn.Softmax(dim=-1)
## multihead setting
self.H = num_heads
self.head_dim = self.hidden_dim // self.H
## depthwise convolution parameters
self.dw_groups = self.out_channels
self.dw_kernel_size = _pair(dw_kernel_size)
self.dw_padding = dw_padding
self.dw_stride = dw_stride
self.dw_weight_shape = (self.out_channels, self.in_channels // self.dw_groups) + self.dw_kernel_size
dw_weight_num_param = 1
for wd in self.dw_weight_shape:
dw_weight_num_param *= wd
self.V_dw = nn.Parameter(torch.Tensor(e_exemplars, dw_weight_num_param))
self.dw_bn = norm_layer(self.in_channels, **norm_kwargs)
self.dw_act = nn.ReLU(inplace=True)
## pointwise convolution parameters
self.pw_groups = 1
self.pw_kernel_size = _pair(pw_kernel_size)
self.pw_padding = pw_padding
self.pw_stride = pw_stride
self.pw_weight_shape = (self.out_channels, self.in_channels // self.pw_groups) + self.pw_kernel_size
pw_weight_num_param = 1
for wd in self.pw_weight_shape:
pw_weight_num_param *= wd
self.V_pw = nn.Parameter(torch.Tensor(e_exemplars, pw_weight_num_param))
self.pw_bn = norm_layer(self.out_channels, **norm_kwargs)
self.pw_act = nn.ReLU(inplace=False)
## Squeeze-and-excitation
if se_ratio is not None and se_ratio > 0.:
se_kwargs = resolve_se_args(se_kwargs, self.in_channels, nn.ReLU) #_get_activation_fn(se_act_layer))
self.se = SqueezeExcite(self.in_channels, se_ratio=se_ratio, **se_kwargs)
## Implementation of Feedforward model after the QKV part
self.linear1 = nn.Linear(self.out_channels, dim_feedforward)
        self.ff_dropout = nn.Dropout(dropout)  # NB: 'dropout' is a bool here, so p=0; kept as in the original model code
self.linear2 = nn.Linear(dim_feedforward, self.out_channels)
self.norm1 = nn.LayerNorm(self.out_channels, eps=layer_norm_eps)
self.norm2 = nn.LayerNorm(self.out_channels, eps=layer_norm_eps)
self.ff_dropout1 = nn.Dropout(ff_dropout)
self.ff_dropout2 = nn.Dropout(ff_dropout)
self.ff_activation = _get_activation_fn(ff_activation)
# initialize the kernels
self.reset_parameters()
def reset_parameters(self):
init_weight_dw = get_initializer(
partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.e_exemplars, self.dw_weight_shape)
init_weight_dw(self.V_dw)
init_weight_pw = get_initializer(
partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.e_exemplars, self.pw_weight_shape)
init_weight_pw(self.V_pw)
def forward(self, x):
residual = x
# X: [B,C,H,W]
# apply average pooler
q = self.average_pooler(x) #(1,1,256)
# d_k = q.shape[-1]
# Q: [B,S,C]
# ICRAFT NOTE:
        # The transpose of the K parameter used in the keys product must be precomputed, since transposition is not supported at runtime; this is done in the top-level ET_Tracker.template function.
# outer product with keys
#qk = einsum('b n c, k c -> b n k', q, self.K) # K^T: [C, K] QK^T: [B,S,K]
# qk = torch.matmul(q, self.K.T)
qk = torch.matmul(q, self.K_T)
# if self.sm_norm:
        qk = (1 / 16.0) * qk  # 1/sqrt(d_k) with d_k = 256 hard-coded (was: 1/math.sqrt(d_k) * qk)
# apply softmax
        attn = self.softmax(qk / 2)  # temperature = 2 hard-coded -> [batch_size, e_exemplars]
# multiply attention map with values
#dw_qkv_kernel = einsum('b s k, k e -> b s e', attn, self.V_dw) # V: [K, E_dw]
#pw_qkv_kernel = einsum('b s k, k e -> b s e', attn, self.V_pw) # V: [K, E_pw]
dw_qkv_kernel = torch.matmul(attn, self.V_dw) # V: [K, E_dw]
pw_qkv_kernel = torch.matmul(attn, self.V_pw) # V: [K, E_pw]
###########################################################################################
####### convolve input with the output instead of adding it to it in a residual way #######
###########################################################################################
## dw conv
B, C, H, W = x.shape
# ICRAFT NOTE:
        # To eliminate the floor_divide operator, dw_weight_shape is precomputed.
# dw conv
# dw_weight_shape = (B * self.out_channels, self.in_channels // self.dw_groups) + self.dw_kernel_size
dw_weight = dw_qkv_kernel.view(192,1,3,3)
# ICRAFT NOTE:
        # Remove the no-op reshape.
# reshape the input
# x = x.reshape(1,192,18,18) #(1, B * C, H, W)
# apply convolution
x = F.conv2d(x, dw_weight, bias=None, stride=1, padding=1, groups=192)
# x, dw_weight, bias=None, stride=self.dw_stride, padding=self.dw_padding,
# groups=self.dw_groups * B)
x = x.permute([1, 0, 2, 3]).view(1,192,18,18) #(B, self.out_channels, x.shape[-2], x.shape[-1])
x = self.dw_bn(x)
x = self.dw_act(x)
## SE
x = self.se(x)
## pw conv
B, C, H, W = x.shape
# ICRAFT NOTE:
        # To eliminate the floor_divide operator, pw_weight_shape is precomputed.
        # pw conv
# pw_weight_shape = (B * self.out_channels, self.in_channels // self.pw_groups) + self.pw_kernel_size
pw_weight = pw_qkv_kernel.view(192,192,1,1)
# ICRAFT NOTE:
        # Remove the no-op view operator.
# reshape the input
# x = x.view(1,192,18,18) #(1, B * C, H, W)
# apply convolution
x = F.conv2d(x, pw_weight, bias=None, stride=1, padding=0, groups=1)
# x, pw_weight, bias=None, stride=self.pw_stride, padding=self.pw_padding,
# groups=self.pw_groups * B)
x = x.permute([1, 0, 2, 3]).view(1,192,18,18) #(B, self.out_channels, x.shape[-2], x.shape[-1])
x = self.pw_bn(x)
x = self.pw_act(x)
# if self.dropout:
# x = x + self.do(residual)
# else:
x = x + residual
# reshape output of convolution operation
# out = x.view(B, self.out_channels, -1).permute(0,2,1)
out = x.view(1, 192, -1).permute(0,2,1)
# FF network
out = self.norm1(out)
out2 = self.linear2(self.ff_dropout(self.ff_activation(self.linear1(out))))
out = out + self.ff_dropout2(out2)
out = self.norm2(out)
out = out.permute(0,2,1).view(1,192,18,18) #(B,C,H,W)
return out
class ExemplarTransformer192_5(nn.Module):
def __init__(self, in_channels=192,
out_channels=192,
dw_padding=2,
pw_padding=0,
dw_stride=1,
pw_stride=1,
e_exemplars=4,
temperature=2,
hidden_dim=256,
dw_kernel_size=5,
pw_kernel_size=1,
layer_norm_eps = 1e-05,
dim_feedforward = 1024, # 2048,
ff_dropout = 0.1,
ff_activation = "relu",
num_heads = 8,
seq_red = 1,
se_ratio = 0.5,
se_kwargs = None,
se_act_layer = "relu",
norm_layer = nn.BatchNorm2d,
norm_kwargs = None,
sm_normalization = True,
dropout = False,
dropout_rate = 0.1) -> None:
super(ExemplarTransformer192_5, self).__init__()
        '''
        Sub Models:
        - average_pooler: attention module
        - K (keys): represents the last layer of the average pooler.
                    K is used for the computation of the mixing weights.
                    The mixing weights are used for both the spatial and the
                    pointwise convolution.
        - V (values): represents the different kernels.
                    There have to be two sets of values, one for the spatial and one for the
                    pointwise convolution. The shapes of the kernels differ.
        Args:
        - in_channels: number of input channels
        - out_channels: number of output channels
        - padding: input padding used when applying the kernel
        - stride: stride for kernel application
        - e_exemplars: number of expert kernels
        - temperature: temperature for the softmax
        - hidden_dim: hidden dimension used in the average pooler
        - kernel_size: kernel size used for the weight shape computation
        - layer_norm_eps: epsilon for the layer norm after the convolution operation
        - dim_feedforward: dimension of the FF network after the attention module
        - ff_dropout: dropout rate for the FF network after the attention module
        - ff_activation: activation function for the FF network after the attention module
        - num_heads: number of heads
        - seq_red: sequence reduction dimension for the global average pooling operation
        '''
## general parameters
self.in_channels = in_channels
self.out_channels = out_channels
self.e_exemplars = e_exemplars
norm_kwargs = norm_kwargs or {}
self.hidden_dim = hidden_dim
self.sm_norm = sm_normalization
self.K = nn.Parameter(torch.randn(e_exemplars, hidden_dim)) # could be an embedding / a mapping from X to K instead of pre-learned
self.K_T = None
self.dropout = dropout
self.do = nn.Dropout(dropout_rate)
## average pool
self.temperature = temperature
self.average_pooler = AveragePooler(seq_red=seq_red, c_dim=in_channels, hidden_dim=hidden_dim) #.cuda()
self.softmax = nn.Softmax(dim=-1)
## multihead setting
self.H = num_heads
self.head_dim = self.hidden_dim // self.H
## depthwise convolution parameters
self.dw_groups = self.out_channels
self.dw_kernel_size = _pair(dw_kernel_size)
self.dw_padding = dw_padding
self.dw_stride = dw_stride
self.dw_weight_shape = (self.out_channels, self.in_channels // self.dw_groups) + self.dw_kernel_size
dw_weight_num_param = 1
for wd in self.dw_weight_shape:
dw_weight_num_param *= wd
self.V_dw = nn.Parameter(torch.Tensor(e_exemplars, dw_weight_num_param))
self.dw_bn = norm_layer(self.in_channels, **norm_kwargs)
self.dw_act = nn.ReLU(inplace=True)
## pointwise convolution parameters
self.pw_groups = 1
self.pw_kernel_size = _pair(pw_kernel_size)
self.pw_padding = pw_padding
self.pw_stride = pw_stride
self.pw_weight_shape = (self.out_channels, self.in_channels // self.pw_groups) + self.pw_kernel_size
pw_weight_num_param = 1
for wd in self.pw_weight_shape:
pw_weight_num_param *= wd
self.V_pw = nn.Parameter(torch.Tensor(e_exemplars, pw_weight_num_param))
self.pw_bn = norm_layer(self.out_channels, **norm_kwargs)
self.pw_act = nn.ReLU(inplace=False)
## Squeeze-and-excitation
if se_ratio is not None and se_ratio > 0.:
se_kwargs = resolve_se_args(se_kwargs, self.in_channels, nn.ReLU) #_get_activation_fn(se_act_layer))
self.se = SqueezeExcite(self.in_channels, se_ratio=se_ratio, **se_kwargs)
## Implementation of Feedforward model after the QKV part
self.linear1 = nn.Linear(self.out_channels, dim_feedforward)
        self.ff_dropout = nn.Dropout(dropout)  # NB: 'dropout' is a bool here, so p=0; kept as in the original model code
self.linear2 = nn.Linear(dim_feedforward, self.out_channels)
self.norm1 = nn.LayerNorm(self.out_channels, eps=layer_norm_eps)
self.norm2 = nn.LayerNorm(self.out_channels, eps=layer_norm_eps)
self.ff_dropout1 = nn.Dropout(ff_dropout)
self.ff_dropout2 = nn.Dropout(ff_dropout)
self.ff_activation = _get_activation_fn(ff_activation)
# initialize the kernels
self.reset_parameters()
def reset_parameters(self):
init_weight_dw = get_initializer(
partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.e_exemplars, self.dw_weight_shape)
init_weight_dw(self.V_dw)
init_weight_pw = get_initializer(
partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.e_exemplars, self.pw_weight_shape)
init_weight_pw(self.V_pw)
def forward(self, x):
residual = x
# X: [B,C,H,W]
# apply average pooler
q = self.average_pooler(x)
# d_k = q.shape[-1]
# Q: [B,S,C]
# ICRAFT NOTE:
        # The transpose of the K parameter used in the keys product must be precomputed, since transposition is not supported at runtime; this is done in the top-level ET_Tracker.template function.
# outer product with keys
#qk = einsum('b n c, k c -> b n k', q, self.K) # K^T: [C, K] QK^T: [B,S,K]
# qk = torch.matmul(q, self.K.T)
qk = torch.matmul(q, self.K_T)
# if self.sm_norm:
# qk = 1/math.sqrt(d_k) * qk
        qk = (1 / 16.0) * qk  # 1/sqrt(d_k) with d_k = 256 hard-coded
# apply softmax
attn = self.softmax(qk/self.temperature) # -> [batch_size, e_exemplars]
# multiply attention map with values
#dw_qkv_kernel = einsum('b s k, k e -> b s e', attn, self.V_dw) # V: [K, E_dw]
#pw_qkv_kernel = einsum('b s k, k e -> b s e', attn, self.V_pw) # V: [K, E_pw]
dw_qkv_kernel = torch.matmul(attn, self.V_dw) # V: [K, E_dw]
pw_qkv_kernel = torch.matmul(attn, self.V_pw) # V: [K, E_pw]
###########################################################################################
####### convolve input with the output instead of adding it to it in a residual way #######
###########################################################################################
## dw conv
B, C, H, W = x.shape
# ICRAFT NOTE:
        # To eliminate the floor_divide operator, dw_weight_shape is precomputed.
# dw conv
# dw_weight_shape = (B * self.out_channels, self.in_channels // self.dw_groups) + self.dw_kernel_size
dw_weight = dw_qkv_kernel.view(192,1,5,5) #(dw_weight_shape)
# ICRAFT NOTE:
        # Remove the no-op reshape.
# reshape the input
# x = x.reshape(1,192,18,18) #(1, B * C, H, W)
# apply convolution
x = F.conv2d(x, dw_weight, bias=None, stride=1, padding=2,groups=192)
# x, dw_weight, bias=None, stride=self.dw_stride, padding=self.dw_padding,
# groups=self.dw_groups * B)
x = x.permute([1, 0, 2, 3]).view(1,192,18,18) #(B, self.out_channels, x.shape[-2], x.shape[-1])
x = self.dw_bn(x)
x = self.dw_act(x)
## SE
x = self.se(x)
## pw conv
B, C, H, W = x.shape
# ICRAFT NOTE:
        # To eliminate the floor_divide operator, pw_weight_shape is precomputed.
        # pw conv
# pw_weight_shape = (B * self.out_channels, self.in_channels // self.pw_groups) + self.pw_kernel_size
pw_weight = pw_qkv_kernel.view(192,192,1,1) #(pw_weight_shape)
# ICRAFT NOTE:
        # Remove the no-op view operator.
# reshape the input
# x = x.view(1,192,18,18) #(1, B * C, H, W)
# apply convolution
x = F.conv2d(x, pw_weight, bias=None, stride=1, padding=0, groups=1)
# x, pw_weight, bias=None, stride=self.pw_stride, padding=self.pw_padding,
# groups=self.pw_groups * B)
x = x.permute([1, 0, 2, 3]).view(1,192,18,18) #(B, self.out_channels, x.shape[-2], x.shape[-1])
x = self.pw_bn(x)
x = self.pw_act(x)
# if self.dropout:
# x = x + self.do(residual)
# else:
x = x + residual
# reshape output of convolution operation
# out = x.view(B, self.out_channels, -1).permute(0,2,1)
out = x.view(1, 192, -1).permute(0,2,1)
# FF network
out = self.norm1(out)
out2 = self.linear2(self.ff_dropout(self.ff_activation(self.linear1(out))))
out = out + self.ff_dropout2(out2)
out = self.norm2(out)
out = out.permute(0,2,1).view(1,192,18,18) #(B,C,H,W)
return out
def Point_Neck_Mobile_simple_DP_forward(self, kernel, search): #, stride_idx=None):
'''stride_idx: 0 or 1. 0 represents stride 8. 1 represents stride 16'''
# oup = {}
corr_feat = self.pw_corr[0]([kernel], [search]) # [1,64,18,18]<-[[1,96,8,8]],[[1,96,18,18]]
#print("corr_feat shape: ", corr_feat.shape)
#print(f'type of corr_feat: {type(corr_feat)}')
# if self.adjust:
corr_feat = self.adj_layer[0](corr_feat) # [1,128,18,18]<-[1,64,18,18]
# ICRAFT NOTE:
    # Unpack the dict-based outputs and change the interface to return a tuple.
# oup['cls'], oup['reg'] = corr_feat, corr_feat
# return oup
return corr_feat, corr_feat
Point_Neck_Mobile_simple_DP.forward = Point_Neck_Mobile_simple_DP_forward
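# The deployment changes are applied by monkey-patching the upstream classes
# (Point_Neck_Mobile_simple_DP.forward above; ET_Tracker.__init__/template/forward and
# TransconverTracker.initialize/update below), so the original ETTrack sources stay untouched.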
def ET_Tracker__init__(self, linear_reg=True,
search_size=256,
template_size=128,
stride=16,
adj_channel=128,
e_exemplars=4,
path_name='back_04502514044521042540+cls_211000022+reg_100000111_ops_32',
arch='LightTrackM_Subnet',
sm_normalization=False,
temperature=1,
dropout=False):
super(ET_Tracker, self).__init__()
'''
Args:
- sm_normalization: whether to normalize the QK^T by sqrt(C) in the MultiheadTransConver
'''
self.backbone_path_name = path_name
# Backbone network
siam_net = lighttrack_model.__dict__[arch](path_name, stride=stride)
# Backbone
self.backbone_net = siam_net.features
# Neck
self.neck = MC_BN(inp_c=[96]) # BN with multiple types of input channels
# Feature Fusor
self.feature_fusor = Point_Neck_Mobile_simple_DP(num_kernel_list=[64], matrix=True,
adj_channel=adj_channel) # stride=8, stride=16
inchannels = 128
outchannels_cls = 256
outchannels_reg = 192
padding_3 = (3 - 1) // 2
padding_5 = (5 - 1) // 2
# ICRAFT NOTE:
    # Four ExemplarTransformer classes with hard-coded shapes are added, mainly to eliminate the floor_divide operator.
self.cls_branch_1 = SeparableConv2d_BNReLU(inchannels, outchannels_cls, kernel_size=5, stride=1, padding=padding_5)
# self.cls_branch_2 = ExemplarTransformer(in_channels=outchannels_cls, out_channels=outchannels_cls, dw_padding=padding_5, e_exemplars=e_exemplars, dw_kernel_size=5, pw_kernel_size=1, sm_normalization=sm_normalization, temperature=temperature, dropout=dropout)
# self.cls_branch_3 = ExemplarTransformer(in_channels=outchannels_cls, out_channels=outchannels_cls, dw_padding=padding_3, e_exemplars=e_exemplars, dw_kernel_size=3, pw_kernel_size=1, sm_normalization=sm_normalization, temperature=temperature, dropout=dropout)
# self.cls_branch_4 = ExemplarTransformer(in_channels=outchannels_cls, out_channels=outchannels_cls, dw_padding=padding_3, e_exemplars=e_exemplars, dw_kernel_size=3, pw_kernel_size=1, sm_normalization=sm_normalization, temperature=temperature, dropout=dropout)
# self.cls_branch_5 = ExemplarTransformer(in_channels=outchannels_cls, out_channels=outchannels_cls, dw_padding=padding_3, e_exemplars=e_exemplars, dw_kernel_size=3, pw_kernel_size=1, sm_normalization=sm_normalization, temperature=temperature, dropout=dropout)
self.cls_branch_2 = ExemplarTransformer256_5()
self.cls_branch_3 = ExemplarTransformer256_3()
self.cls_branch_4 = ExemplarTransformer256_3()
self.cls_branch_5 = ExemplarTransformer256_3()
self.cls_branch_6 = SeparableConv2d_BNReLU(outchannels_cls, outchannels_cls, kernel_size=3, stride=1, padding=padding_3)
self.cls_pred_head = cls_pred_head(inchannels=outchannels_cls)
self.bbreg_branch_1 = SeparableConv2d_BNReLU(inchannels, outchannels_reg, kernel_size=3, stride=1, padding=padding_3)
# self.bbreg_branch_2 = ExemplarTransformer(in_channels=outchannels_reg, out_channels=outchannels_reg, dw_padding=padding_3, e_exemplars=e_exemplars, dw_kernel_size=3, pw_kernel_size=1, sm_normalization=sm_normalization, temperature=temperature, dropout=dropout)
# self.bbreg_branch_3 = ExemplarTransformer(in_channels=outchannels_reg, out_channels=outchannels_reg, dw_padding=padding_3, e_exemplars=e_exemplars, dw_kernel_size=3, pw_kernel_size=1, sm_normalization=sm_normalization, temperature=temperature, dropout=dropout)
# self.bbreg_branch_4 = ExemplarTransformer(in_channels=outchannels_reg, out_channels=outchannels_reg, dw_padding=padding_3, e_exemplars=e_exemplars, dw_kernel_size=3, pw_kernel_size=1, sm_normalization=sm_normalization, temperature=temperature, dropout=dropout)
# self.bbreg_branch_5 = ExemplarTransformer(in_channels=outchannels_reg, out_channels=outchannels_reg, dw_padding=padding_3, e_exemplars=e_exemplars, dw_kernel_size=3, pw_kernel_size=1, sm_normalization=sm_normalization, temperature=temperature, dropout=dropout)
# self.bbreg_branch_6 = ExemplarTransformer(in_channels=outchannels_reg, out_channels=outchannels_reg, dw_padding=padding_5, e_exemplars=e_exemplars, dw_kernel_size=5, pw_kernel_size=1, sm_normalization=sm_normalization, temperature=temperature, dropout=dropout)
# self.bbreg_branch_7 = ExemplarTransformer(in_channels=outchannels_reg, out_channels=outchannels_reg, dw_padding=padding_5, e_exemplars=e_exemplars, dw_kernel_size=5, pw_kernel_size=1, sm_normalization=sm_normalization, temperature=temperature, dropout=dropout)
self.bbreg_branch_2 = ExemplarTransformer192_3()
self.bbreg_branch_3 = ExemplarTransformer192_3()
self.bbreg_branch_4 = ExemplarTransformer192_3()
self.bbreg_branch_5 = ExemplarTransformer192_3()
self.bbreg_branch_6 = ExemplarTransformer192_5()
self.bbreg_branch_7 = ExemplarTransformer192_5()
self.bbreg_branch_8 = SeparableConv2d_BNReLU(outchannels_reg, outchannels_reg, kernel_size=5, stride=1, padding=padding_5)
self.reg_pred_head = reg_pred_head(inchannels=outchannels_reg, linear_reg=linear_reg)
def ET_Tracker_template(self, z):
'''
Used during the tracking -> computes the embedding of the target in the first frame.
'''
# ICRAFT NOTE:
    # Precompute the transpose of the K parameters.
self.cls_branch_2.K_T = self.cls_branch_2.K.T.contiguous().detach()
self.cls_branch_3.K_T = self.cls_branch_3.K.T.contiguous().detach()
self.cls_branch_4.K_T = self.cls_branch_4.K.T.contiguous().detach()
self.cls_branch_5.K_T = self.cls_branch_5.K.T.contiguous().detach()
self.bbreg_branch_2.K_T = self.bbreg_branch_2.K.T.contiguous().detach()
self.bbreg_branch_3.K_T = self.bbreg_branch_3.K.T.contiguous().detach()
self.bbreg_branch_4.K_T = self.bbreg_branch_4.K.T.contiguous().detach()
self.bbreg_branch_5.K_T = self.bbreg_branch_5.K.T.contiguous().detach()
self.bbreg_branch_6.K_T = self.bbreg_branch_6.K.T.contiguous().detach()
self.bbreg_branch_7.K_T = self.bbreg_branch_7.K.T.contiguous().detach()
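    # With K_T precomputed, the runtime matmul is equivalent to the original:
    #   torch.matmul(q, K.T) == torch.matmul(q, K_T)   where K_T = K.T.contiguous().detach()
    # so no transpose op has to appear in the traced graph.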
with torch.no_grad():
# ICRAFT NOTE:
        # Export the template network as a TorchScript .pt file.
t_z = torch.randn((1,3,127,127))
        # t_z.numpy().astype(np.float32).tofile('./ettrack/template_1_3_127_127.ftmp')  # exporting on GPU keeps hardtanh
        ettrack_template_backbone_t = torch.jit.trace(self.backbone_net, t_z)  # tracing on CPU turns hardtanh into relu6
torch.jit.save(ettrack_template_backbone_t,TRACE_PATH + 'ettrack_net1_1x3x127x127_traced.pt')
print('net1 traced')
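        # A minimal reload check (sketch; path and shapes as in the trace above):
        #   net1 = torch.jit.load(TRACE_PATH + 'ettrack_net1_1x3x127x127_traced.pt')
        #   zf = net1(torch.randn(1, 3, 127, 127))  # expected: [1, 96, 8, 8]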
# ICRAFT NOTE:
        # Export z and zf as .ftmp files to build the quantization calibration set.
# z.cpu().contiguous().numpy().astype(np.float32).tofile('icraft/calibration/airplane-1_z.ftmp')
# self.zf.cpu().contiguous().numpy().astype(np.float32).tofile('icraft/calibration/airplane-1_zf.ftmp')
self.zf = self.backbone_net(z) # [1,96, 8, 8]
# ICRAFT NOTE:
# In ET_Tracker.forward, zf is the feature precomputed from the template; a zf input is added so that the CNN+TFM network can be exported.
def ET_Tracker_forward(self, x, zf):
# [1,3,288,288]
xf = self.backbone_net(x)
# [1,96,16,16]
# Batch Normalization before Corr
# ICRAFT NOTE:
    # For deployment, the member variable is passed in as a forward input instead.
# zf, xf = self.neck(self.zf, xf) #[1,96,8,8] [1,96,16,16]<-[1,96,8,8] [1,96,16,16]
zf, xf = self.neck(zf, xf) #[1,96,8,8] [1,96,16,16]<-[1,96,8,8] [1,96,16,16]
# feat_dict = self.feature_fusor(zf, xf) # cls:[1,128,16,16],[1,128,16,16]<-[1,96,8,8] [1,96,16,16]
feat_cls, feat_reg = self.feature_fusor(zf, xf) # cls:[1,128,16,16],[1,128,16,16]<-[1,96,8,8] [1,96,16,16]
c = self.cls_branch_1(feat_cls)#(feat_dict['cls'])
c = self.cls_branch_2(c)
c = self.cls_branch_3(c)
c = self.cls_branch_4(c)
c = self.cls_branch_5(c)
c = self.cls_branch_6(c)
c = self.cls_pred_head(c) # [1,1,16,16]
b = self.bbreg_branch_1(feat_reg)#(feat_dict['reg'])
b = self.bbreg_branch_2(b)
b = self.bbreg_branch_3(b)
b = self.bbreg_branch_4(b)
b = self.bbreg_branch_5(b)
b = self.bbreg_branch_6(b)
b = self.bbreg_branch_7(b)
b = self.bbreg_branch_8(b)
b = self.reg_pred_head(b) # [1,4,16,16]
return c, b
ET_Tracker.__init__ = ET_Tracker__init__
ET_Tracker.template = ET_Tracker_template
ET_Tracker.forward = ET_Tracker_forward
from lib.utils.utils import get_subwindow_tracking, python2round
from lib.utils.utils import cxy_wh_2_rect, get_axis_aligned_bbox
from pytracking.tracker.et_tracker.et_tracker import Config
# ICRAFT NOTE:
# Redefine initialize to change the pipeline.
def TransconverTracker_initialize(self, image, info: dict) -> dict:
''' initialize the model '''
state_dict = dict()
# Initialize some stuff
self.frame_num = 1
if not self.params.has('device'):
self.params.device = 'cuda' if self.params.use_gpu else 'cpu'
self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
# Initialize network
# verify that the model is correctly initialized:
self.initialize_features()
# The Baseline network
self.net = self.params.net
self.net.eval()
self.net.to(self.params.device)
self.weight_style = self.params.get('weight_style', 'regular')
print(f'tracker weight style: {self.weight_style}')
# Time initialization
tic = time.time()
# Get target position and size
state = torch.tensor(info['init_bbox']) # x,y,w,h
cx, cy, w, h = get_axis_aligned_bbox(state)
self.target_pos = np.array([cx,cy])
self.target_sz = np.array([w,h])
#self.target_pos = torch.Tensor([state[1] + (state[3] - 1)/2, state[0] + (state[2] - 1)/2])
#self.target_sz = torch.Tensor([state[3], state[2]])
# Get object id
self.object_id = info.get('object_ids', [None])[0]
self.id_str = '' if self.object_id is None else ' {}'.format(self.object_id)
# Set sizes
self.image_sz = torch.Tensor([image.shape[0], image.shape[1]])
sz = self.params.image_sample_size # search size (256, 256)
sz = torch.Tensor([sz, sz] if isinstance(sz, int) else sz)
self.img_sample_sz = sz
self.img_support_sz = self.img_sample_sz
self.stride = self.params.stride
# LightTrack specific parameters
p = Config(stride=self.stride, even=self.params.even)
state_dict['im_h'] = image.shape[0]
state_dict['im_w'] = image.shape[1]
# ICRAFT NOTE:
    # The original pipeline chose an 18x18 or 16x16 grid depending on the ratio of the initial box area to the frame area; this is unified to 18x18.
# if ((self.target_sz[0] * self.target_sz[1]) / float(state_dict['im_h'] * state_dict['im_w'])) < 0.004:
# p.instance_size = self.params.big_sz # cfg_benchmark['big_sz'] # -> p.instance_size = 288
# p.renew()
# else:
# p.instance_size = self.params.small_sz # cfg_benchmark['small_sz'] # -> p.instance_size = 256
# p.renew()
    ### ICRAFT NOTE: force the use of big_sz (288)
p.instance_size = self.params.big_sz # cfg_benchmark['big_sz'] # -> p.instance_size = 288
p.renew()
# compute grids
self.grids(p)
wc_z = self.target_sz[0] + p.context_amount * sum(self.target_sz)
hc_z = self.target_sz[1] + p.context_amount * sum(self.target_sz)
s_z = round(np.sqrt(wc_z * hc_z).item())
avg_chans = np.mean(image, axis=(0, 1))
z_crop, _ = get_subwindow_tracking(image, self.target_pos, p.exemplar_size, s_z, avg_chans)
z_crop = self.normalize(z_crop)
z = z_crop.unsqueeze(0)
self.net.template(z.to(self.params.device))
if p.windowing == 'cosine':
window = np.outer(np.hanning(p.score_size), np.hanning(p.score_size)) # [17,17]
elif p.windowing == 'uniform':
        window = np.ones((int(p.score_size), int(p.score_size)))
else:
raise ValueError("Unsupported window type")
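    # The cosine window is the outer product of two 1-D Hanning windows, a
    # [score_size, score_size] map that down-weights candidates far from the
    # previous target position (blended via p.window_influence in update()).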
state_dict['p'] = p
state_dict['avg_chans'] = avg_chans
state_dict['window'] = window
state_dict['target_pos'] = self.target_pos
state_dict['target_sz'] = self.target_sz
state_dict['time'] = time.time() - tic
return state_dict
def TransconverTracker_update(self, x_crops, target_pos, target_sz, window, scale_z, p, debug=False, writer=None):
with torch.no_grad():
# ICRAFT NOTE:
        # Code for exporting to .pt (and ONNX).
# ICRAFT NOTE: PT
x = torch.randn((1,3,288,288))#.cuda()
zf = torch.randn((1,96,8,8))#.cuda()
# x.numpy().astype(np.float32).tofile('icraft/search_1_3_288_288.ftmp')
# zf.numpy().astype(np.float32).tofile('icraft/template_1_96_8_8.ftmp')
global TRACE
if TRACE:
traced = torch.jit.trace(self.net, [x, zf])
torch.jit.save(traced, TRACE_PATH +'ettrack_net2_1x3x288x288_traced.pt')
print('net2 traced')
TRACE = False
sys.exit()
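        # A minimal reload check for the search net (sketch; shapes as exported above):
        #   net2 = torch.jit.load(TRACE_PATH + 'ettrack_net2_1x3x288x288_traced.pt')
        #   cls, reg = net2(torch.randn(1, 3, 288, 288), torch.randn(1, 96, 8, 8))
        #   # cls: [1,1,18,18], reg: [1,4,18,18]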
cls_score, bbox_pred = self.net.forward(x_crops.to(self.params.device), self.net.zf)
# ICRAFT NOTE:
        # Dump the inputs as a quantization calibration set.
# if self.frame_num in [2,502, 1002, 1502, 2002, 2502]:
# x_crops.cpu().contiguous().numpy().astype(np.float32).tofile(f'icraft/calibration/x_{self.frame_num}.ftmp')
# to numpy on cpu
cls_score = torch.sigmoid(cls_score).squeeze().cpu().data.numpy() #[18,18]<-[1,1,18,18]
# bbox to real predict
bbox_pred = bbox_pred.squeeze().cpu().data.numpy()#[4,18,18]<-[1,4,18,18]
pred_x1 = self.grid_to_search_x - bbox_pred[0, ...]
pred_y1 = self.grid_to_search_y - bbox_pred[1, ...]
pred_x2 = self.grid_to_search_x + bbox_pred[2, ...]
pred_y2 = self.grid_to_search_y + bbox_pred[3, ...]
# size penalty
s_c = self.change(self.sz(pred_x2 - pred_x1, pred_y2 - pred_y1) / (self.sz_wh(target_sz))) # scale penalty
r_c = self.change((target_sz[0] / target_sz[1]) / ((pred_x2 - pred_x1) / (pred_y2 - pred_y1))) # ratio penalty
penalty = np.exp(-(r_c * s_c - 1) * p.penalty_k)
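    # penalty equals 1 when the predicted scale and aspect ratio match the previous
    # target (r_c * s_c == 1) and decays exponentially as they diverge; p.penalty_k
    # controls how sharply the deviation is punished.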
pscore = penalty * cls_score
# window penalty
pscore = pscore * (1 - p.window_influence) + window * p.window_influence
# get max
r_max, c_max = np.unravel_index(pscore.argmax(), pscore.shape)
# to real size
pred_x1 = pred_x1[r_max, c_max]
pred_y1 = pred_y1[r_max, c_max]
pred_x2 = pred_x2[r_max, c_max]
pred_y2 = pred_y2[r_max, c_max]
pred_xs = (pred_x1 + pred_x2) / 2
pred_ys = (pred_y1 + pred_y2) / 2
pred_w = pred_x2 - pred_x1
pred_h = pred_y2 - pred_y1
diff_xs = pred_xs - p.instance_size // 2
diff_ys = pred_ys - p.instance_size // 2
diff_xs, diff_ys, pred_w, pred_h = diff_xs / scale_z, diff_ys / scale_z, pred_w / scale_z, pred_h / scale_z
target_sz = target_sz / scale_z
# size learning rate
lr = penalty[r_max, c_max] * cls_score[r_max, c_max] * p.lr
    # print(lr)  # debug output
# size rate
res_xs = target_pos[0] + diff_xs
res_ys = target_pos[1] + diff_ys
res_w = pred_w * lr + (1 - lr) * target_sz[0]
res_h = pred_h * lr + (1 - lr) * target_sz[1]
target_pos = np.array([res_xs, res_ys])
target_sz = target_sz * (1 - lr) + lr * np.array([res_w, res_h])
if debug:
return target_pos, target_sz, cls_score[r_max, c_max], cls_score
else:
return target_pos, target_sz, cls_score[r_max, c_max]
TransconverTracker.initialize = TransconverTracker_initialize
TransconverTracker.update = TransconverTracker_update
# ICRAFT NOTE:
# Override Tracker's get_parameters and create_tracker methods so that the modified modules are used to build the network.
def Tracker_get_parameters(self):
params = TrackerParams()
params.debug = 0
params.visualization = False
params.use_gpu = True
params.checkpoint_epoch = 35
params.net = ET_Tracker(search_size=256,
template_size=128,
stride=16,
e_exemplars=4,
sm_normalization=True,
temperature=2,
dropout=False)
params.big_sz = 288
params.small_sz = 256
params.stride = 16
params.even = 0
params.model_name = 'et_tracker'
params.image_sample_size = 256
params.image_template_size = 128
params.search_area_scale = 5
params.window_influence = 0
params.lr = 0.616
params.penalty_k = 0.007
params.context_amount = 0.5
params.features_initialized = False
return params
def Tracker_create_tracker(self, params):
t = TransconverTracker(params)
t.visdom = self.visdom
return t
Tracker.get_parameters = Tracker_get_parameters
Tracker.create_tracker = Tracker_create_tracker
TRACE_PATH = "../2_compile/fmodel/"
os.makedirs(os.path.dirname(TRACE_PATH), exist_ok=True)
TRACE = True
if __name__ == '__main__':
dataset_name = 'lasot'
tracker_name = 'et_tracker'
tracker_param = 'et_tracker'
visualization=None
debug=None
visdom_info=None
run_id = 2405101501
dataset = get_dataset(dataset_name)
tracker = Tracker(tracker_name, tracker_param, run_id)
    # the ET_Tracker constructor is called inside get_parameters
params = tracker.get_parameters()
visualization_ = visualization
debug_ = debug
if debug is None:
debug_ = getattr(params, 'debug', 0)
if visualization is None:
if debug is None:
visualization_ = getattr(params, 'visualization', False)
else:
visualization_ = True if debug else False
params.visualization = visualization_
params.debug = debug_
params.use_gpu = False
for seq in dataset[:]:
print(seq)
def _results_exist():
if seq.dataset == 'oxuva':
vid_id, obj_id = seq.name.split('_')[:2]
pred_file = os.path.join(tracker.results_dir, '{}_{}.csv'.format(vid_id, obj_id))
return os.path.isfile(pred_file)
elif seq.object_ids is None:
bbox_file = '{}/{}.txt'.format(tracker.results_dir, seq.name)
return os.path.isfile(bbox_file)
else:
bbox_files = ['{}/{}_{}.txt'.format(tracker.results_dir, seq.name, obj_id) for obj_id in seq.object_ids]
missing = [not os.path.isfile(f) for f in bbox_files]
return sum(missing) == 0
visdom_info = {} if visdom_info is None else visdom_info
if _results_exist() and not debug:
print('FPS: {}'.format(-1))
continue
print('Tracker: {} {} {} , Sequence: {}'.format(tracker.name, tracker.parameter_name, tracker.run_id, seq.name))
tracker._init_visdom(visdom_info, debug_)
if visualization_ and tracker.visdom is None:
tracker.init_visualization()
# Get init information
init_info = seq.init_info()
et_tracker = tracker.create_tracker(params)
output = {'target_bbox': [],
'time': [],
'segmentation': [],
'object_presence_score': []}
def _store_outputs(tracker_out: dict, defaults=None):
defaults = {} if defaults is None else defaults
for key in output.keys():
val = tracker_out.get(key, defaults.get(key, None))
if key in tracker_out or val is not None:
output[key].append(val)
# Initialize
image = tracker._read_image(seq.frames[0])
if et_tracker.params.visualization and tracker.visdom is None:
tracker.visualize(image, init_info.get('init_bbox'))
start_time = time.time()
out = et_tracker.initialize(image, init_info)
if out is None:
out = {}
prev_output = OrderedDict(out)
init_default = {'target_bbox': init_info.get('init_bbox'),
'time': time.time() - start_time,
'segmentation': init_info.get('init_mask'),
'object_presence_score': 1.}
_store_outputs(out, init_default)
for frame_num, frame_path in enumerate(seq.frames[1:], start=1):
image = tracker._read_image(frame_path)
start_time = time.time()
info = seq.frame_info(frame_num)
info['previous_output'] = prev_output
out = et_tracker.track(image, info)
prev_output = OrderedDict(out)
_store_outputs(out, {'time': time.time() - start_time})
segmentation = out['segmentation'] if 'segmentation' in out else None
if tracker.visdom is not None:
tracker.visdom_draw_tracking(image, out['target_bbox'], segmentation)
elif et_tracker.params.visualization:
tracker.visualize(image, out['target_bbox'], segmentation)
for key in ['target_bbox', 'segmentation']:
if key in output and len(output[key]) <= 1:
output.pop(key)
output['image_shape'] = image.shape[:2]
output['object_presence_score_threshold'] = et_tracker.params.get('object_presence_score_threshold', 0.55)
sys.stdout.flush()
if isinstance(output['time'][0], (dict, OrderedDict)):
exec_time = sum([sum(times.values()) for times in output['time']])
num_frames = len(output['time'])
else:
exec_time = sum(output['time'])
num_frames = len(output['time'])
print('FPS: {}'.format(num_frames / exec_time))
if not debug:
_save_tracker_output(seq, tracker, output)