PeftModelForCausalLM(
(base_model): LoraModel(
(model): Florence2ForConditionalGeneration(
(vision_tower): DaViT(
(convs): ModuleList(
(0): ConvEmbed(
(proj): Conv2d(3, 256, kernel_size=(7, 7), stride=(4, 4), padding=(3, 3))
(norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
(1): ConvEmbed(
(proj): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
(2): ConvEmbed(
(proj): Conv2d(512, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(3): ConvEmbed(
(proj): Conv2d(1024, 2048, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(blocks): ModuleList(
(0): MySequential(
(0): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=256, out_features=768, bias=True)
(proj): Linear(in_features=256, out_features=256, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): Identity()
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256)
)
)
(ffn): PreNorm(
(norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=256, out_features=1024, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=256, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=256, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): Identity()
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=256, out_features=768, bias=True)
(proj): Linear(in_features=256, out_features=256, bias=True)
)
(drop_path): DropPath(drop_prob=0.004)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256)
)
)
(ffn): PreNorm(
(norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=256, out_features=1024, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=256, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=256, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.004)
)
)
)
)
(1): MySequential(
(0): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=512)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.009)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=512)
)
)
(ffn): PreNorm(
(norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=512, out_features=2048, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=512, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=512, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.009)
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=512)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(drop_path): DropPath(drop_prob=0.013)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=512)
)
)
(ffn): PreNorm(
(norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=512, out_features=2048, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=512, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=512, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.013)
)
)
)
)
(2): MySequential(
(0): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.017)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.017)
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(drop_path): DropPath(drop_prob=0.022)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.022)
)
)
)
(1): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.026)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.026)
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(drop_path): DropPath(drop_prob=0.030)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.030)
)
)
)
(2): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.035)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.035)
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(drop_path): DropPath(drop_prob=0.039)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.039)
)
)
)
(3): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.043)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.043)
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(drop_path): DropPath(drop_prob=0.048)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.048)
)
)
)
(4): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.052)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.052)
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(drop_path): DropPath(drop_prob=0.057)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.057)
)
)
)
(5): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.061)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.061)
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(drop_path): DropPath(drop_prob=0.065)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.065)
)
)
)
(6): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.070)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.070)
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(drop_path): DropPath(drop_prob=0.074)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.074)
)
)
)
(7): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.078)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.078)
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(drop_path): DropPath(drop_prob=0.083)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.083)
)
)
)
(8): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.087)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.087)
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(drop_path): DropPath(drop_prob=0.091)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=1024)
)
)
(ffn): PreNorm(
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.091)
)
)
)
)
(3): MySequential(
(0): MySequential(
(spatial_block): SpatialBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(2048, 2048, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=2048)
)
)
(window_attn): PreNorm(
(norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
(fn): WindowAttention(
(qkv): Linear(in_features=2048, out_features=6144, bias=True)
(proj): Linear(in_features=2048, out_features=2048, bias=True)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.096)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(2048, 2048, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=2048)
)
)
(ffn): PreNorm(
(norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=2048, out_features=8192, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=8192, out_features=2048, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=8192, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2048, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.096)
)
)
(channel_block): ChannelBlock(
(conv1): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(2048, 2048, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=2048)
)
)
(channel_attn): PreNorm(
(norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
(fn): ChannelAttention(
(qkv): Linear(in_features=2048, out_features=6144, bias=True)
(proj): Linear(in_features=2048, out_features=2048, bias=True)
)
(drop_path): DropPath(drop_prob=0.100)
)
(conv2): PreNorm(
(fn): DepthWiseConv2d(
(dw): Conv2d(2048, 2048, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=2048)
)
)
(ffn): PreNorm(
(norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
(fn): Mlp(
(net): Sequential(
(fc1): Linear(in_features=2048, out_features=8192, bias=True)
(act): GELU(approximate='none')
(fc2): lora.Linear(
(base_layer): Linear(in_features=8192, out_features=2048, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=8192, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2048, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
(drop_path): DropPath(drop_prob=0.100)
)
)
)
)
)
(avgpool): AdaptiveAvgPool1d(output_size=1)
)
(image_proj_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(image_pos_embed): LearnedAbsolutePositionEmbedding2D(
(row_embeddings): Embedding(50, 1024)
(column_embeddings): Embedding(50, 1024)
)
(visual_temporal_embed): PositionalEmbeddingCosine1D()
(language_model): Florence2LanguageForConditionalGeneration(
(model): Florence2LanguageModel(
(shared): Embedding(51289, 1024, padding_idx=1)
(encoder): Florence2Encoder(
(embed_tokens): Florence2ScaledWordEmbedding(51289, 1024, padding_idx=1)
(embed_positions): Florence2LearnedPositionalEmbedding(1026, 1024)
(layers): ModuleList(
(0-11): 12 x Florence2EncoderLayer(
(self_attn): Florence2SdpaAttention(
(k_proj): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(v_proj): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(q_proj): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(out_proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(activation_fn): GELUActivation()
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(decoder): Florence2Decoder(
(embed_tokens): Florence2ScaledWordEmbedding(51289, 1024, padding_idx=1)
(embed_positions): Florence2LearnedPositionalEmbedding(1026, 1024)
(layers): ModuleList(
(0-11): 12 x Florence2DecoderLayer(
(self_attn): Florence2SdpaAttention(
(k_proj): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(v_proj): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(q_proj): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(out_proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(activation_fn): GELUActivation()
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(encoder_attn): Florence2SdpaAttention(
(k_proj): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(v_proj): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(q_proj): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(out_proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(fc2): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=1024, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=1024, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
(lm_head): lora.Linear(
(base_layer): Linear(in_features=1024, out_features=51289, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1024, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=51289, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
)
)
)
)
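
The tree above is the output of print(model) for a Florence-2 model wrapped with a PEFT LoRA adapter: each targeted Linear is replaced by a lora.Linear that keeps the frozen base_layer and adds a trainable rank-8 lora_A/lora_B pair. Several hyperparameters can be read directly off the printout: r=8 (every lora_A projects down to 8 features), lora_dropout=0.05, bias="none" (all lora_A/lora_B layers print bias=False), DoRA disabled (the lora_magnitude_vector dicts are empty), and the targeted names are q_proj, k_proj, v_proj, fc2, and lm_head. The following is a minimal sketch of how such a model could be constructed; the checkpoint id and lora_alpha are assumptions, since neither is recoverable from a module repr.

from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Assumption: the microsoft/Florence-2-large checkpoint; Florence-2 ships
# custom modeling code, hence trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large",
    trust_remote_code=True,
)

lora_config = LoraConfig(
    r=8,                # lora_A out_features=8 everywhere in the printout
    lora_alpha=8,       # assumption: the scaling factor is not shown in a repr
    lora_dropout=0.05,  # Dropout(p=0.05) in every lora_dropout ModuleDict
    bias="none",        # all lora_A/lora_B matrices print bias=False
    # Matches the lora.Linear wrappers above: q/k/v projections and fc2 in the
    # BART-style encoder/decoder, fc2 inside the DaViT MLPs, and the LM head.
    target_modules=["q_proj", "k_proj", "v_proj", "fc2", "lm_head"],
    task_type="CAUSAL_LM",  # yields the PeftModelForCausalLM wrapper on top
)

peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
print(peft_model)  # prints a module tree like the one above

Note that PEFT matches target_modules by name suffix anywhere in the model tree, which is why "fc2" picks up both the language-model MLPs and the fc2 layers inside the DaViT vision tower, while the vision tower's fused qkv and proj layers are left untouched.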