stable_diffusion代码运行过程

加载模型以及参数

加载参数

首先在Main函数的最开始，新建argparse对象parser，向parser中输入参数以及模型信息，再将这些信息转化为opt

	arser = argparse.ArgumentParser()
    parser.add_argument(
        "--prompt",
        type=str,
        nargs="?",
        default="a painting of a virus monster playing guitar",
        help="the prompt to render"
    )
    opt = parser.parse_args()
    config = OmegaConf.load(f"{opt.config}")

下面是debug后config的值：

{'model': {'base_learning_rate': 0.0001, 
'target': 'ldm.models.diffusion.ddpm.LatentDiffusion',
 'params': {'linear_start': 0.00085, 'linear_end': 0.012, 'num_timesteps_cond': 1, 'log_every_t': 200, 'timesteps': 1000, 'first_stage_key': 'jpg', 'cond_stage_key': 'txt', 'image_size': 64, 'channels': 4, 'cond_stage_trainable': False, 'conditioning_key': 'crossattn', 'monitor': 'val/loss_simple_ema', 'scale_factor': 0.18215, 'use_ema': False, 
 'personalization_config': {'target': 'ldm.modules.embedding_manager.EmbeddingManager', 'params': {'placeholder_strings': ['*'], 'initializer_words': ['sculpture'], 
 'per_image_tokens': False, 'num_vectors_per_token': 1, 'progressive_words': False}}, 
 'unet_config': {'target': 'ldm.modules.diffusionmodules.openaimodel.UNetModel', 
 'params': {'image_size': 32, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2, 1], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4, 4], 'num_heads': 8, 'use_spatial_transformer': True, 'transformer_depth': 1, 'context_dim': 768, 'use_checkpoint': True, 'legacy': False}}, 
 'first_stage_config': {'target': 'ldm.models.autoencoder.AutoencoderKL',
  'params': {'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, '
  lossconfig': {'target': 'torch.nn.Identity'}}}, 
  'cond_stage_config': {'target': 'ldm.modules.encoders.modules.FrozenCLIPEmbedder'}}}}

我们可以看到，config主要记录了LatentDiffusion的超参数以及模型参数，其中具体包括unet参数，first_stage参数，cond_stage参数，这三者可以认为是LatentDiffusion三个阶段，而这些参数都是parser从yaml文件中（configs/stable-diffusion/v1-inference.yaml）读取的：

model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False

    personalization_config:
      target: ldm.modules.embedding_manager.EmbeddingManager
      params:
        placeholder_strings: ["*"]
        initializer_words: ["sculpture"]
        per_image_tokens: false
        num_vectors_per_token: 1
        progressive_words: False
        
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

加载模型

之后使用下面的代码加载模型，其中调用关系复杂：

model = load_model_from_config(config, f"{opt.ckpt}")

首先会调用load_model_from_config函数，该函数接受config和ckpt，从ckpt文件加载模型的状态字典，并将其加载到根据config文件创建的模型中，还打印了可能存在的缺失或意外的键

def load_model_from_config(config, ckpt, verbose=False):
    print(f"Loading model from {ckpt}")
    #从ckpt文件加载一个字典，其中包含模型的状态字典state_dict
    pl_sd = torch.load(ckpt, map_location="cpu")
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]
    #从配置文件config中实例化一个模型
    model = instantiate_from_config(config.model)
    #使用load_state_dict()方法将模型状态字典sd加载到模型中
    m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print("missing keys:")
        print(m)
    if len(u) > 0 and verbose:
        print("unexpected keys:")
        print(u)

    model.cuda()
    model.eval()
    return model

其中instantiate_from_config函数的具体作用是检查config字典中是否存在名为"target"的键，如果不存在，将检查config是否等于’is_first_stage’或’is_unconditional’，如果"target"键存在于config中，将使用get_obj_from_str()函数根据config[“target”]的值实例化一个对象：

def instantiate_from_config(config, **kwargs):
    if not "target" in config:
        if config == '__is_first_stage__':
            return None
        elif config == "__is_unconditional__":
            return None
        raise KeyError("Expected key `target` to instantiate.")
    return get_obj_from_str(config["target"])(**config.get("params", dict()), **kwargs)

其中涉及get_obj_from_str函数，它根据给定的字符串string实例化一个对象，在debug过程中，string输入为’ldm.models.diffusion.ddpm.LatentDiffusion’，也就是实例化ldm：

def get_obj_from_str(string, reload=False):
    module, cls = string.rsplit(".", 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
    #获取实例属性
    return getattr(importlib.import_module(module, package=None), cls)

由于string输入为’ldm.models.diffusion.ddpm.LatentDiffusion’，module就是’ldm.models.diffusion.ddpm’，就是cls就是LatentDiffusion，也就是说要去’ldm.models.diffusion.ddpm这个类去使用getattr函数取出LatentDiffusion的类属性。那么接下来就是加载ddpm代码：
进入DDPM类后，先进行各种初始化，在其中self.model = DiffusionWrapper(unet_config, conditioning_key)这段代码调用DiffusionWrapper类初始化模型：

传入的参数是unet_config,和conditioning_key

class DiffusionWrapper(pl.LightningModule):
    def __init__(self, diff_model_config, conditioning_key):
        super().__init__()
        #根据传入的unet参数进行实例化
        self.diffusion_model = instantiate_from_config(diff_model_config)
        self.conditioning_key = conditioning_key
        assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm']

    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None):
        if self.conditioning_key is None:
            out = self.diffusion_model(x, t)
        elif self.conditioning_key == 'concat':
            xc = torch.cat([x] + c_concat, dim=1)
            out = self.diffusion_model(xc, t)
        elif self.conditioning_key == 'crossattn':
            cc = torch.cat(c_crossattn, 1)
            out = self.diffusion_model(x, t, context=cc)
        elif self.conditioning_key == 'hybrid':
            xc = torch.cat([x] + c_concat, dim=1)
            cc = torch.cat(c_crossattn, 1)
            out = self.diffusion_model(xc, t, context=cc)
        elif self.conditioning_key == 'adm':
            cc = c_crossattn[0]
            out = self.diffusion_model(x, t, y=cc)
        else:
            raise NotImplementedError()

        return out

进入openaimodel.py文件中的unet类去实例化unet对象，其中unet就是先计算time_embed，再下采样（由ResBlock、AttentionBlock和TimestepEmbedSequential组成），中间层，上采样
接下来ldm函数中的super().__init__(conditioning_key=conditioning_key, *args, **kwargs)会跳转到ddpm类，通过register_schedule计算所需参数

接下来就是处理第一阶段模型，使用self.instantiate_first_stage(first_stage_config)初始化模型参数，其中方法和上述差不多，在进入autoencoder.py中的AutoencoderKL类后，首先初始化encoder和decoder，之后加载必要参数：

    def instantiate_first_stage(self, config):
    	#根据config字典来实例化一个模型（model）
        model = instantiate_from_config(config)
        #将实例化后的模型赋值给self.first_stage_model
        self.first_stage_model = model.eval()
        self.first_stage_model.train = disabled_train
        #禁用参数的梯度计算
        for param in self.first_stage_model.parameters():
            param.requires_grad = False

在之后进行初始化cond_stage模型，还是调用self.instantiate_cond_stage(cond_stage_config)，这次到get_obj_from_str方法时的string=‘ldm.modules.encoders.modules.FrozenCLIPEmbedder’，也就是要去’FrozenCLIPEmbedder’中提取属性
在FrozenCLIPEmbedder类中我们可以设置预训练的参数，

class FrozenCLIPEmbedder(AbstractEncoder):
    """Uses the CLIP transformer encoder for text (from Hugging Face)"""
    def __init__(self, version="clip-vit-large-patch14", device="cuda", max_length=77):
        super().__init__()
        self.tokenizer = CLIPTokenizer.from_pretrained(version)
        self.transformer = CLIPTextModel.from_pretrained(version)
        self.device = device
        self.max_length = max_length

进行到这里以及完成了ldm模型的加载，下面进行加载采样器：

sampler = DDIMSampler(model)

class DDIMSampler(object):
    def __init__(self, model, schedule="linear", **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule

接下来将空提示词加载为条件向量：

if opt.scale != 1.0:
	uc = model.get_learned_conditioning(batch_size * [""])

这条命令执行会调用以下函数：

	#其中c=['', '', '']
    def get_learned_conditioning(self, c):
        if self.cond_stage_forward is None:
        	#有encode
            if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode):
            	#将c encode一下
                c = self.cond_stage_model.encode(c, embedding_manager=self.embedding_manager)
                if isinstance(c, DiagonalGaussianDistribution):
                    c = c.mode()
            else:
                c = self.cond_stage_model(c)
        else:
            assert hasattr(self.cond_stage_model, self.cond_stage_forward)
            c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
        return c

接下来对于encode会调用

    def encode(self, text, **kwargs):
        return self(text, **kwargs)

转到clip的前向传播函数：

    def forward(self, text, **kwargs):
    	#按照self.max_length的大小对['', '', '']进行编码
        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                        return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
        #取出token
        tokens = batch_encoding["input_ids"].to(self.device)
        #调用transformer的前向函数        
        z = self.transformer(input_ids=tokens, **kwargs)

        return z