import math
import torch

mean, var = 0., 0.
for i in range(10000):
    x = torch.randn(512)
    a = torch.randn(512, 512)
    y = a @ x
    mean += y.mean().item()
    var += y.pow(2).mean().item()
print(mean/10000, math.sqrt(var/10000))
# 0.00889449315816164 22.629779825053976
print(math.sqrt(512))
# 22.627416997969522
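This matches a direct variance calculation. Each output element is a sum of 512 products of independent standard normal variables,

$$y_i = \sum_{j=1}^{512} a_{ij} x_j, \qquad \mathrm{Var}(y_i) = \sum_{j=1}^{512} \mathrm{Var}(a_{ij})\,\mathrm{Var}(x_j) = 512,$$

so the standard deviation of the output is $\sqrt{512} \approx 22.63$, just as the simulation shows.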
mean, var = 0., 0.
for i in range(10000):
    x = torch.randn(512)
    a = torch.randn(512, 512)
    b = torch.randn(512, 512)
    y = a @ x
    z = b @ y
    mean += z.mean().item()
    var += z.pow(2).mean().item()
print(mean/10000, math.sqrt(var/10000))
# 0.6010947234869003 511.8684602024235
mean, var = 0., 0.
for i in range(10000):
    x = torch.randn(512)
    a = torch.randn(512, 512) / math.sqrt(512)
    y = a @ x
    mean += y.mean().item()
    var += y.pow(2).mean().item()
print(mean/10000, math.sqrt(var/10000))
# 0.00039810733370250094 1.0007971983717594
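Scaling the weights by $1/\sqrt{512}$ reduces each weight's variance to $1/512$, so the output variance becomes $512 \cdot \tfrac{1}{512} \cdot 1 = 1$: the layer now preserves the standard deviation of its input, matching the result above.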
x = torch.randn(512)
for i in range(100):
    a = torch.randn(512, 512) / math.sqrt(512)
    x = a @ x
print(x.mean(), x.std())
# tensor(-0.0048) tensor(1.2810)
# sigmoid
x = torch.randn(512)
for i in range(100):
    a = torch.randn(512, 512) / math.sqrt(512)
    x = torch.sigmoid(a @ x)
print(x.mean(), x.std())
# tensor(0.5057) tensor(0.1180)
# tanh
x = torch.randn(512)
for i in range(100):
    a = torch.randn(512, 512) / math.sqrt(512)
    x = torch.tanh(a @ x)
print(x.mean(), x.std())
# tensor(-0.0051) tensor(0.0879)
x = torch.randn(512)
for i in range(100):
    a = torch.Tensor(512, 512).uniform_(-1, 1) / math.sqrt(512)
    x = torch.tanh(a @ x)
print(x.mean(), x.std())
# tensor(-3.8077e-26) tensor(1.2476e-24)
x = torch.randn(512)
for i in range(100):
    a = torch.Tensor(512, 512).uniform_(0, 1) / math.sqrt(512)
    x = torch.tanh(a @ x)
print(x.mean(), x.std())
# tensor(-1.) tensor(0.)
x = torch.randn(512)
for i in range(100):
    a = torch.Tensor(512, 512).uniform_(0, 1) / math.sqrt(512)
    x = torch.sigmoid(a @ x)
print(x.mean(), x.std())
# tensor(1.0000) tensor(3.8114e-06)
x = torch.randn(512)
for i in range(100):
    a = torch.Tensor(512, 512).uniform_(-1, 1) / math.sqrt(512)
    x = torch.sigmoid(a @ x)
print(x.mean(), x.std())
# tensor(0.4934) tensor(0.0659)
In every case the variance turns out surprisingly small (or the activations saturate outright). With activations like these, the network can hardly learn any useful features.
To address this, Glorot and Bengio proposed the Xavier initialization scheme.
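The experiments below call a helper xavier(m, h) that is not defined in this snippet. A minimal sketch, assuming the standard Xavier uniform formula with bound sqrt(6 / (fan_in + fan_out)):

# Minimal sketch of a Xavier-uniform initializer (Glorot & Bengio):
# draw from U(-1, 1) and scale by sqrt(6 / (fan_in + fan_out)).
def xavier(fan_in, fan_out):
    return torch.Tensor(fan_in, fan_out).uniform_(-1, 1) * math.sqrt(6. / (fan_in + fan_out))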
x = torch.randn(512)
for i in range(100):
    a = xavier(512, 512)
    x = torch.tanh(a @ x)
print(x.mean(), x.std())
# tensor(0.0854) tensor(0.9933)

x = torch.randn(512)
for i in range(100):
    a = xavier(512, 512)
    x = torch.sigmoid(a @ x)
print(x.mean(), x.std())
# tensor(0.4686) tensor(0.4976)
Kaiming Initialization
Recently, ReLU has become the activation function of choice in computer vision. How do the initialization methods above behave with this activation?
x = torch.randn(512)
for i in range(100):
    a = torch.randn(512, 512) / math.sqrt(512)
    x = torch.relu(a @ x)
print(x.mean(), x.std())
# tensor(4.6656e-16) tensor(6.7154e-16)
x = torch.randn(512)
for i in range(100):
    a = xavier(512, 512)
    x = torch.relu(a @ x)
print(x.mean(), x.std())
# tensor(nan) tensor(nan)
The previous initialization schemes no longer work with ReLU. So what actually happens at a single layer?
mean, var = 0., 0.
for i in range(10000):
    x = torch.randn(512)
    a = torch.randn(512, 512) / math.sqrt(512)
    y = torch.relu(a @ x)
    mean += y.mean().item()
    var += y.pow(2).mean().item()
print(mean/10000, math.sqrt(var/10000))
mean, var = 0., 0.
for i in range(10000):
    x = torch.randn(512)
    a = torch.randn(512, 512) / math.sqrt(512/2.)
    y = torch.relu(a @ x)
    mean += y.mean().item()
    var += y.pow(2).mean().item()
print(mean/10000, math.sqrt(var/10000))
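Why divide by sqrt(512/2)? ReLU zeroes out the negative half of a zero-mean input, so E[relu(y)^2] = Var(y)/2: every ReLU layer cuts the variance in half. Compensating with weights of standard deviation sqrt(2/fan_in) restores unit variance per layer; this is the Kaiming (He et al.) initialization for ReLU networks. Below is a minimal sketch of the 100-layer ReLU experiment repeated with this scaling (the expected behaviour is noted in a comment, not a measured output from the original post):

# Kaiming-style scaling for ReLU: weight std = sqrt(2 / fan_in).
x = torch.randn(512)
for i in range(100):
    a = torch.randn(512, 512) * math.sqrt(2. / 512)
    x = torch.relu(a @ x)
print(x.mean(), x.std())
# With this scaling the activations should stay on the order of 1
# after 100 layers instead of vanishing or blowing up.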