1
2
3
4
5
6
if group['use_grams']:
            # "Grams: Gradient Descent with Adaptive Momentum Scaling": https://arxiv.org/abs/2412.17107
            # Grams-style step: the magnitude of the step comes from the combined
            # (y - z) / update term, while its elementwise sign comes from `update`.
            # NOTE(review): `y`, `z`, `ckp1`, `update`, `dlr` and `xy_step` are
            # defined outside this snippet; confirm their roles against the caller.
            #
            # Candidate step: ckp1 * (y - z) + (dlr * xy_step) * update.
            # `(y - z)` produces a new tensor, so the in-place mul_/add_ that
            # follow modify only that temporary, not `y` or `z`.
            u = (y - z).mul_(ckp1).add_(update, alpha=dlr * xy_step)
            # Keep |u| elementwise but take the sign from `update`. torch.sign
            # returns 0 where `update` is 0, so those elements get a zero step.
            u.copy_(torch.sign(update) * u.abs())
            # Apply the step to `y` in place.
            y.sub_(u)
            # Release the local reference to the temporary tensor.
            del u
Edit Report
Pub: 31 Jan 2025 18:24 UTC
Views: 59