@@ -660,9 +660,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
660
660
ggml_tensor * gpt_neox_ff (
661
661
const gpt_neox_block &block,
662
662
ggml_context * ctx0,
663
- ggml_tensor * inp) {
663
+ ggml_tensor * inp,
664
+ const gpt_neox_hparams &hparams) {
664
665
665
- ggml_tensor * cur = ggml_norm (ctx0, inp);
666
+ ggml_tensor * cur = ggml_norm (ctx0, inp, hparams. norm_eps );
666
667
667
668
cur = ggml_add (ctx0, ggml_mul (ctx0, ggml_repeat (ctx0, block.ln_2_g , cur), cur), ggml_repeat (ctx0, block.ln_2_b , cur));
668
669
cur = ggml_mul_mat (ctx0, block.c_mlp_fc_w , cur);
@@ -753,7 +754,7 @@ bool gpt_neox_eval(
753
754
// self-attention
754
755
{
755
756
{
756
- cur = ggml_norm (ctx0, inpL);
757
+ cur = ggml_norm (ctx0, inpL, hparams. norm_eps );
757
758
758
759
cur = ggml_add (ctx0,
759
760
ggml_mul (ctx0, ggml_repeat (ctx0, model.blocks [il].ln_1_g , cur), cur),
@@ -844,7 +845,7 @@ bool gpt_neox_eval(
844
845
if (hparams.par_res == 0 ) {
845
846
struct ggml_tensor * inpFF = ggml_add (ctx0, cur, inpL);
846
847
847
- cur = gpt_neox_ff (model.blocks [il], ctx0, inpFF);
848
+ cur = gpt_neox_ff (model.blocks [il], ctx0, inpFF, hparams );
848
849
849
850
// input for next layer
850
851
inpL = ggml_add (ctx0, cur, inpFF);
@@ -853,7 +854,7 @@ bool gpt_neox_eval(
853
854
854
855
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
855
856
// note here we pass inpL instead of cur
856
- cur = gpt_neox_ff (model.blocks [il], ctx0, inpL);
857
+ cur = gpt_neox_ff (model.blocks [il], ctx0, inpL, hparams );
857
858
858
859
// layer input + FF
859
860
cur = ggml_add (ctx0, cur, inpFF);
@@ -867,7 +868,7 @@ bool gpt_neox_eval(
867
868
868
869
// norm
869
870
{
870
- inpL = ggml_norm (ctx0, inpL);
871
+ inpL = ggml_norm (ctx0, inpL, hparams. norm_eps );
871
872
872
873
// inpL = ln_f_g*inpL + ln_f_b
873
874
inpL = ggml_add (ctx0,
0 commit comments