import { EntryBody } from "../components/Entry/EntryBody";

/**
 * Entry data for the "Notes on Memorisation in NLP" post.
 *
 * Fields:
 * - `id`    — string identifier, equal to the exported constant's name.
 * - `title` — JSX fragment with the entry title.
 * - `date`  — human-readable date string.
 * - `Body`  — an `EntryBody` element whose `paragraphs` prop is an ordered
 *             array of `<div>` elements, one per paragraph of the notes.
 *
 * The empty `<div>` elements between paragraphs presumably act as blank-line
 * spacers when rendered — NOTE(review): confirm against `EntryBody`.
 */
export const notes_on_memorisation: { [id: string]: any } = {
  id: "notes_on_memorisation",
  title: <>Notes on Memorisation in NLP</>,
  date: "August 2023",

  Body: (
    <EntryBody
      paragraphs={[
        <div className="font-mono"></div>,

        <div className="font-mono">
          These are some notes taken on papers about memorisation in NLP
          systems, just to have them around.
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">----------------------- Finding Memo</div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          counterfactual memorisation (CM) ~ a metric from computer vision
          (Feldman and Zhang, 2020) that assigns high values to examples a model
          can only predict correctly if they are in the training set
        </div>,

        <div className="font-mono">
          -&gt; estimate the memorization value of a training sample by training
          multiple models on different random subsets of the training data and
          then measuring the deviation in the sample’s classification accuracy
          under inclusion/exclusion
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          extractive memorization ~ exact training data generation under
          insufficient context
        </div>,

        <div className="font-mono">
          -&gt; unconstrained generation: a training string s is extractable if
          there exists a prefix c that could exactly generate s under an
          appropriate sampling strategy (e.g. greedy decoding)
        </div>,

        <div className="font-mono">
          -&gt; constrained generation (NMT): this definition labels an input
          sentence (source) as being memorized if its transduction (translation)
          could be replicated exactly with a prefix considerably shorter than
          the length of the full input sentence (source), under greedy decoding.
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          On the Finding Memo paper (Table 2) we see this example from the
          training data, found through the extractive memorisation algorithm:
          Why study in Peru? Spanish Courses -&gt; Warum in Peru studieren?
        </div>,

        <div className="font-mono">
          When the suffix is perturbed, we have mistranslations: Why study in
          Peru? University Courses -&gt; Warum in Peru studieren?
        </div>,

        <div className="font-mono">
          When the prefix is perturbed, translations are correct: You study in
          Peru? Spanish Courses -&gt; Sie studieren in Peru? Spanischkurse
        </div>,

        <div className="font-mono">
          But isn’t this a case of correct generalisation from an incorrect
          datapoint? The suffix perturbations are correct generalisations from
          the training sample, if we assume it is true.
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          In Table 8 of the Annex, there seem to be examples of another kind of
          memorisation, for example:
        </div>,

        <div className="font-mono">
          (training data) Madam President, Commissioner, ladies and gentlemen
          -&gt; Frau Präsidentin, Herr Kommissar, meine Damen und Herren
        </div>,

        <div className="font-mono">
          (wrong translation) Madam President, Commissioner, ladies and doctors
          -&gt; Frau Präsidentin, Herr Kommissar, meine Damen und Herren
        </div>,

        <div className="font-mono">
          This seems to be a case where the language modeling moment of
          translation "takes over" and the translation becomes detached from the
          source, which would put it in the domain of hallucinations. Most
          likely the original sentence appears many times in the training data,
          which causes this behaviour (like they show on raunak on
          hallucinations (?)).
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          ----------------------- Quantifying Memorization Across Neural
          Language Models
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          Memorization significantly grows as we increase (1) the capacity of a
          model, (2) the number of times an example has been duplicated, and (3)
          the number of tokens of context used to prompt the model.
          Surprisingly, we find the situation becomes more complicated when
          generalizing these results across model families.
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          Definition 3.1. A string s is extractable with k tokens of context
          from a model f if there exists a (length-k) string p, such that the
          concatenation [p || s] is contained in the training data for f, and f
          produces s when prompted with p using greedy decoding.
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          They claim the GPT2 baseline on Fig1(a) "allows to establish what
          fraction of the training data is sufficiently “easy” that any language
          model can correctly predict the 50-token suffix, even if the example
          has not been seen during training". But the behaviour of GPT2 models
          in this biased Pile subset of repetitions & long sequences is
          similar to the Neo models on a random sample of its own training data.
          Aren’t they just saying "a model has better performance on its own
          training data"? And because they test on long sequences (sparse) &
          repeated sentences (dense), the conditioned probabilities will be very
          strong in both cases. I do not see why call this behaviour pathologic.
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          Consider the probability of generating x after sentence s: p(x|s) =
          p(sx)/p(s) ~ 1 when p(sx) ~ p(s), when does this equality (more or
          less) hold? We can estimate it from the training data.
        </div>,

        <div className="font-mono">
          For long sentences, let’s say that they are only once in the training
          data, then p(sx) = p(s) and p(x|s) = 1, BAM memorized D:
        </div>,

        <div className="font-mono">
          For sequences repeated multiple times in the training data, even if sy
          and sz are also present, since sx is repeated more times, instead of
          p(sx)=1/3 * p(s) (here i assume independence which i think its fine)
          we will have (if repeated 8 times & the other 1), p(sx)=8/10 * p(s) &
          therefore p(x|s) = 8/10*p(s)/p(s) ~ 1, greatly overestimating p(x|s)
          if sx were not repeated in the data. Note that just one duplication
          would double the (theoretical estimated) prob of generating sx (if sy,
          sz, ... are only once in the data)
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          Seems like these memorisation behaviours are just the result of the
          model doing a better job at modeling the underlying conditional
          probability distribution in the training data, which is problematic
          when the training data is of bad quality. Smaller models may not have
          the capacity to store these parts of the distribution (which are a bit
          OOD if you like, most examples are not too sparse or too dense, but
          rather in the middle).
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          Based on this, and understanding generalizing by having a better
          conditional probability distribution of the training data, I say that
          this claim: "larger models are indeed memorizing more data, and not
          simply generalizing better" is misguided to say the least.
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          Is this still a research question: "are no known techniques to
          identify the tail of memorized data without conditioning the model
          with a large context"
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          We find that using beam search with 100 beams results in marginally
          more extracted memorization
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          Experiments on T5: Surprisingly, while a scaling trend does hold here
          as well, the absolute memorization in masked models is an order of
          magnitude lower than for comparably sized causal language models.
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          This is interesting, because the modeling approach is different, the
          conditional probability argument presented before may not be fit for
          MLM, because: (1) there is context in both sides (2) the model
          completes more than 1 token at a time (15%) nonono this is not so,
          continue reading:
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          I think this "order of magnitude" is because they are evaluating on
          things that are not
          training examples. For the model, while training, the training example
          is not s, but s with a specific 15% masked. Therefore, if they do not
          choose one of the 0.85|s|+1 substrings that the model saw during
          training (which may be more than one), the sample is not a training
          sample. By their own definition, it is not memorisation (at least in
          the cases where they are not training samples, and this is not
          guaranteed in their method). And because it is a matter of chance, it
          seems logical that, even if the memorization rates of the MLM were
          the same as the causal model, the results will be much lower. Still,
          in the case of repetition you may get more memorization just because
          it is more likely to hit actual training examples in the sampling they
          do (as if it is repeated it has been seen more times by the model).
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          The prob of hitting a training sample may be something like:
          #repetitions_in_training_data * #epochs * 1/(0.85|s|+1) (capped at 1,
          this is an upper bound that assumes no repetition in the sampling of
          substrings during training)
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          If we look at the T5 paper, we actually see that: Note that 2^35
          tokens only covers a fraction of the entire C4 data set, so we never
          repeat any data during pre-training
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          ----------------------- Memorisation versus Generalisation in
          Pre-trained Language Models
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">Study of BERT training for NER.</div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          During the second phase, BERT completely ignores the noisy tokens and
          correctly misclassifies them, performing “worse” than a random
          classifier. The step-like improvements during the third stage show
          that the model is unable to learn any patterns from the noise and
          improves by repeatedly optimising on the same examples, gradually
          memorising them.
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          We also hypothesise that due to the robustness to noise shown in the
          second phase of training, a noise detector can be constructed based
          only on BERT’s training losses, without requiring any other
          information. We find that a simple detector that clusters the losses
          using k-means reliably achieves over 90% noise-detection F1 score in
          all our experiments, further showing how the model is able to actively
          detect and reject single noisy examples
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          The above properties can mostly be attributed to BERT’s pre-training
          process—after large-scale optimisation as a language model, the
          network is primed for learning general patterns and better able to
          ignore individual noisy examples. We find that a randomly initialised
          model with the same architecture does not only achieve lower overall
          performance but crucially does not exhibit BERT’s distinct second
          phase
        </div>,

        <div className="font-mono"></div>,

        <div className="font-mono">--- 11.08.2023 Tsz</div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          &gt; Estimating CM using checkpoints along training trajectory
        </div>,

        <div className="font-mono">&gt; Memorisation attractor</div>,

        <div className="font-mono">&gt; Effect on distillation</div>,

        <div className="font-mono">
          &gt; Representation collapse/degeneration/attractors
        </div>,
        <div></div>,

        <div className="font-mono">Relevant Papers:</div>,

        <div className="font-mono"></div>,

        <div className="font-mono">
          &gt; Finding Memo: Extractive Memorization in Constrained Sequence
          Generation Tasks (Vikas Raunak, Arul Menezes) 2023
        </div>,

        <div className="font-mono">
          &gt; Quantifying Memorization Across Neural Language Models (Carlini)
          2022
        </div>,
      ]}
    />
  ),
};
