Skip to main content

Table 1 Regular expression patterns for the detection of residue mentions in text. The patterns recognise a single wild-type residue site (SITE), multiple wild-type residue sites (SITES), a sequence range or residue pair (RANGE/PAIR), and a point mutation (MUTATION). The set covers abbreviated notations of residues as well as grammatical expressions found in text.

From: Annotation of protein residues based on a literature analysis: cross-validation against UniProtKb

RANGE-TO

= ("-"+ ("to" "-"+)? | "to");

CONVERT-TO

= ("to"|"-"+">"?);

XAA

= ("X"|"XAA"|"xaa");

POS

= [1-9][0-9]*;

RESN1

= [ARNDCQEGHILKMFPSTWYVOUBZX];

RESN3

= ([aA]la|ALA | [aA]rg|ARG | [aA]sn|ASN | [aA]sp|ASP | [cC]ys|CYS

 

   | [gG]ln|GLN | [gG]lu|GLU | [gG]ly|GLY | [hH]is|HIS | [iI]le|ILE

 

   | [lL]eu|LEU | [lL]ys|LYS | [mM]et|MET | [pP]he|PHE | [pP]ro|PRO

 

   | [sS]er|SER | [tT]hr|THR | [tT]rp|TRP | [tT]yr|TYR | [vV]al|VAL

 

   | [pP]yl|PYL | [sS]ec|SEC | [aA]sx|ASX | [gG]lx|GLX | [xX]aa|XAA);

RESNF

= ([aA]lanine | [aA]rginine | [aA]sparagine | [aA]spart(ate|ic acid) | [cC]ysteine

 

   | [gG]lutamine | [gG]lutam(ate|ic acid) | [gG]lycine | [hH]istidine | [iI]soleucine

 

   | [lL]eucine | [lL]ysine | [mM]ethionine | [pP]henylalanine | [pP]roline

 

   | [sS]erine | [tT]hreonine | [tT]ryptophan | [tT]yrosine | [vV]aline

 

   | [pP]yrrolysine | [sS]elenocysteine | [aA]spartic acid or [aA]sparagine

 

   | [gG]lutamic acid or [gG]lutamine);

SITE

= ((RESN3 | RESNF) POS "residue"?

 

   | (RESN3 | RESNF)"-"+ POS "residue"?

 

   | (RESN3 | RESNF)"residue"? "at position"? POS "residue"?

 

   | (RESN3 | RESNF)"("POS")" "residue"?

 

   |"amino acid"? "residue" "at position"? POS

 

   |"amino acid" "residue"? "at position"? POS

 

   | RESNF "residue" POS);

SITES

= (RESNF"s"((","|"and"|"or") RESNF"s")*

 

   | RESNF"s"? ("at position" "s"?)? (","|"and"|"or") (("at position" "s"?)? (","|"and"|"or") POS)+

 

   | RESNF "residue" "s"?

 

   | RESN3 "residue" "s"? ("at position" "s"?)? POS (("at position" "s"?)? (","|"and" | "or") POS)+

 

   | RESN3 "residue" "s"?

 

   |"residue" "s"? ("at position" "s"?)? POS ((","|"and"|"or") POS)+

 

   | (RESN3 | RESNF)"for"(RESN3 | RESNF)"at position"POS ((","|"and"|"or") POS)+

 

   | RESNF ("," | "and" | "or") POS)* "residue" "s"?);

RANGE/PAIR

= ("residue" "s"? ("," | "and" | "or") RANGE-TO POS)+

 

   |"amino acid" "residue"? "s"? ("," | "and" | "or") RANGE-TO POS)+

 

   | ("residue" "s"?)? "at position" "s"? ("," | "and" | "or") RANGE-TO POS)+

 

   | RESI RANGE-TO RESI);

MUTATION

= (RESN1 POS RESN1

 

   | RESN1 "-" POS "-" RESN1

 

   | RESN1 "(" POS ")" RESN1

 

   | RESI CONVERT-TO (RESN3 | RESNF)

 

   | RESI RESN3

 

   |"from" (RESNF | RESN3) CONVERT-TO (RESNF | RESN3)"at position"POS

 

   | (RESN3 | RESNF) "for" (RESN3 | RESNF) "at position" POS

 

   | RESI ("-"+ | CONVERT-TO) RESI "substitution");