Skip to main content

Table 1 Regular expression patterns for the detection of residue mentions in text. The patterns recognise single (SITE) or multiple wild-type residue sites (SITES), a sequence range or residue pair (RANGE/PAIR), and point mutations (MUTATION). The set covers abbreviated notations of residues as well as grammatical expressions found in text.

From: Annotation of protein residues based on a literature analysis: cross-validation against UniProtKb

RANGE-TO = ("-"+ ("to" "-"+)? | "to");
CONVERT-TO = ("to"|"-"+">"?);
XAA = ("X"|"XAA"|"xaa");
POS = [1-9][0-9]*;
RESN1 = [ARNDCQEGHILKMFPSTWYVOUBZX];
RESN3 = ([aA]la|ALA | [aA]rg|ARG | [aA]sn|ASN | [aA]sp|ASP | [cC]ys|CYS
     | [gG]ln|GLN | [gG]lu|GLU | [gG]ly|GLY | [hH]is|HIS | [iI]le|ILE
     | [lL]eu|LEU | [lL]ys|LYS | [mM]et|MET | [pP]he|PHE | [pP]ro|PRO
     | [sS]er|SER | [tT]hr|THR | [tT]rp|TRP | [tT]yr|TYR | [vV]al|VAL
     | [pP]yl|PYL | [sS]ec|SEC | [aA]sx|ASX | [gG]lx|GLX | [xX]aa|XAA);
RESNF = ([aA]lanine | [aA]rginine | [aA]sparagine | [aA]spart(ate|ic acid) | [cC]ysteine
     | [gG]lutamine | [gG]lutam(ate|ic acid) | [gG]lycine | [hH]istidine | [iI]soleucine
     | [lL]eucine | [lL]ysine | [mM]ethionine | [pP]henylalanine | [pP]roline
     | [sS]erine | [tT]hreonine | [tT]ryptophan | [tT]yrosine | [vV]aline
     | [pP]yrrolysine | [sS]elenocysteine | [aA]spartic acid or [aA]sparagine
     | [gG]lutamic acid or [gG]lutamine);
SITE = ((RESN3 | RESNF) POS "residue"?
     | (RESN3 | RESNF)"-"+ POS "residue"?
     | (RESN3 | RESNF)"residue"? "at position"? POS "residue"?
     | (RESN3 | RESNF)"("POS")" "residue"?
     |"amino acid"? "residue" "at position"? POS
     |"amino acid" "residue"? "at position"? POS
     | RESNF "residue" POS);
SITES = (RESNF"s"((","|"and"|"or") RESNF"s")*
     | RESNF"s"? ("at position" "s"?)? (","|"and"|"or") (("at position" "s"?)? (","|"and"|"or") POS)+
     | RESNF "residue" "s"?
     | RESN3 "residue" "s"? ("at position" "s"?)? POS (("at position" "s"?)? (","|"and" | "or") POS)+
     | RESN3 "residue" "s"?
     |"residue" "s"? ("at position" "s"?)? POS ((","|"and"|"or") POS)+
     | (RESN3 | RESNF)"for"(RESN3 | RESNF)"at position"POS ((","|"and"|"or") POS)+
     | RESNF (("," | "and" | "or") POS)* "residue" "s"?);
RANGE/PAIR = ("residue" "s"? ("," | "and" | "or") RANGE-TO POS)+
     |"amino acid" "residue"? "s"? ("," | "and" | "or") RANGE-TO POS)+
     | ("residue" "s"?)? "at position" "s"? ("," | "and" | "or") RANGE-TO POS)+
     | RESI RANGE-TO RESI);
MUTATION = (RESN1 POS RESN1
     | RESN1 "-" POS "-" RESN1
     | RESN1 "(" POS ")" RESN1
     | RESI CONVERT-TO (RESN3 | RESNF)
     | RESI RESN3
     |"from" (RESNF | RESN3) CONVERT-TO (RESNF | RESN3)"at position"POS
     | (RESN3 | RESNF) "for" (RESN3 | RESNF) "at position" POS
     | RESI ("-"+ | CONVERT-TO) RESI "substitution");