# Source code for camcops_server.cc_modules.cc_validators

#!/usr/bin/env python

"""
camcops_server/cc_modules/cc_validators.py

===============================================================================

    Copyright (C) 2012, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CamCOPS.

    CamCOPS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CamCOPS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CamCOPS. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**String validators and the like.**

All functions starting ``validate_`` do nothing if the input is good, and raise
:exc:`ValueError` if it's bad, with a descriptive error (you can use ``str()``
on the exception).

All validators take a
:class:`camcops_server.cc_modules.cc_request.CamcopsRequest` parameter, for
internationalized error messages.

WARNING: even the error messages shouldn't contain the error-producing strings.
"""

import ipaddress
import logging
import re
from typing import Callable, List, Optional, TYPE_CHECKING
import urllib.parse

from cardinal_pythonlib.logs import BraceStyleAdapter
from colander import EMAIL_RE

from camcops_server.cc_modules.cc_constants import (
    MINIMUM_PASSWORD_LENGTH,
    StringLengths,
)
from camcops_server.cc_modules.cc_password import password_prohibited

if TYPE_CHECKING:
    from camcops_server.cc_modules.cc_request import CamcopsRequest

log = BraceStyleAdapter(logging.getLogger(__name__))


# =============================================================================
# Typing constants
# =============================================================================

STRING_VALIDATOR_TYPE = Callable[[str, Optional["CamcopsRequest"]], None]
# ... string validators raise ValueError if the string is invalid


# =============================================================================
# Raising exceptions: sometimes internationalized, sometimes not
# =============================================================================


def dummy_gettext(x: str) -> str:
    """
    Stand-in for a ``gettext`` function: hands the input back untranslated.

    Used by the validators in this module when no request object is
    available to supply a real translation function.
    """
    return x
# =============================================================================
# Regex manipulation
# =============================================================================
def anchor(
    expression: str, anchor_start: bool = True, anchor_end: bool = True
) -> str:
    """
    Wraps a regex in a ``^`` start anchor and/or a ``$`` end anchor.

    Args:
        expression: the regex text to wrap
        anchor_start: prepend ``^``?
        anchor_end: append ``$``?

    Returns:
        the (possibly) anchored regex text
    """
    pieces = (
        "^" if anchor_start else "",
        expression,
        "$" if anchor_end else "",
    )
    return "{}{}{}".format(*pieces)
def zero_or_more(expression: str) -> str:
    """
    Appends the ``*`` quantifier, giving a regex for zero or more copies of
    ``expression``.
    """
    return "{}*".format(expression)
def one_or_more(expression: str) -> str:
    """
    Appends the ``+`` quantifier, giving a regex for one or more copies of
    ``expression``.
    """
    return "{}+".format(expression)
def min_max_copies(
    expression: str, max_count: int, min_count: int = 1
) -> str:
    """
    Appends a bounded-repetition quantifier to a regex expression: for a
    regex group ``x``, produces ``x{min,max}``.

    Take care with ``min_count == 0``: unless something else restricts it,
    the resulting regex can match an empty string.

    Args:
        expression: the regex text to repeat
        max_count: maximum number of copies
        min_count: minimum number of copies

    Returns:
        the regex text with the ``{min,max}`` quantifier appended
    """
    assert 0 <= min_count <= max_count
    quantifier = "{" + "{},{}".format(min_count, max_count) + "}"
    return "{}{}".format(expression, quantifier)
def describe_regex_permitted_char(
    expression: str,
    req: Optional["CamcopsRequest"] = None,
    invalid_prefix: bool = True,
) -> str:
    """
    Builds a human-readable list of the characters allowed by a *simple*
    regex character class such as ``[A-Za-z0-9_]``. Arbitrary regexes are not
    supported.

    Args:
        expression: character class, including its surrounding ``[]``
        req: optional request, used for its ``gettext`` translation function
        invalid_prefix: start the description with "Invalid string."?

    Returns:
        the description, e.g. ``Permitted characters: A-Z, a-z, '_'``
    """
    assert expression.startswith("[") and expression.endswith("]")
    body = expression[1:-1]  # the part between the square brackets
    n = len(body)
    _ = req.gettext if req else dummy_gettext
    described = []  # type: List[str]
    pos = 0
    while pos < n:
        current = body[pos]
        if current == "\\":
            # Backslash plus one more character: either a regex class code
            # (\w, \d, ...) or an escaped literal.
            assert pos + 1 < n, f"Bad escaping in {expression!r}"
            code = body[pos + 1]
            if code == "w":
                item = _("word character")
            elif code == "W":
                item = _("non-word character")
            elif code == "d":
                item = _("digit")
            elif code == "D":
                item = _("non-digit")
            elif code == "s":
                item = _("whitespace")
            elif code == "S":
                item = _("non-whitespace")
            else:
                item = repr(code)
            consumed = 2
        elif pos + 1 < n and body[pos + 1] == "-":
            # A range such as A-Z: reported verbatim.
            assert pos + 2 < n, f"Bad range specification in {expression!r}"
            item = body[pos : pos + 3]  # noqa: E203
            consumed = 3
        elif current == ".":
            item = _("any character")
            consumed = 1
        else:
            item = repr(current)
            consumed = 1
        described.append(item)
        pos += consumed
    listing = ", ".join(described)
    prefix = _("Invalid string.") + " " if invalid_prefix else ""
    return prefix + _("Permitted characters:") + " " + listing
def describe_regex_permitted_char_length(
    expression: str,
    max_length: int,
    min_length: int = 1,
    req: Optional["CamcopsRequest"] = None,
) -> str:
    """
    Builds an "invalid string" message that states both the permitted length
    range and the permitted characters.

    Args:
        expression: character class (with ``[]``), as accepted by
            :func:`describe_regex_permitted_char`
        max_length: maximum permitted length
        min_length: minimum permitted length
        req: optional request, used for its ``gettext`` translation function

    Returns:
        the full description
    """
    _ = req.gettext if req else dummy_gettext
    headline = _("Invalid string.")
    length_rule = _("Minimum length = {}. Maximum length = {}.").format(
        min_length, max_length
    )
    char_rule = describe_regex_permitted_char(
        expression, req, invalid_prefix=False
    )
    return headline + " " + length_rule + " " + char_rule
# =============================================================================
# Generic validation functions
# =============================================================================
def validate_by_char_and_length(
    x: str,
    permitted_char_expression: str,
    max_length: int,
    min_length: int = 1,
    req: Optional["CamcopsRequest"] = None,
    flags: int = 0,
) -> None:
    """
    Validate a string based on permitted characters and length.

    Args:
        x: the string to check
        permitted_char_expression: regex character class (with ``[]``)
            describing each permitted character
        max_length: maximum permitted length
        min_length: minimum permitted length
        req: optional request, used to translate the error message
        flags: flags passed to :func:`re.compile`

    Raises:
        ValueError: if ``x`` contains a non-permitted character or its length
            is out of range; the message describes the rules but does not
            echo ``x``
    """
    regex = re.compile(
        anchor(
            min_max_copies(
                expression=permitted_char_expression,
                min_count=min_length,
                max_count=max_length,
            )
        ),
        flags=flags,
    )
    # fullmatch(), not match(): the "$" anchor also matches just before a
    # trailing newline, so match() would let e.g. "abc\n" through even when
    # newlines are not permitted characters.
    if not regex.fullmatch(x):
        raise ValueError(
            describe_regex_permitted_char_length(
                permitted_char_expression,
                min_length=min_length,
                max_length=max_length,
                req=req,
            )
        )
# =============================================================================
# Generic strings
# =============================================================================

# Single-character classes; the validators add repetition and anchoring.
ALPHA_CHAR = "[A-Za-z]"
ALPHANUM_UNDERSCORE_CHAR = "[A-Za-z0-9_]"
ALPHANUM_UNDERSCORE_REGEX = re.compile(
    anchor(one_or_more(ALPHANUM_UNDERSCORE_CHAR))
)
ALPHANUM_UNDERSCORE_HYPHEN_CHAR = r"[A-Za-z0-9_\-]"
ALPHANUM_UNDERSCORE_HYPHEN_DOT_CHAR = r"[A-Za-z0-9_\-\.]"
ALPHANUM_COMMA_UNDERSCORE_HYPHEN_BRACE_CHAR = r"[A-Za-z0-9,_\-\{\}]"
ALPHANUM_UNDERSCORE_HYPHEN_SPACE_CHAR = r"[A-Za-z0-9_\- ]"

HUMAN_NAME_CHAR_UNICODE = r"[\w\-'’ \.]"
# \w is a word character; with the re.UNICODE flag, that includes accented
# characters. Then we allow hyphen, plain apostrophe, Unicode apostrophe,
# space, dot.

HUMAN_MANDATORY_CHAR_REGEX = re.compile(r"\w+", re.UNICODE)
# ... for "at least one word character somewhere"

# -----------------------------------------------------------------------------
# Level 1. Computer-style simple strings with no spaces.
# -----------------------------------------------------------------------------
def validate_alphanum(x: str, req: Optional["CamcopsRequest"] = None) -> None:
    """
    Checks that a string is alphanumeric, as judged by :meth:`str.isalnum`
    (so the empty string is rejected).

    Args:
        x: the string to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if ``x`` is not alphanumeric
    """
    if x.isalnum():
        return
    _ = req.gettext if req else dummy_gettext
    raise ValueError(_("Invalid alphanumeric string"))
def validate_alphanum_underscore(
    x: str, req: Optional["CamcopsRequest"] = None
) -> None:
    """
    Validates a string that can be alphanumeric or contain an underscore.

    Args:
        x: the string to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if ``x`` is empty or contains any other character
    """
    # fullmatch(), not match(): the regex ends in "$", which also matches
    # just before a trailing newline, so match() would accept "abc\n".
    if not ALPHANUM_UNDERSCORE_REGEX.fullmatch(x):
        raise ValueError(
            describe_regex_permitted_char(ALPHANUM_UNDERSCORE_CHAR, req)
        )
# -----------------------------------------------------------------------------
# Level 2. Human-style simple strings, allowing spaces but only minimal
# punctuation.
# -----------------------------------------------------------------------------
# ... see specific validators.

# -----------------------------------------------------------------------------
# Level 3. Human-style strings, such as people's names; may involve accented
# characters, spaces, some punctuation; may be used as Python or SQL search
# literals (with suitable precautions).
# -----------------------------------------------------------------------------

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 3(a). Human names
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def validate_human_name(
    x: str,
    req: Optional["CamcopsRequest"] = None,
    min_length: int = 0,
    max_length: int = StringLengths.PATIENT_NAME_MAX_LEN,
) -> None:
    """
    Validates a human name. Accepts spaces, accents, etc.

    This is hard. See
    https://stackoverflow.com/questions/888838/regular-expression-for-validating-names-and-surnames

    Args:
        x: the name to check
        req: optional request, used to translate error messages
        min_length: minimum permitted length
        max_length: maximum permitted length

    Raises:
        ValueError: if ``x`` contains a non-permitted character, has a length
            out of range, or contains no word character at all
    """  # noqa
    validate_by_char_and_length(
        x,
        permitted_char_expression=HUMAN_NAME_CHAR_UNICODE,
        min_length=min_length,
        max_length=max_length,
        req=req,
    )
    # search(), not match(): the mandatory word character may be anywhere in
    # the name, so names beginning with punctuation (e.g. "'t Hooft") are
    # fine, while punctuation-only strings are still rejected.
    if not HUMAN_MANDATORY_CHAR_REGEX.search(x):
        _ = req.gettext if req else dummy_gettext
        raise ValueError(_("Names require at least one 'word' character"))
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 3(c). Search terms for simple near-alphanumeric SQL content, allowing
# wildcards.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Letters, digits, hyphen, space, and the SQL wildcards "_" and "%". The
# hyphen is escaped because it is meaningful inside a regex character class.
RESTRICTED_SQL_SEARCH_LITERAL_CHAR = r"[A-Za-z0-9\- _%]"
def validate_restricted_sql_search_literal(
    x: str,
    req: Optional["CamcopsRequest"] = None,
    min_length: int = 0,
    max_length: int = StringLengths.SQL_SEARCH_LITERAL_MAX_LENGTH,
) -> None:
    """
    Validates a search term that may use the SQL wildcards ``%`` and ``_``
    but is otherwise restricted to a small character set, so that it should
    be syntactically safe in terms of HTML etc. This is deliberately only a
    subset of what SQL itself would allow.

    Args:
        x: the search term to check
        req: optional request, used to translate the error message
        min_length: minimum permitted length
        max_length: maximum permitted length

    Raises:
        ValueError: if ``x`` breaks the character or length rules
    """
    validate_by_char_and_length(
        x,
        RESTRICTED_SQL_SEARCH_LITERAL_CHAR,
        max_length=max_length,
        min_length=min_length,
        req=req,
    )
# -----------------------------------------------------------------------------
# Level 4. Infinitely worrying.
# -----------------------------------------------------------------------------


# noinspection PyUnusedLocal
def validate_anything(x: str, req: Optional["CamcopsRequest"] = None) -> None:
    """
    A validator that accepts every input: it never raises. May be unwise.

    Both arguments are ignored; they exist so that the signature matches the
    other string validators.
    """
    return None
# =============================================================================
# Specific well-known computer formats
# =============================================================================

# -----------------------------------------------------------------------------
# Base 64 encoding
# -----------------------------------------------------------------------------

# BASE64_REGEX = re.compile(
#     "^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$"
#     # https://stackoverflow.com/questions/475074/regex-to-parse-or-validate-base64-data  # noqa
# )

# -----------------------------------------------------------------------------
# Email addresses
# -----------------------------------------------------------------------------

# Colander's e-mail regex, compiled once at import time.
EMAIL_RE_COMPILED = re.compile(EMAIL_RE)
def validate_email(email: str, req: Optional["CamcopsRequest"] = None) -> None:
    """
    Checks that a string looks like an e-mail address.

    The check is Colander's e-mail regex plus a maximum-length constraint,
    intended to mirror the validation applied by the web form.

    Args:
        email: the address to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if the address is too long or does not match the regex
    """
    too_long = len(email) > StringLengths.EMAIL_ADDRESS_MAX_LEN
    if too_long or not EMAIL_RE_COMPILED.match(email):
        _ = req.gettext if req else dummy_gettext
        raise ValueError(_("Invalid e-mail address"))
# -----------------------------------------------------------------------------
# IP addresses
# -----------------------------------------------------------------------------
def validate_ip_address(
    x: str, req: Optional["CamcopsRequest"] = None
) -> None:
    """
    Validates an IP address (IPv4 or IPv6), as judged by
    :func:`ipaddress.ip_address`.

    Args:
        x: the string to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if ``x`` is not a valid IP address; the message is
            generic and does not echo ``x``
    """
    # https://stackoverflow.com/questions/3462784/check-if-a-string-matches-an-ip-address-pattern-in-python  # noqa
    try:
        ipaddress.ip_address(x)
    except ValueError:
        _ = req.gettext if req else dummy_gettext
        # "from None": the underlying ipaddress error message quotes the
        # offending input, and our error messages must not contain the
        # error-producing string -- so don't chain it into tracebacks.
        raise ValueError(_("Invalid IP address")) from None
# -----------------------------------------------------------------------------
# URLs
# -----------------------------------------------------------------------------

# Per https://mathiasbynens.be/demo/url-regex, using @stephenhay's regex but
# restricted further:
# - only http:// and https:// schemes;
# - no whitespace anywhere after the scheme (\S+ rather than ".[^\s]*", since
#   "." matches space, tab and carriage return);
# - \Z rather than "$", because "$" also matches just before a trailing
#   newline.
VALID_REDIRECT_URL_REGEX = re.compile(r"^https?://[^\s/$.?#]\S+\Z")
def validate_any_url(url: str, req: Optional["CamcopsRequest"] = None) -> None:
    """
    Validates a URL, loosely: it must parse with a scheme and a network
    location. Returns nothing; raises if the URL is invalid.

    See https://stackoverflow.com/questions/22238090/validating-urls-in-python

    However, avoid this one. For example, a URL such as
    xxhttps://127.0.0.1:8088/ can trigger Chrome to launch ``xdg-open``.
    A warning is logged on every call.

    Args:
        url: the URL to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if the URL is invalid; the message is generic and does
            not echo ``url``
    """
    log.warning("Avoid this validator! It allows open-this-file URLs!")
    try:
        result = urllib.parse.urlparse(url)
        valid = bool(result.scheme and result.netloc)
    except ValueError:
        # urlparse() itself raises ValueError for some malformed input (e.g.
        # unmatched IPv6 brackets), and its message can quote the input.
        # Treat that as "invalid" and raise our own generic error below,
        # outside this handler, so nothing is chained.
        valid = False
    if not valid:
        _ = req.gettext if req else dummy_gettext
        raise ValueError(_("Invalid URL"))
def validate_redirect_url(
    url: str, req: Optional["CamcopsRequest"] = None
) -> None:
    """
    Checks that a URL is acceptable as a redirection target, by testing it
    against ``VALID_REDIRECT_URL_REGEX``. Returns nothing; raises if the URL
    is not acceptable.

    See https://stackoverflow.com/questions/22238090/validating-urls-in-python

    Args:
        url: the URL to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if the URL does not match
    """
    if VALID_REDIRECT_URL_REGEX.match(url):
        return
    _ = req.gettext if req else dummy_gettext
    raise ValueError(_("Invalid redirection URL"))
# =============================================================================
# CamCOPS system-oriented names
# =============================================================================

# -----------------------------------------------------------------------------
# Group names
# -----------------------------------------------------------------------------
def validate_group_name(
    name: str, req: Optional["CamcopsRequest"] = None
) -> None:
    """
    Checks that a string is acceptable as a group name.

    Group descriptions can be anything, but group names are restricted to
    letters, digits, underscores and hyphens (so: no spaces, no commas),
    which greatly facilitates config file handling etc.

    Args:
        name: the group name to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if ``name`` breaks the character or length rules
    """
    validate_by_char_and_length(
        name,
        ALPHANUM_UNDERSCORE_HYPHEN_CHAR,
        max_length=StringLengths.GROUP_NAME_MAX_LEN,
        min_length=StringLengths.GROUP_NAME_MIN_LEN,
        req=req,
    )
# -----------------------------------------------------------------------------
# Usernames
# -----------------------------------------------------------------------------
def validate_username(
    name: str, req: Optional["CamcopsRequest"] = None
) -> None:
    """
    Checks that a string is acceptable as a user name: letters, digits,
    commas, underscores, hyphens and braces, within the CamCOPS username
    length limits.

    Args:
        name: the user name to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if ``name`` breaks the character or length rules
    """
    validate_by_char_and_length(
        name,
        ALPHANUM_COMMA_UNDERSCORE_HYPHEN_BRACE_CHAR,
        max_length=StringLengths.USERNAME_CAMCOPS_MAX_LEN,
        min_length=StringLengths.USERNAME_CAMCOPS_MIN_LEN,
        req=req,
    )
# -----------------------------------------------------------------------------
# Devices
# -----------------------------------------------------------------------------
def validate_device_name(
    x: str, req: Optional["CamcopsRequest"] = None
) -> None:
    """
    Checks a client device name -- the computer-oriented one, not the
    friendly one. At least one character is required.

    Args:
        x: the device name to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if ``x`` breaks the character or length rules
    """
    validate_by_char_and_length(
        x,
        ALPHANUM_COMMA_UNDERSCORE_HYPHEN_BRACE_CHAR,
        max_length=StringLengths.DEVICE_NAME_MAX_LEN,
        min_length=1,
        req=req,
    )
# -----------------------------------------------------------------------------
# Export recipients
# -----------------------------------------------------------------------------


def validate_export_recipient_name(
    x: str, req: Optional["CamcopsRequest"] = None
) -> None:
    """
    Checks an export recipient name: letters, digits and underscores only,
    within the export recipient name length limits.

    Args:
        x: the recipient name to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if ``x`` breaks the character or length rules
    """
    validate_by_char_and_length(
        x,
        ALPHANUM_UNDERSCORE_CHAR,
        max_length=StringLengths.EXPORT_RECIPIENT_NAME_MAX_LEN,
        min_length=StringLengths.EXPORT_RECIPIENT_NAME_MIN_LEN,
        req=req,
    )


# -----------------------------------------------------------------------------
# Passwords
# -----------------------------------------------------------------------------
def validate_new_password(
    x: str, req: Optional["CamcopsRequest"] = None
) -> None:
    """
    Checks a proposed new password against the password policy.

    Args:
        x: the proposed password
        req: optional request, used to translate error messages

    Raises:
        ValueError: if the password is blank (empty or whitespace only), is
            shorter than ``MINIMUM_PASSWORD_LENGTH``, or is rejected by
            ``password_prohibited()``
    """
    _ = req.gettext if req else dummy_gettext
    if not (x and x.strip()):
        raise ValueError(_("Passwords can't be blank"))
    if len(x) < MINIMUM_PASSWORD_LENGTH:
        too_short = _("Passwords can't be shorter than {} characters")
        raise ValueError(too_short.format(MINIMUM_PASSWORD_LENGTH))
    # There is no maximum length (we store a hash), and no restriction on
    # which characters may be used.
    if password_prohibited(x):
        raise ValueError(_("That password is used too commonly; try again"))
# -----------------------------------------------------------------------------
# HL7
# -----------------------------------------------------------------------------
def validate_hl7_id_type(
    x: str, req: Optional["CamcopsRequest"] = None
) -> None:
    """
    Checks an HL7 Identifier Type: letters, digits, underscores, hyphens and
    spaces. A minimum length of zero is requested.

    Args:
        x: the identifier type to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if ``x`` breaks the character or length rules
    """
    validate_by_char_and_length(
        x,
        ALPHANUM_UNDERSCORE_HYPHEN_SPACE_CHAR,
        max_length=StringLengths.HL7_ID_TYPE_MAX_LEN,
        min_length=0,
        req=req,
    )
def validate_hl7_aa(x: str, req: Optional["CamcopsRequest"] = None) -> None:
    """
    Checks an HL7 Assigning Authority: letters, digits, underscores, hyphens
    and spaces. A minimum length of zero is requested.

    Args:
        x: the assigning authority to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if ``x`` breaks the character or length rules
    """
    validate_by_char_and_length(
        x,
        ALPHANUM_UNDERSCORE_HYPHEN_SPACE_CHAR,
        max_length=StringLengths.HL7_AA_MAX_LEN,
        min_length=0,
        req=req,
    )
# -----------------------------------------------------------------------------
# Task table names
# -----------------------------------------------------------------------------

# First character: a letter (anchored at the start).
# ... don't start with a number
# ... and although tables can and do start with underscores, task tables
# don't.
# Remaining characters: letters, digits, underscores (anchored at the end),
# up to the maximum table name length overall.
TASK_TABLENAME_REGEX = re.compile(
    anchor(ALPHA_CHAR, anchor_start=True, anchor_end=False)
    + anchor(
        min_max_copies(
            ALPHANUM_UNDERSCORE_CHAR,
            min_count=0,
            max_count=StringLengths.TABLENAME_MAX_LEN - 1,
        ),
        anchor_start=False,
        anchor_end=True,
    )
)
def validate_task_tablename(
    x: str, req: Optional["CamcopsRequest"] = None
) -> None:
    """
    Validates a string that could be a task tablename.

    Args:
        x: the candidate table name
        req: optional request, used to translate the error message

    Raises:
        ValueError: if ``x`` does not match ``TASK_TABLENAME_REGEX``
    """
    # fullmatch(), not match(): the regex ends in "$", which also matches
    # just before a trailing newline, so match() would accept "phq9\n".
    if not TASK_TABLENAME_REGEX.fullmatch(x):
        _ = req.gettext if req else dummy_gettext
        raise ValueError(
            _(
                "Task table names must start with a letter, and contain "
                "only alphanumeric characters (A-Z, a-z, 0-9) or "
                "underscores (_)."
            )
        )
# -----------------------------------------------------------------------------
# Filenames
# -----------------------------------------------------------------------------

DOWNLOAD_FILENAME_REGEX = re.compile(r"^\w[\w-]*\.\w+\Z")
# - \w covers [A-Za-z0-9_] (and, for str patterns without re.ASCII, other
#   Unicode word characters too); see https://regexr.com/
# - The dot is escaped so that it means a literal full stop; an unescaped "."
#   would match "/" or "\" and so let directory delimiters through.
# - Anchored at both ends (\Z, not "$", as "$" tolerates a trailing newline),
#   so nothing may follow the extension.


def validate_download_filename(
    x: str, req: Optional["CamcopsRequest"] = None
) -> None:
    """
    Validate a file for user download.

    - Permit e.g. ``CamCOPS_dump_2021-06-04T100622.zip``.
    - Prohibit silly things (like directory/drive delimiters).

    Args:
        x: the filename to check
        req: optional request, used to translate the error message

    Raises:
        ValueError: if the whole of ``x`` does not have the form
            ``<name>.<extension>`` as described in the error message
    """
    # fullmatch(): the entire filename must conform, not merely a prefix.
    if not DOWNLOAD_FILENAME_REGEX.fullmatch(x):
        _ = req.gettext if req else dummy_gettext
        raise ValueError(
            _(
                "Download filenames must (1) begin with an "
                "alphanumeric/underscore character; (2) contain only "
                "alphanumeric characters, underscores, and hyphens; and "
                "(3) end with a full stop followed by an "
                "alphanumeric/underscore extension."
            )
        )