Source code for camcops_server.cc_modules.cc_dump

#!/usr/bin/env python

"""
camcops_server/cc_modules/cc_dump.py

===============================================================================

    Copyright (C) 2012, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CamCOPS.

    CamCOPS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CamCOPS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CamCOPS. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Methods for providing a dump of data from the server to the web user.**

"""

import logging
from typing import (
    Any,
    Dict,
    Generator,
    Iterable,
    List,
    Optional,
    Set,
    Tuple,
    Type,
    TYPE_CHECKING,
    Union,
)

from cardinal_pythonlib.logs import BraceStyleAdapter
from cardinal_pythonlib.sqlalchemy.orm_inspect import (
    gen_columns,
    gen_orm_classes_from_base,
    walk_orm_tree,
)
from sqlalchemy.exc import CompileError
from sqlalchemy.engine.base import Engine
from sqlalchemy.orm import Session as SqlASession
from sqlalchemy.sql.schema import Column, MetaData, Table

from camcops_server.cc_modules.cc_blob import Blob
from camcops_server.cc_modules.cc_db import (
    GenericTabletRecordMixin,
    TaskDescendant,
)
from camcops_server.cc_modules.cc_device import Device
from camcops_server.cc_modules.cc_email import Email
from camcops_server.cc_modules.cc_exportmodels import (
    ExportedTask,
    ExportedTaskEmail,
    ExportedTaskFileGroup,
    ExportedTaskHL7Message,
)
from camcops_server.cc_modules.cc_exportrecipient import ExportRecipient
from camcops_server.cc_modules.cc_group import Group, group_group_table
from camcops_server.cc_modules.cc_membership import UserGroupMembership
from camcops_server.cc_modules.cc_patient import Patient
from camcops_server.cc_modules.cc_patientidnum import (
    all_extra_id_columns,
    PatientIdNum,
)
from camcops_server.cc_modules.cc_sqla_coltypes import CamcopsColumn
from camcops_server.cc_modules.cc_task import Task
from camcops_server.cc_modules.cc_user import User

if TYPE_CHECKING:
    from camcops_server.cc_modules.cc_request import CamcopsRequest
    from camcops_server.cc_modules.cc_summaryelement import ExtraSummaryTable
    from camcops_server.cc_modules.cc_simpleobjects import TaskExportOptions

log = BraceStyleAdapter(logging.getLogger(__name__))


# =============================================================================
# Constants
# =============================================================================

# Restrict specified tables to certain columns only:
DUMP_ONLY_COLNAMES = {  # mapping of tablename : list_of_column_names
    Device.__tablename__: ["camcops_version", "friendly_name", "id", "name"],
    User.__tablename__: ["fullname", "id", "username"],
}
# Drop specific columns from certain tables:
DUMP_DROP_COLNAMES = {}  # mapping of tablename : list_of_column_names
# List of columns to be skipped regardless of table:
DUMP_SKIP_COLNAMES = [
    # We restrict to current records only, so many of these are irrelevant:
    "_addition_pending",
    "_forcibly_preserved",
    "_manually_erased",
    "_manually_erased_at",
    "_manually_erasing_user_id",
    "_move_off_tablet",
    "_removal_pending",
    "_removing_user_id",
    "_successor_pk",
    "_when_removed_batch_utc",
    "_when_removed_exact",
]
DUMP_SKIP_RELNAMES = [
    # List of *relationship* names to ignore
    "_manually_erasing_user",
    "_removing_user",
]
# List of table names to be skipped at all times:
DUMP_SKIP_TABLES = [
    # We don't have to list all admin tables here; we process the dump starting
    # with tasks, so only things that have ORM relationships to a task might
    # feature. (The Email/ExportedTask* set don't, so this is just caution in
    # case we add a relationship later!)
    Email.__tablename__,
    ExportedTask.__tablename__,
    ExportedTaskEmail.__tablename__,
    ExportedTaskFileGroup.__tablename__,
    ExportedTaskHL7Message.__tablename__,
    ExportRecipient.__tablename__,
    group_group_table.name,
    UserGroupMembership.__tablename__,
]
# Tables for which no relationships will be traversed:
DUMP_SKIP_ALL_RELS_FOR_TABLES = [Group.__tablename__]
FOREIGN_KEY_CONSTRAINTS_IN_DUMP = False
# ... the keys will be present, but should we try to enforce constraints?


# =============================================================================
# Handy place to hold the controlling information
# =============================================================================


[docs]class DumpController(object):
    """
    A controller class that manages the copying (dumping) of information from
    our database to another SQLAlchemy :class:`Engine`/:class:`Session`.
    """

[docs]    def __init__(
        self,
        dst_engine: Engine,
        dst_session: SqlASession,
        export_options: "TaskExportOptions",
        req: "CamcopsRequest",
    ) -> None:
        """
        Args:
            dst_engine: destination SQLAlchemy Engine
            dst_session:  destination SQLAlchemy Session
            export_options: :class:`camcops_server.cc_modules.cc_simpleobjects.TaskExportOptions`
            req: :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
        """  # noqa
        self.dst_engine = dst_engine
        self.dst_session = dst_session
        self.export_options = export_options
        self.req = req

        # We start with blank metadata.
        self.dst_metadata = MetaData()
        # Tables we are inserting into the destination database:
        self.dst_tables = {}  # type: Dict[str, Table]
        # ... note that creating a Table() for a given SQLAlchemy metadata is
        #     permitted only once, so we add to self.dst_tables as soon
        #     as we create that.
        # Tables we've created:
        self.tablenames_created = set()  # type: Set[str]
        # Tables we've processed, though we may ignore them:
        self.tablenames_seen = set()  # type: Set[str]
        # ORM objects we've visited:
        self.instances_seen = set()  # type: Set[object]

        if export_options.db_make_all_tables_even_empty:
            self._create_all_dest_tables()

    def _create_all_dest_tables(self) -> None:
        """
        Creates all tables in the destination database, even ones that may
        not be used.
        """
        log.debug("Creating all destination tables...")
        for table in self.gen_all_dest_tables():
            self._create_dest_table(table)
        log.debug("... all destination tables created.")

[docs]    def gen_all_dest_tables(self) -> Generator[Table, None, None]:
        """
        Generates all destination tables.
        """
        tablenames_seen = set()  # type: Set[str]
        for cls in gen_orm_classes_from_base(
            GenericTabletRecordMixin
        ):  # type: Type[GenericTabletRecordMixin]  # noqa
            instance = cls()
            for table in self.gen_all_dest_tables_for_obj(instance):
                if table.name in tablenames_seen:
                    continue
                tablenames_seen.add(table.name)
                yield table

[docs]    def gen_all_dest_tables_for_obj(
        self, src_obj: object
    ) -> Generator[Table, None, None]:
        """
        Generates all destination tables for an object.
        """
        # Main table
        yield self.get_dest_table_for_src_object(src_obj)
        # Additional tables
        if isinstance(src_obj, Task):
            add_extra_id_cols = (
                self.export_options.db_patient_id_in_each_row
                and not src_obj.is_anonymous
            )
            estables = src_obj.get_all_summary_tables(self.req)
            for est in estables:
                yield self.get_dest_table_for_est(
                    est, add_extra_id_cols=add_extra_id_cols
                )

[docs]    def gen_all_dest_columns(
        self,
    ) -> Generator[Union[Column, CamcopsColumn], None, None]:
        """
        Generates all destination columns.
        """
        for table in self.gen_all_dest_tables():
            for col in table.columns:
                yield col

[docs]    def consider_object(self, src_obj: object) -> None:
        """
        Think about an SQLAlchemy ORM object. If it comes from a table we
        want dumped, add this object to the dump.
        """
        # noinspection PyUnresolvedReferences
        src_table = src_obj.__table__  # type: Table
        src_tablename = src_table.name
        if src_tablename not in self.tablenames_seen:
            # If we encounter a table we've not seen, offer our "table decider"
            # the opportunity to add it to the metadata and create the table.
            self._add_dump_table_for_src_object(src_obj)
        # If this table is going into the destination, copy the object
        # (and maybe remove columns from it, or add columns to it).
        if src_tablename in self.dst_tables and not self._dump_skip_table(
            src_tablename
        ):
            self._copy_object_to_dump(src_obj)

    @staticmethod
    def _merits_extra_id_num_columns(
        obj: object,
    ) -> Tuple[bool, Optional[Patient]]:
        """
        Is the source object one that would support the addition of extra
        ID number information if the export option ``DB_PATIENT_ID_PER_ROW`` is
        set? If so, return the relevant patient.

        Args:
            obj: an SQLAlchemy ORM object

        Returns:
            tuple: ``(merits, patient)``, where ``merits`` is a ``bool`` (does
            it merit this?) and ``patient`` is a relevant
            :class:`camcops_server.cc_modules.cc_patient.Patient``, if found.
            It is also guaranteed that if a patient is returned, ``merits`` is
            ``True`` (but not guaranteed that if ``merits`` is true, that
            ``patient`` is not ``None``).

        """
        if not isinstance(obj, GenericTabletRecordMixin):
            # Must be data that originated from the client.
            return False, None
        if isinstance(obj, PatientIdNum):
            # PatientIdNum already has this info.
            return False, None
        if isinstance(obj, Patient):
            return True, obj
        if isinstance(obj, Task):
            if obj.is_anonymous:
                # Anonymous tasks don't.
                return False, None
            return True, obj.patient
        if isinstance(obj, TaskDescendant):
            merits = obj.task_ancestor_might_have_patient()
            patient = obj.task_ancestor_patient()
            return merits, patient
        log.warning(
            f"_merits_extra_id_num_columns_if_requested: don't know "
            f"how to handle {obj!r}"
        )
        return False, None

[docs]    def get_dest_table_for_src_object(self, src_obj: object) -> Table:
        """
        Produces the destination table for the source object.

        Args:
            src_obj:
                An SQLAlchemy ORM object. It will *not* be a
                :class:`camcops_server.cc_modules.cc_summaryelement.ExtraSummaryTable`;
                those are handled instead by
                :meth:`_get_or_insert_summary_table`.

        Returns:
            an SQLAlchemy :class:`Table`
        """
        # noinspection PyUnresolvedReferences
        src_table = src_obj.__table__  # type: Table
        tablename = src_table.name

        # Don't create it twice in the SQLAlchemy metadata.
        if tablename in self.dst_tables:
            return self.dst_tables[tablename]

        # Copy columns, dropping any we don't want, and dropping FK constraints
        dst_columns = []  # type: List[Column]
        for src_column in src_table.columns:
            # log.debug("trying {!r}", src_column.name)
            if self._dump_skip_column(tablename, src_column.name):
                # log.debug("... skipping {!r}", src_column.name)
                continue
            # You can't add the source column directly; you get
            # "sqlalchemy.exc.ArgumentError: Column object 'ccc' already
            # assigned to Table 'ttt'"
            copied_column = src_column.copy()
            copied_column.comment = src_column.comment
            # ... see SQLAlchemy trivial bug:
            # https://bitbucket.org/zzzeek/sqlalchemy/issues/4087/columncopy-doesnt-copy-comment-attribute  # noqa
            if FOREIGN_KEY_CONSTRAINTS_IN_DUMP:
                copied_column.foreign_keys = set(
                    fk.copy() for fk in src_column.foreign_keys
                )
                log.warning(
                    "NOT WORKING: foreign key commands not being " "emitted"
                )
                # but
                # https://docs.sqlalchemy.org/en/latest/core/constraints.html
                # works fine under SQLite, even if the other table hasn't been
                # created yet. Does the table to which the FK refer have to be
                # in the metadata already?
                # That's quite possible, but I've not checked.
                # Would need to iterate through tables in dependency order,
                # like merge_db() does.
            else:
                # Probably blank already, as the copy() command only copies
                # non-constraint-bound ForeignKey objects, but to be sure:
                copied_column.foreign_keys = set()
                # ... type is: Set[ForeignKey]
            # if src_column.foreign_keys:
            #     log.debug("Column {}, FKs {!r} -> {!r}", src_column.name,
            #               src_column.foreign_keys,
            #               copied_column.foreign_keys)
            dst_columns.append(copied_column)

        # Add extra columns?
        if self.export_options.db_include_summaries:
            if isinstance(src_obj, GenericTabletRecordMixin):
                for summary_element in src_obj.get_summaries(self.req):
                    dst_columns.append(
                        CamcopsColumn(
                            summary_element.name,
                            summary_element.coltype,
                            exempt_from_anonymisation=True,
                            comment=summary_element.decorated_comment,
                        )
                    )
        if self.export_options.db_patient_id_in_each_row:
            merits, _ = self._merits_extra_id_num_columns(src_obj)
            if merits:
                dst_columns.extend(all_extra_id_columns(self.req))
            if isinstance(src_obj, TaskDescendant):
                dst_columns += src_obj.extra_task_xref_columns()

        dst_table = Table(tablename, self.dst_metadata, *dst_columns)
        # ... that modifies the metadata, so:
        self.dst_tables[tablename] = dst_table
        return dst_table

[docs]    def get_dest_table_for_est(
        self, est: "ExtraSummaryTable", add_extra_id_cols: bool = False
    ) -> Table:
        """
        Add an additional summary table to the dump, if it's not there already.
        Return the table (from the destination database).

        Args:
            est:
                a
                :class:`camcops_server.cc_modules.cc_summaryelement.ExtraSummaryTable`
            add_extra_id_cols:
                Add extra ID columns, for the ``DB_PATIENT_ID_PER_ROW``
                export option?
        """  # noqa
        tablename = est.tablename
        if tablename in self.dst_tables:
            return self.dst_tables[tablename]

        columns = est.columns.copy()
        if add_extra_id_cols:
            columns.extend(all_extra_id_columns(self.req))
            columns.extend(est.extra_task_xref_columns())
        table = Table(tablename, self.dst_metadata, *columns)
        # ... that modifies the metadata, so:
        self.dst_tables[tablename] = table
        return table

    def _add_dump_table_for_src_object(self, src_obj: object) -> None:
        """
        - Mark the object's table as seen.

        - If we want it, add it to the metadata and execute a CREATE TABLE
          command.

        - We may translate the table en route.

        Args:
            src_obj:
                An SQLAlchemy ORM object. It will *not* be a
                :class:`camcops_server.cc_modules.cc_summaryelement.ExtraSummaryTable`;
                those are handled instead by
                :meth:`_get_or_insert_summary_table`.
        """  # noqa
        # noinspection PyUnresolvedReferences
        src_table = src_obj.__table__  # type: Table
        tablename = src_table.name
        self.tablenames_seen.add(tablename)

        # Skip the table?
        if self._dump_skip_table(tablename):
            return

        # Get the table definition
        dst_table = self.get_dest_table_for_src_object(src_obj)
        # Create it
        self._create_dest_table(dst_table)

    def _create_dest_table(self, dst_table: Table) -> None:
        """
        Creates a table in the destination database.
        """
        tablename = dst_table.name
        if tablename in self.tablenames_created:
            return  # don't create it twice
        # Create the table
        # log.debug("Adding table {!r} to dump output", tablename)
        # You have to use an engine, not a session, to create tables (or you
        # get "AttributeError: 'Session' object has no attribute
        # '_run_visitor'").
        # However, you have to commit the session, or you get
        #     "sqlalchemy.exc.OperationalError: (sqlite3.OperationalError)
        #     database is locked", since a session is also being used.
        self.dst_session.commit()
        dst_table.create(self.dst_engine)
        self.tablenames_created.add(tablename)

    def _copy_object_to_dump(self, src_obj: object) -> None:
        """
        Copy the SQLAlchemy ORM object to the dump.
        """
        # noinspection PyUnresolvedReferences
        src_table = src_obj.__table__  # type: Table
        adding_extra_ids = False
        patient = None  # type: Optional[Patient]
        if self.export_options.db_patient_id_in_each_row:
            adding_extra_ids, patient = self._merits_extra_id_num_columns(
                src_obj
            )

        # 1. Insert row for this object, potentially adding and removing
        #    columns.
        tablename = src_table.name
        dst_table = self.dst_tables[tablename]
        assert dst_table.name == tablename
        row = {}  # type: Dict[str, Any]
        # Copy columns, skipping any we don't want
        for attrname, column in gen_columns(src_obj):
            if self._dump_skip_column(tablename, column.name):
                continue
            row[column.name] = getattr(src_obj, attrname)
        # Any other columns to add for this table?
        if isinstance(src_obj, GenericTabletRecordMixin):
            if self.export_options.db_include_summaries:
                for summary_element in src_obj.get_summaries(self.req):
                    row[summary_element.name] = summary_element.value
            if adding_extra_ids:
                if patient:
                    patient.add_extra_idnum_info_to_row(row)
                if isinstance(src_obj, TaskDescendant):
                    src_obj.add_extra_task_xref_info_to_row(row)
        try:
            self.dst_session.execute(dst_table.insert(row))
        except CompileError:
            log.critical("\ndst_table:\n{}\nrow:\n{}", dst_table, row)
            raise

        # 2. If required, add extra tables/rows that this task wants to
        #    offer (usually tables whose rows don't have a 1:1 correspondence
        #    to the task or its ancillary objects).
        if isinstance(src_obj, Task):
            estables = src_obj.get_all_summary_tables(self.req)
            # ... includes SNOMED
            for est in estables:
                dst_summary_table = self._get_or_insert_summary_table(
                    est, add_extra_id_cols=adding_extra_ids
                )
                for row in est.rows:
                    if patient:
                        patient.add_extra_idnum_info_to_row(row)
                    if adding_extra_ids:
                        est.add_extra_task_xref_info_to_row(row)
                    try:
                        self.dst_session.execute(dst_summary_table.insert(row))
                    except CompileError:
                        log.critical(
                            "\ndst_summary_table:\n{}\nrow:\n{}",
                            dst_table,
                            row,
                        )
                        raise

    def _get_or_insert_summary_table(
        self, est: "ExtraSummaryTable", add_extra_id_cols: bool = False
    ) -> Table:
        """
        Add an additional summary table to the dump, if it's not there already.
        Return the table (from the destination database).

        Args:
            est:
                a
                :class:`camcops_server.cc_modules.cc_summaryelement.ExtraSummaryTable`
            add_extra_id_cols:
                Add extra ID columns, for the ``DB_PATIENT_ID_PER_ROW``
                export option?
        """  # noqa
        tablename = est.tablename
        if tablename not in self.tablenames_created:
            table = self.get_dest_table_for_est(
                est, add_extra_id_cols=add_extra_id_cols
            )
            self._create_dest_table(table)
        return self.dst_tables[tablename]

    def _dump_skip_table(self, tablename: str) -> bool:
        """
        Should we skip this table (omit it from the dump)?
        """
        if (
            not self.export_options.include_blobs
            and tablename == Blob.__tablename__
        ):
            return True
        if tablename in DUMP_SKIP_TABLES:
            return True
        return False

    @staticmethod
    def _dump_skip_column(tablename: str, columnname: str) -> bool:
        """
        Should we skip this column (omit it from the dump)?
        """
        if columnname in DUMP_SKIP_COLNAMES:
            return True
        if (
            tablename in DUMP_ONLY_COLNAMES
            and columnname not in DUMP_ONLY_COLNAMES[tablename]
        ):
            return True
        if (
            tablename in DUMP_DROP_COLNAMES
            and columnname in DUMP_DROP_COLNAMES[tablename]
        ):
            return True
        return False


# =============================================================================
# Copying stuff to a dump
# =============================================================================


[docs]def copy_tasks_and_summaries(
    tasks: Iterable[Task],
    dst_engine: Engine,
    dst_session: SqlASession,
    export_options: "TaskExportOptions",
    req: "CamcopsRequest",
) -> None:
    """
    Copy a set of tasks, and their associated related information (found by
    walking the SQLAlchemy ORM tree), to the dump.

    Args:
        tasks: tasks to copy
        dst_engine: destination SQLAlchemy Engine
        dst_session:  destination SQLAlchemy Session
        export_options: :class:`camcops_server.cc_modules.cc_simpleobjects.TaskExportOptions`
        req: :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
    """  # noqa
    # How best to create the structure that's required?
    #
    # https://stackoverflow.com/questions/21770829/sqlalchemy-copy-schema-and-data-of-subquery-to-another-database  # noqa
    # https://stackoverflow.com/questions/40155340/sqlalchemy-reflect-and-copy-only-subset-of-existing-schema  # noqa
    #
    # - Should we attempt to copy the MetaData object? That seems extremely
    #   laborious, since every ORM class is tied to it. Moreover,
    #   MetaData.tables is an immutabledict, so we're not going to be editing
    #   anything. Even if we cloned the MetaData, that's not going to give us
    #   ORM classes to walk.
    # - Shall we operate at a lower level? That seems sensible.
    # - Given that... we don't need to translate the PKs at all, unlike
    #   merge_db.
    # - Let's not create FK constraints explicitly. Most are not achievable
    #   anyway (e.g. linking on device/era; omission of BLOBs).

    controller = DumpController(
        dst_engine=dst_engine,
        dst_session=dst_session,
        export_options=export_options,
        req=req,
    )

    # We walk through all the objects.
    log.debug("Starting to copy tasks...")
    for startobj in tasks:
        log.debug("Processing task: {!r}", startobj)
        for src_obj in walk_orm_tree(
            startobj,
            seen=controller.instances_seen,
            skip_relationships_always=DUMP_SKIP_RELNAMES,
            skip_all_relationships_for_tablenames=DUMP_SKIP_ALL_RELS_FOR_TABLES,  # noqa
            skip_all_objects_for_tablenames=DUMP_SKIP_TABLES,
        ):
            controller.consider_object(src_obj)
    log.debug("... finished copying tasks.")