# Source code for ticclat.ticclat_schema

# coding: utf-8
# pylint: disable=too-few-public-methods
"""
SQLAlchemy schema of the TICCLAT database.

Contains all the tables of the database and their connections, defined as
SQLAlchemy declarative_base subclasses.

Many of the tables defined here are based on an INT lexicon database created
in the IMPACT project
(https://ivdnt.org/images/stories/onderzoek_en_onderwijs/publicaties/impact/impact_lexicon_structure.pdf).
See https://github.com/TICCLAT/docs/blob/master/database_design.md for more
information about the database design.

Based on this, in TICCLAT, we added tables for:
- links between wordforms
- morphological paradigm groups of wordforms
- anagram hashes from TICCL
- spelling variants from TICCL
- identifiers linking wordforms to external sources like the WNT, MNW, INT.
"""

from sqlalchemy import Column, String, Table, ForeignKey, Unicode, Boolean, \
    Integer, BigInteger, ForeignKeyConstraint
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base

# Declarative base class: every ORM model in this module subclasses it, and
# the association tables register themselves on Base.metadata.
Base = declarative_base()


# Association table linking corpora to the documents they contain. It is used
# as the `secondary` table of the Corpus <-> Document relationships.
corpusId_x_documentId = Table(
    'corpusId_x_documentId',
    Base.metadata,
    Column('corpus_id', BigInteger(), ForeignKey('corpora.corpus_id')),
    Column('document_id', BigInteger(), ForeignKey('documents.document_id')),
)


class TextAttestation(Base):
    """
    Table for storing text attestations.

    A text attestation entry is defined in the INT schema as the occurrence
    and frequency of wordforms in documents.
    """
    __tablename__ = 'text_attestations'

    attestation_id = Column(BigInteger(), primary_key=True)
    frequency = Column(BigInteger())
    wordform_id = Column(BigInteger(), ForeignKey('wordforms.wordform_id'))
    document_id = Column(BigInteger(), ForeignKey('documents.document_id'))

    # Both sides are bidirectional: the counterpart attributes are
    # Document.document_wordforms and Wordform.wordform_documents.
    ta_document = relationship('Document', back_populates='document_wordforms')
    ta_wordform = relationship('Wordform', back_populates='wordform_documents')

    def __init__(self, document, wordform, frequency):
        """Create an attestation of a wordform in a document.

        Inputs:
            document (Document): The document the wordform occurs in.
            wordform (Wordform): The wordform that occurs in the document.
            frequency: Value stored in the `frequency` column.
        """
        # Assigning the related objects (rather than raw ids) lets SQLAlchemy
        # fill in the foreign key columns on flush.
        self.ta_document = document
        self.ta_wordform = wordform
        self.frequency = frequency


class Corpus(Base):
    """Table for storing corpus metadata."""
    __tablename__ = 'corpora'

    corpus_id = Column(BigInteger(), primary_key=True)
    name = Column(String(255))

    # Documents belonging to this corpus, resolved through the
    # corpusId_x_documentId association table. The reverse side is
    # Document.document_corpora.
    corpus_documents = relationship('Document',
                                    secondary=corpusId_x_documentId,
                                    back_populates='document_corpora')


class Document(Base):
    """Table for storing document metadata."""
    __tablename__ = 'documents'

    document_id = Column(BigInteger(), primary_key=True)
    persistent_id = Column(String(255), index=True)
    word_count = Column(BigInteger())
    encoding = Column(BigInteger())
    title = Column(String(255))
    year_from = Column(BigInteger())
    year_to = Column(BigInteger())
    pub_year = Column(BigInteger())
    author = Column(String(255))
    editor = Column(String(255))
    publisher = Column(String(255))
    publishing_location = Column(String(255))
    text_type = Column(String(255))
    region = Column(String(255))
    language = Column(String(255))
    other_languages = Column(String(255))
    spelling = Column(String(255))
    # Indexed, but no foreign key constraint is declared on this column.
    # NOTE(review): presumably holds the document_id of a parent document --
    # confirm against the ingestion code.
    parent_document = Column(BigInteger(), index=True)

    # Corpora this document is part of, resolved through the
    # corpusId_x_documentId association table (reverse side:
    # Corpus.corpus_documents).
    document_corpora = relationship('Corpus', secondary=corpusId_x_documentId,
                                    back_populates='corpus_documents')
    # TextAttestation rows of this document (reverse side:
    # TextAttestation.ta_document).
    document_wordforms = relationship('TextAttestation', back_populates='ta_document')


# Association table recording which lexica each wordform occurs in. It is used
# as the `secondary` table of the Lexicon <-> Wordform relationships and
# carries its own surrogate primary key.
lexical_source_wordform = Table(
    'lexical_source_wordform',
    Base.metadata,
    Column('wordform_source_id', BigInteger(), primary_key=True),
    Column('lexicon_id', BigInteger(), ForeignKey('lexica.lexicon_id')),
    Column('wordform_id', BigInteger(), ForeignKey('wordforms.wordform_id')),
)


class Lexicon(Base):
    """
    Table for storing lexicon metadata.

    Columns:
        lexicon_name: Name of the lexicon.
        vocabulary (bool): if True, all words in this lexicon are (supposed
            to be) valid words, if False, some are misspelled.
    """
    __tablename__ = 'lexica'

    lexicon_id = Column(BigInteger(), primary_key=True)
    lexicon_name = Column(String(255))
    vocabulary = Column(Boolean)

    # Wordforms occurring in this lexicon, resolved through the
    # lexical_source_wordform association table (reverse side:
    # Wordform.wf_lexica).
    lexicon_wordforms = relationship('Wordform',
                                     secondary=lexical_source_wordform,
                                     back_populates='wf_lexica')
    # Wordform links attributed to this lexicon, resolved through the
    # source_x_wordform_link table (the table mapped by WordformLinkSource).
    lexicon_wordform_links = relationship('WordformLink',
                                          secondary='source_x_wordform_link')

    def __str__(self):
        """Return a short, human-readable label containing the lexicon name."""
        return '<Lexicon {}>'.format(self.lexicon_name)


class Anahash(Base):
    """
    Table for storing anahashes.

    The anahashes in this table have no direct relation to the wordforms, those
    links are tracked in the wordforms table. This was done so that the
    anahashes table can be efficiently searched, e.g. for ranges in anahash
    "space".
    """
    __tablename__ = 'anahashes'

    anahash_id = Column(BigInteger(), primary_key=True)
    # The anahash value itself; unique and indexed so it can be looked up
    # directly.
    anahash = Column(BigInteger(), unique=True, index=True)

    def __str__(self):
        """Return a short, human-readable label containing the anahash value."""
        return '<Anahash {}>'.format(self.anahash)


class Wordform(Base):
    """Table for storing wordforms and associated anahashes."""
    __tablename__ = 'wordforms'

    wordform_id = Column(BigInteger(), primary_key=True)
    # NOTE(review): `convert_unicode` is deprecated since SQLAlchemy 1.3 --
    # confirm which SQLAlchemy versions this schema has to support.
    wordform = Column(Unicode(255, convert_unicode=False), unique=True, index=True)
    # Set to NULL (instead of blocking the delete) when the referenced
    # anahash row is removed.
    anahash_id = Column(BigInteger(), ForeignKey("anahashes.anahash_id", ondelete='SET NULL'))

    anahash = relationship('Anahash')
    wordform_lowercase = Column(Unicode(255, convert_unicode=False), nullable=False, index=True)

    # Lexica this wordform occurs in, resolved through the
    # lexical_source_wordform association table (reverse side:
    # Lexicon.lexicon_wordforms).
    wf_lexica = relationship('Lexicon', secondary=lexical_source_wordform,
                             back_populates='lexicon_wordforms')
    # TextAttestation rows of this wordform (reverse side:
    # TextAttestation.ta_wordform).
    wordform_documents = relationship('TextAttestation', back_populates='ta_wordform')

    def _find_link(self, wordform):
        """Return the WordformLink from self to `wordform`, or None.

        `self.links` is the backref created by WordformLink.linked_from, i.e.
        all links that start at this wordform.
        """
        return next((wfl for wfl in self.links if wfl.linked_to == wordform),
                    None)

    def link(self, wordform):
        """Add WordformLinks between self and another wordform and vice versa.

        Each direction is added only if that link does not yet exist. The two
        directions are checked independently, so a missing reverse link is
        completed without duplicating an existing forward link, and linking a
        wordform to itself creates a single link.

        Inputs:
            wordform (Wordform): Wordform that is related to Wordform self.
        """
        # Constructing a WordformLink registers it on the `links` backref of
        # its source wordform, so no explicit append is needed here.
        if self._find_link(wordform) is None:
            WordformLink(self, wordform)
        if wordform._find_link(self) is None:
            WordformLink(wordform, self)

    def link_with_metadata(self, wf_to, wf_from_correct, wf_to_correct,
                           lexicon):
        """Add WordformLinks with metadata.

        Adds a WordformLink between self and another wordform, and vice versa,
        if these links are not yet in the database.
        And adds a WordformLinkSource, with Lexicon, and information about
        which Wordforms are correct according to the Lexicon. No duplicate
        WordformLinkSources are added.

        TODO: add Uniqueconstraint on (wf_from (self), wf_to, lexicon)?

        Inputs:
            wf_to (Wordform): Wordform self will be linked to (and vice versa)
            wf_from_correct (boolean): True if Wordform self is correct
                according to the lexicon, False otherwise.
            wf_to_correct (boolean): True if Wordform wf_to is correct
                according to the lexicon, False otherwise.
            lexicon (Lexicon): The Lexicon that contains the WordformLink
        """
        # Guarantees that both directions exist before they are looked up.
        self.link(wf_to)

        # check whether the WordformLinkSource is already in the database
        forward = self._find_link(wf_to)
        known_lexica = [source.wfls_lexicon for source in forward.wf_links]

        if lexicon not in known_lexica:
            # add WordformLinkSource for link from wf (self) to corr
            WordformLinkSource(forward, wf_from_correct, wf_to_correct, lexicon)

            # add WordformLinkSource for link from corr to wf (self); the
            # correctness flags are swapped because the direction is reversed
            backward = wf_to._find_link(self)
            WordformLinkSource(backward, wf_to_correct, wf_from_correct, lexicon)

    def link_spelling_correction(self, corr, lexicon):
        """Add a spelling correction WordformLink.

        This method sets the booleans that indicate which Wordforms are correct
        (according to the lexicon).

        Inputs:
            corr (Wordform): A correction candidate of Wordform self
            lexicon (Lexicon): The Lexicon that contains the WordformLink
        """
        # self is the misspelling, corr the correct form.
        self.link_with_metadata(corr,
                                wf_from_correct=False,
                                wf_to_correct=True,
                                lexicon=lexicon)

    def __str__(self):
        """Return a short label built from the lowercased wordform."""
        return '<Wordform {}>'.format(self.wordform_lowercase)


class WordformLink(Base):
    """Table for storing links between wordforms.

    A link is directed (from one wordform to another) and is identified by the
    composite primary key (wordform_from, wordform_to).
    """
    __tablename__ = 'wordform_links'

    wordform_from = Column(BigInteger(), ForeignKey('wordforms.wordform_id'), primary_key=True)
    wordform_to = Column(BigInteger(), ForeignKey('wordforms.wordform_id'), primary_key=True)

    # Both columns reference wordforms.wordform_id, so each relationship
    # states explicitly which column it joins on.
    # Backref `Wordform.links`: links that start at a wordform.
    linked_from = relationship('Wordform', backref='links',
                               primaryjoin=(Wordform.wordform_id == wordform_from))
    # Backref `Wordform.links_2_to_1`: links that end at a wordform.
    linked_to = relationship('Wordform', backref='links_2_to_1',
                             primaryjoin=(Wordform.wordform_id == wordform_to))

    def __init__(self, wf1, wf2):
        """Create a directed link from `wf1` to `wf2`.

        Inputs:
            wf1 (Wordform): Wordform the link starts at.
            wf2 (Wordform): Wordform the link points to.
        """
        self.linked_from = wf1
        self.linked_to = wf2

    def __str__(self):
        """Return a label showing both wordforms and the link direction."""
        return '<WordformLink {} -> {}>'.format(self.linked_from.wordform, self.linked_to.wordform)


class WordformLinkSource(Base):
    """
    Table for storing the sources of links between wordforms.

    Wordform links are given by lexica (dictionaries, spelling correction
    lists, etc.). This table records which lexicon a given link between
    wordforms was originally ingested from.
    """
    __tablename__ = 'source_x_wordform_link'
    # wordform_links has a composite primary key, so the reference to it has
    # to be declared as one composite foreign key over both columns.
    __table_args__ = (
        ForeignKeyConstraint(['wordform_from', 'wordform_to'],
                             ['wordform_links.wordform_from', 'wordform_links.wordform_to']),
    )

    source_x_wordform_link_id = Column(BigInteger(), primary_key=True)
    wordform_from = Column(BigInteger(), nullable=False)
    wordform_to = Column(BigInteger(), nullable=False)
    lexicon_id = Column(BigInteger(), ForeignKey('lexica.lexicon_id'))

    # Whether each side of the link is a correct wordform according to the
    # lexicon.
    wordform_from_correct = Column(Boolean)
    wordform_to_correct = Column(Boolean)

    # NOTE(review): `ld` presumably is the Levenshtein distance between the
    # two wordforms -- confirm; nothing in this module sets it.
    ld = Column(Integer())
    anahash_difference = Column(BigInteger())

    # Backref `WordformLink.wf_links`: all sources recorded for a link.
    wfls_wflink = relationship('WordformLink', backref='wf_links')
    # Backref `Lexicon.wfl_lexica`: all link sources attributed to a lexicon.
    wfls_lexicon = relationship('Lexicon', backref='wfl_lexica')

    def __init__(self, wflink, wf_from_correct, wf_to_correct, lexicon):
        """Record that `lexicon` is a source of the link `wflink`.

        Inputs:
            wflink (WordformLink): The link the source is recorded for.
            wf_from_correct (boolean): Stored in `wordform_from_correct`.
            wf_to_correct (boolean): Stored in `wordform_to_correct`.
            lexicon (Lexicon): The lexicon the link comes from.
        """
        self.wfls_wflink = wflink
        self.wordform_from_correct = wf_from_correct
        self.wordform_to_correct = wf_to_correct
        self.wfls_lexicon = lexicon

    def __str__(self):
        """Return a label showing the linked wordforms and the lexicon name."""
        return '<WordformLinkSource {} -> {} in "{}">'.format(
            self.wfls_wflink.linked_from.wordform,
            self.wfls_wflink.linked_to.wordform,
            self.wfls_lexicon.lexicon_name,
        )


class MorphologicalParadigm(Base):
    """
    Table for storing information about morphological paradigms of wordforms.

    The paradigms are determined according to Reynaert's method (to be
    published).
    """
    __tablename__ = 'morphological_paradigms'

    paradigm_id = Column(BigInteger(), primary_key=True)

    # Paradigm code components; each one is indexed so rows can be filtered
    # on any of them.
    # NOTE(review): the meaning of the individual components is not documented
    # in this module -- see the method description referenced above.
    Z = Column(BigInteger(), index=True)
    Y = Column(BigInteger(), index=True)
    X = Column(BigInteger(), index=True)
    W = Column(BigInteger(), index=True)
    V = Column(BigInteger(), index=True)
    word_type_code = Column(String(10), index=True)
    word_type_number = Column(BigInteger(), index=True)

    # The wordform this paradigm entry belongs to.
    wordform_id = Column(BigInteger(), ForeignKey('wordforms.wordform_id'))


class ExternalLink(Base):
    """Table for storing ids from external sources of wordforms.

    Used for linking wordforms to external sources, such as the WNT, MNW, INT.
    """
    __tablename__ = 'external_links'

    external_link_id = Column(BigInteger(), primary_key=True)
    # The wordform the external identifier belongs to.
    wordform_id = Column(BigInteger(), ForeignKey('wordforms.wordform_id'))
    # Name of the external source (at most 5 characters) and the identifier
    # of the wordform within that source (at most 10 characters).
    source_name = Column(String(5))
    source_id = Column(String(10))


class WordformFrequencies(Base):
    """Materialized view containing overall frequencies of wordforms.

    The data in this table can be used to filter wordforms on frequency. This
    is necessary, because there is a lot of noise in the wordforms table, and
    this makes aggregating over all wordforms expensive.
    """
    __tablename__ = 'wordform_frequency'

    # No foreign key to wordforms.wordform_id is declared here.
    wordform_id = Column(BigInteger(), primary_key=True)
    wordform = Column(Unicode(255, convert_unicode=False), index=True,
                      unique=True)
    frequency = Column(BigInteger())


class TicclatVariant(Base):
    """Contains spelling variants of words, ingested from TICCL."""
    __tablename__ = 'ticcl_variants'

    ticclat_variant_id = Column(BigInteger(), primary_key=True)
    # The variant itself; unique within this table.
    wordform = Column(Unicode(255, convert_unicode=False), index=True, unique=True)
    # Text of the source wordform and the foreign key to its wordforms row.
    wordform_source = Column(Unicode(255, convert_unicode=False), index=True)
    wordform_source_id = Column(BigInteger(), ForeignKey('wordforms.wordform_id'), index=True)
    levenshtein_distance = Column(BigInteger(), index=True)
    frequency = Column(BigInteger(), index=True)