# Source code for ticclat.ticclat_schema

# coding: utf-8
# pylint: disable=too-few-public-methods
"""
SQLAlchemy schema of the TICCLAT database.

Contains all the tables of the database and their connections, defined as
SQLAlchemy declarative_base subclasses.

Many of the tables defined here are based on an INT lexicon database created
in the IMPACT project
(https://ivdnt.org/images/stories/onderzoek_en_onderwijs/publicaties/impact/impact_lexicon_structure.pdf).
See https://github.com/TICCLAT/docs/blob/master/database_design.md for more
information about the database design.

Based on this, in TICCLAT, we added tables for:
- links between wordforms
- morphological paradigm groups of wordforms
- anagram hashes from TICCL
- spelling variants from TICCL
- identifiers linking wordforms to external sources like the WNT, MNW, INT.
"""

from sqlalchemy import Column, String, Table, ForeignKey, Unicode, Boolean, \
    Integer, BigInteger, ForeignKeyConstraint
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base

# Declarative base class: every ORM model in this module subclasses it, and
# the plain Table objects are registered on Base.metadata.
Base = declarative_base()


# Association table for the relation between corpora and documents.
# It is passed as ``secondary`` to the Corpus.corpus_documents and
# Document.document_corpora relationships.
corpusId_x_documentId = Table(
    'corpusId_x_documentId',
    Base.metadata,
    Column('corpus_id', BigInteger(), ForeignKey('corpora.corpus_id')),
    Column('document_id', BigInteger(), ForeignKey('documents.document_id')),
)


class TextAttestation(Base):
    """
    Table for storing text attestations.

    A text attestation entry is defined in the INT schema as the occurrence
    and frequency of wordforms in documents. Each row links one wordform to
    one document and stores a frequency for that pair.

    Args:
        document: ``Document`` instance assigned to ``ta_document``.
        wordform: ``Wordform`` instance assigned to ``ta_wordform``.
        frequency: value stored in the ``frequency`` column (presumably the
            number of occurrences of the wordform in the document).
    """
    __tablename__ = 'text_attestations'

    attestation_id = Column(BigInteger(), primary_key=True)
    frequency = Column(BigInteger())
    wordform_id = Column(BigInteger(), ForeignKey('wordforms.wordform_id'))
    document_id = Column(BigInteger(), ForeignKey('documents.document_id'))

    # Two-way relationships; the other sides are
    # Document.document_wordforms and Wordform.wordform_documents.
    ta_document = relationship('Document', back_populates='document_wordforms')
    ta_wordform = relationship('Wordform', back_populates='wordform_documents')

    def __init__(self, document, wordform, frequency):
        # Assigning the related objects (rather than raw ids) lets the ORM
        # fill in document_id and wordform_id on flush.
        self.ta_document = document
        self.ta_wordform = wordform
        self.frequency = frequency
class Corpus(Base):
    """
    Table for storing corpus metadata.

    A corpus has a name and a collection of documents; the link to documents
    goes through the ``corpusId_x_documentId`` association table.
    """
    __tablename__ = 'corpora'

    corpus_id = Column(BigInteger(), primary_key=True)
    name = Column(String(255))

    # Other side of this relationship: Document.document_corpora.
    corpus_documents = relationship('Document',
                                    secondary=corpusId_x_documentId,
                                    back_populates='document_corpora')
class Document(Base):
    """
    Table for storing document metadata.

    Besides bibliographic fields (title, author, publisher, ...), a document
    is linked to the corpora it belongs to and to the text attestations that
    record which wordforms occur in it.
    """
    __tablename__ = 'documents'

    document_id = Column(BigInteger(), primary_key=True)
    persistent_id = Column(String(255), index=True)
    word_count = Column(BigInteger())
    encoding = Column(BigInteger())
    title = Column(String(255))
    year_from = Column(BigInteger())
    year_to = Column(BigInteger())
    pub_year = Column(BigInteger())
    author = Column(String(255))
    editor = Column(String(255))
    publisher = Column(String(255))
    publishing_location = Column(String(255))
    text_type = Column(String(255))
    region = Column(String(255))
    language = Column(String(255))
    other_languages = Column(String(255))
    spelling = Column(String(255))
    # NOTE(review): no foreign key is declared here; presumably this holds the
    # document_id of another document -- confirm against the ingest code.
    parent_document = Column(BigInteger(), index=True)

    # Other side of this relationship: Corpus.corpus_documents.
    document_corpora = relationship('Corpus',
                                    secondary=corpusId_x_documentId,
                                    back_populates='corpus_documents')
    # Other side of this relationship: TextAttestation.ta_document.
    document_wordforms = relationship('TextAttestation',
                                      back_populates='ta_document')
# Table for storing which lexica wordforms occur in.
# It is passed as ``secondary`` to the Lexicon.lexicon_wordforms and
# Wordform.wf_lexica relationships. Unlike corpusId_x_documentId, it has its
# own surrogate primary key (wordform_source_id).
lexical_source_wordform = Table(
    'lexical_source_wordform',
    Base.metadata,
    Column('wordform_source_id', BigInteger(), primary_key=True),
    Column('lexicon_id', BigInteger(), ForeignKey('lexica.lexicon_id')),
    Column('wordform_id', BigInteger(), ForeignKey('wordforms.wordform_id')),
)
class Lexicon(Base):
    """
    Table for storing lexicon metadata.

    vocabulary (bool): if True, all words in this lexicon are (supposed to be)
                       valid words, if False, some are misspelled
    """
    __tablename__ = 'lexica'

    lexicon_id = Column(BigInteger(), primary_key=True)
    lexicon_name = Column(String(255))
    vocabulary = Column(Boolean)

    # Other side of this relationship: Wordform.wf_lexica.
    lexicon_wordforms = relationship('Wordform',
                                     secondary=lexical_source_wordform,
                                     back_populates='wf_lexica')
    # 'WordformLink' and the secondary table are given by name, so they are
    # looked up when the mappers are configured; the link goes through the
    # 'source_x_wordform_link' table.
    lexicon_wordform_links = relationship('WordformLink',
                                          secondary='source_x_wordform_link')

    def __str__(self):
        """Return a short, human-readable label containing the lexicon name."""
        return '<Lexicon {}>'.format(self.lexicon_name)
class Anahash(Base):
    """
    Table for storing anahashes.

    The anahashes in this table have no direct relation to the wordforms,
    those links are tracked in the wordforms table. This was done so that the
    anahashes table can be efficiently searched, e.g. for ranges in anahash
    "space".
    """
    __tablename__ = 'anahashes'

    anahash_id = Column(BigInteger(), primary_key=True)
    # Each anahash value is stored once (unique) and indexed for lookups.
    anahash = Column(BigInteger(), unique=True, index=True)

    def __str__(self):
        """Return a short, human-readable label containing the hash value."""
        return '<Anahash {}>'.format(self.anahash)
class Wordform(Base):
    """
    Table for storing wordforms and associated anahashes.

    A wordform is stored both as-is (unique) and in a separate
    ``wordform_lowercase`` column; it optionally points to an anahash and is
    linked to the lexica and documents it occurs in.
    """
    __tablename__ = 'wordforms'

    wordform_id = Column(BigInteger(), primary_key=True)
    wordform = Column(Unicode(255, convert_unicode=False), unique=True,
                      index=True)
    # Deleting the referenced anahash row sets this column to NULL instead of
    # deleting the wordform.
    anahash_id = Column(BigInteger(),
                        ForeignKey("anahashes.anahash_id",
                                   ondelete='SET NULL'))
    anahash = relationship('Anahash')
    wordform_lowercase = Column(Unicode(255, convert_unicode=False),
                                nullable=False, index=True)

    # Other side of this relationship: Lexicon.lexicon_wordforms.
    wf_lexica = relationship('Lexicon',
                             secondary=lexical_source_wordform,
                             back_populates='lexicon_wordforms')
    # Other side of this relationship: TextAttestation.ta_wordform.
    wordform_documents = relationship('TextAttestation',
                                      back_populates='ta_wordform')

    def __str__(self):
        """Return a short label; note that it shows the lowercase form."""
        return '<Wordform {}>'.format(self.wordform_lowercase)
class WordformLinkSource(Base):
    """
    Table for storing the sources of links between wordforms.

    Wordform links are given by lexica (dictionaries, spelling correction
    lists, etc.). This table records which lexicon a given link between
    wordforms was originally ingested from.

    Args:
        wflink: ``WordformLink`` instance assigned to ``wfls_wflink``.
        wf_from_correct: value for the ``wordform_from_correct`` column.
        wf_to_correct: value for the ``wordform_to_correct`` column.
        lexicon: ``Lexicon`` instance assigned to ``wfls_lexicon``.

    The ``ld`` and ``anahash_difference`` columns are not set by the
    constructor.
    """
    __tablename__ = 'source_x_wordform_link'
    # Composite foreign key: the (wordform_from, wordform_to) pair must match
    # a row of the 'wordform_links' table.
    __table_args__ = (
        ForeignKeyConstraint(['wordform_from', 'wordform_to'],
                             ['wordform_links.wordform_from',
                              'wordform_links.wordform_to']),
    )

    source_x_wordform_link_id = Column(BigInteger(), primary_key=True)

    wordform_from = Column(BigInteger(), nullable=False)
    wordform_to = Column(BigInteger(), nullable=False)
    lexicon_id = Column(BigInteger(), ForeignKey('lexica.lexicon_id'))

    wordform_from_correct = Column(Boolean)
    wordform_to_correct = Column(Boolean)
    # NOTE(review): 'ld' presumably stands for Levenshtein distance -- confirm.
    ld = Column(Integer())
    anahash_difference = Column(BigInteger())

    # The backrefs add 'wf_links' to WordformLink and 'wfl_lexica' to Lexicon.
    wfls_wflink = relationship('WordformLink', backref='wf_links')
    wfls_lexicon = relationship('Lexicon', backref='wfl_lexica')

    def __init__(self, wflink, wf_from_correct, wf_to_correct, lexicon):
        self.wfls_wflink = wflink
        self.wordform_from_correct = wf_from_correct
        self.wordform_to_correct = wf_to_correct
        self.wfls_lexicon = lexicon

    def __str__(self):
        """Return a label showing both linked wordforms and the lexicon name.

        Reads ``linked_from`` and ``linked_to`` on the related WordformLink,
        so both related objects must be set.
        """
        return '<WordformLinkSource {} -> {} in "{}">'.format(
            self.wfls_wflink.linked_from.wordform,
            self.wfls_wflink.linked_to.wordform,
            self.wfls_lexicon.lexicon_name,
        )
class MorphologicalParadigm(Base):
    """
    Table for storing information about morphological paradigms of wordforms.

    The paradigms are determined according to Reynaert's method (to be
    published).
    """
    __tablename__ = 'morphological_paradigms'

    paradigm_id = Column(BigInteger(), primary_key=True)

    # Paradigm code components; every one of them is indexed.
    Z = Column(BigInteger(), index=True)
    Y = Column(BigInteger(), index=True)
    X = Column(BigInteger(), index=True)
    W = Column(BigInteger(), index=True)
    V = Column(BigInteger(), index=True)
    word_type_code = Column(String(10), index=True)
    word_type_number = Column(BigInteger(), index=True)

    # The wordform this paradigm entry belongs to.
    wordform_id = Column(BigInteger(), ForeignKey('wordforms.wordform_id'))
class WordformFrequencies(Base):
    """
    Materialized view containing overall frequencies of wordforms.

    The data in this table can be used to filter wordforms on frequency. This
    is necessary, because there is a lot of noise in the wordforms table, and
    this makes aggregating over all wordforms expensive.
    """
    __tablename__ = 'wordform_frequency'

    # No foreign key to the wordforms table is declared on this column.
    wordform_id = Column(BigInteger(), primary_key=True)
    wordform = Column(Unicode(255, convert_unicode=False), index=True,
                      unique=True)
    frequency = Column(BigInteger())
class TicclatVariant(Base):
    """
    Contains spelling variants of words, ingested from TICCL.

    Each row stores a variant wordform (unique), the text and id of the
    wordform it is linked to, and a Levenshtein distance and frequency.
    """
    __tablename__ = 'ticcl_variants'

    ticclat_variant_id = Column(BigInteger(), primary_key=True)
    wordform = Column(Unicode(255, convert_unicode=False), index=True,
                      unique=True)
    wordform_source = Column(Unicode(255, convert_unicode=False), index=True)
    # References the wordforms table.
    wordform_source_id = Column(BigInteger(),
                                ForeignKey('wordforms.wordform_id'),
                                index=True)
    levenshtein_distance = Column(BigInteger(), index=True)
    frequency = Column(BigInteger(), index=True)