# Example Implementation of PyDistintoX as a Library
#
# (Last updated on 2026-03-30)

# standard library imports
from pathlib import Path

# application-specific imports
from pydistintox import *

# Download the spaCy model that is also named in the Config further down.
# Caution: the download command installs a Python package into your
# environment. For the package to be found after installation, you MAY need to
# restart or reload your Python process so that new packages are recognized.
# See https://gitlab.com/leongluesing/PyDistintoX/#troubleshooting
model_to_download = 'en_core_web_sm'
spacy_download(model_to_download)

# Paths
# Replace these with the locations of your own data where needed.
# As written, everything is resolved against the current working directory;
# when the script is executed in the root dir of the pydistintox project, the
# input paths point at the example texts by Conan Doyle.
working_dir = Path.cwd()
data_dir = working_dir / 'data'
example_texts_dir = data_dir / 'texts' / 'example_texts'
results_dir = data_dir / 'results'

# input corpora (passed to Config as target / reference)
INPUT_TAR = example_texts_dir / 'corp_tar'
INPUT_REF = example_texts_dir / 'corp_ref'

# interim JSON directory (passed to Config as save_nlp)
JSON = data_dir / 'interim' / 'json'

# output directories
RESULTS_SCORES = results_dir / 'scores'
RESULTS_CORRELATIONS = results_dir / 'correlations_of_measures'

# Further parameters
# All settings for the run are bundled in a single Config object that is
# handed to the library calls below.
config = Config(
    skip_nlp=False,
    save_nlp=JSON,  # interim JSON directory defined in the paths section
    spacy_model_name='en_core_web_sm',  # same model name as downloaded above
    target=INPUT_TAR,
    reference=INPUT_REF,
    verbose=True,
    source='YOUR COMMENT HERE',  # placeholder text, replace with your own
    scaling=True,
    measures_to_calculate=['zeta_sd2', 'afn'],
)

# Start the application. The call yields three results that are reused in the
# steps below: the matrices, the tf-idf scores and the terms.
td_matrices, scores_tf_idf, terms = compute_td_matrices_and_measures(config)

# Calculate the scores of the non-tf-idf measures from the matrices.
scores_non_tf_idf = calculate_scores(matrices=td_matrices, config=config)

# Merge both score collections and remember the names of all measures.
scores_all = scores_tf_idf | scores_non_tf_idf
measures = list(scores_all.keys())

# Format the merged scores using the given column headers.
scores_processed = format_scores(
    scores_all=scores_all, terms=terms, header_scores='Score', header_terms='Term'
)

# Save every formatted score to its own CSV file in the scores directory.
for name, score in scores_processed.items():
    csv_path = RESULTS_SCORES / f'{name}.csv'
    save_results(result=score, path=csv_path)

# Rank correlation between the scores of all measures.
correlation_matrix, index = calculate_rank_correlation(scores_all)

# Visualize each formatted score; the measure name is used as the title and
# RESULTS_SCORES is passed as the path.
for measure, score in scores_processed.items():
    visualize_score(score_processed=score, title=measure, path=RESULTS_SCORES)

# Visualize the rank correlations; RESULTS_CORRELATIONS is passed as dir_path.
visualize_rank_correlations(correlation_matrix, index, dir_path=RESULTS_CORRELATIONS)

# Show visualizations
# Write an HTML index listing one '<measure>.html' entry per measure.
html_files = [f'{measure_name}.html' for measure_name in measures]
write_html_index(file_names=html_files, dir=RESULTS_SCORES)

# Open the index page and the correlation heatmap.
# NOTE(review): a PNG path is handed to open_html here; confirm that the
# helper is meant to handle non-HTML files.
index_page = RESULTS_SCORES / 'index.html'
heatmap_image = RESULTS_CORRELATIONS / 'heatmap.png'
open_html(index_page)
open_html(heatmap_image)