"""Example implementation of PyDistintoX as a library.

(Last updated on 2026-05-06)
"""

# standard library imports
from pathlib import Path

# application-specific imports
from pydistintox import *

# Download the spaCy model; the same model name is passed to Config below
# as spacy_model_name.
spacy_download("en_core_web_sm")
# Caution: the download command installs a Python package into your environment.
# For the package to be found after installation, you may need to restart or
# reload your Python process so that newly installed packages are recognized.
# See https://gitlab.com/pydistintox/pydistintox#troubleshooting

# Paths
# Replace these with the locations of your own data where needed.
# By default everything is resolved relative to the current working directory;
# when the script is executed in the root directory of the pydistintox project,
# the inputs are the example texts by Conan Doyle.
working_dir = Path.cwd()
data_dir = working_dir / "data"
example_texts_dir = data_dir / "example_texts"
results_dir = data_dir / "results"

# input corpora (target and reference)
INPUT_TAR = example_texts_dir / "tar"
INPUT_REF = example_texts_dir / "ref"

# interim JSON location (passed to Config as save_nlp)
JSON = data_dir / "interim" / "json"

# output locations
RESULTS_SCORES = results_dir / "scores"
RESULTS_CORRELATIONS = results_dir / "correlations_of_measures"

# Further parameters
# The keyword arguments are grouped by topic; since they are passed by
# keyword, their order does not matter.
config = Config(
    # input corpora
    target=INPUT_TAR,
    reference=INPUT_REF,
    # NLP-related options (the model name matches the spacy_download call above)
    spacy_model_name="en_core_web_sm",
    skip_nlp=False,
    save_nlp=JSON,
    # measures
    measures_to_calculate=["zeta_sd2", "afn"],
    scaling=True,
    # miscellaneous
    source="YOUR COMMENT HERE",
    verbose=True,
)

# Start the application: obtain the parsed target and reference corpora,
# then derive the matrices, the tf-idf scores and the terms from them.
target_corpus, reference_corpus = generate_parsed_corpora(config)
matrices, scores_tf_idf, terms = compute_td_matrices_and_measures(
    parsed_corp_tar=target_corpus,
    parsed_corp_ref=reference_corpus,
    config=config,
)

# Calculate the scores of the non-tf-idf measures from the same matrices.
scores_non_tf_idf = calculate_scores(matrices=matrices, config=config)

# Format scores
# Merge the tf-idf and non-tf-idf scores into one mapping and remember its
# keys; they are used later to build the HTML file names.
scores_all = scores_tf_idf | scores_non_tf_idf
measures = list(scores_all)
scores_processed = format_scores(
    scores_all=scores_all,
    terms=terms,
    header_terms="Term",
    header_scores="Score",
)

# Save every formatted score as its own CSV file in the scores directory.
for measure_name, table in scores_processed.items():
    csv_path = RESULTS_SCORES / f"{measure_name}.csv"
    save_results(result=table, path=csv_path)

# Calculate the rank correlation between the scores.
correlation_matrix, corr_index = calculate_rank_correlation(scores_all)

# Create visualizations: one per formatted score, titled with its key,
# followed by the rank-correlation visualization.
for title, table in scores_processed.items():
    visualize_score(score_processed=table, title=title, path=RESULTS_SCORES)
visualize_rank_correlations(
    correlation_matrix,
    corr_index,
    dir_path=RESULTS_CORRELATIONS,
)

# Show visualizations
# Write an HTML index over the per-measure pages, then open the index and
# the heatmap.
score_pages = [f"{name}.html" for name in measures]
write_html_index(file_names=score_pages, dir=RESULTS_SCORES)

for page in (
    RESULTS_SCORES / "index.html",
    RESULTS_CORRELATIONS / "heatmap.html",
):
    open_html(page)