"""Example implementation of PyDistintoX as a library.

(Last updated on 2026-05-06)
"""
# standard library imports
from pathlib import Path
# application-specific imports
from pydistintox import *
# Download the spaCy model; the same model name is passed to Config below
# as `spacy_model_name`.
spacy_download("en_core_web_sm")
# Caution: the download command installs a Python package into your
# current environment. For that package to be found after installation,
# you MAY need to restart or reload your Python process so that newly
# installed packages are recognized.
# See https://gitlab.com/pydistintox/pydistintox#troubleshooting
# paths
# Replace these with the locations of your own data where needed.
# As written, everything is resolved against the current working directory,
# so running the script from the root dir of the pydistintox project picks
# up the example texts by Conan Doyle.
working_dir = Path.cwd()
data_dir = working_dir / "data"
example_texts_dir = data_dir / "example_texts"
results_dir = data_dir / "results"

# input corpora
INPUT_TAR = example_texts_dir / "tar"
INPUT_REF = example_texts_dir / "ref"
# interim data
JSON = data_dir / "interim" / "json"
# output directories
RESULTS_SCORES = results_dir / "scores"
RESULTS_CORRELATIONS = results_dir / "correlations_of_measures"
# further parameters
# All options are passed by keyword; they are grouped by topic here.
config = Config(
    # corpora (directories defined in the paths section)
    target=INPUT_TAR,
    reference=INPUT_REF,
    # NLP step; the model name is the one downloaded above
    spacy_model_name="en_core_web_sm",
    skip_nlp=False,
    save_nlp=JSON,
    # measures
    measures_to_calculate=["zeta_sd2", "afn"],
    scaling=True,
    # miscellaneous
    verbose=True,
    source="YOUR COMMENT HERE",  # placeholder text; replace with your own
)
# start the application
# The two values returned here are handed on below as
# `parsed_corp_tar` and `parsed_corp_ref`.
parsed_corp_tar, parsed_corp_ref = generate_parsed_corpora(config)
td_matrices, scores_tf_idf, terms = compute_td_matrices_and_measures(
    parsed_corp_tar=parsed_corp_tar,
    parsed_corp_ref=parsed_corp_ref,
    config=config,
)
# calculate scores from non-tf-idf measures (based on the matrices from above)
scores_non_tf_idf = calculate_scores(matrices=td_matrices, config=config)
# format scores
# Merge both score collections; on duplicate keys the non-tf-idf entry wins.
scores_all = scores_tf_idf | scores_non_tf_idf
measures = list(scores_all)
scores_processed = format_scores(
    scores_all=scores_all,
    terms=terms,
    header_scores="Score",
    header_terms="Term",
)
# save one CSV file per entry of the formatted scores
for name, score in scores_processed.items():
    csv_path = RESULTS_SCORES / f"{name}.csv"
    save_results(result=score, path=csv_path)
# calculate rank correlation between scores
correlation_matrix, index = calculate_rank_correlation(scores_all)
# create visualizations: one per formatted score, titled with its key ...
for measure, score in scores_processed.items():
    visualize_score(score_processed=score, title=measure, path=RESULTS_SCORES)
# ... and one for the rank correlations
visualize_rank_correlations(
    correlation_matrix,
    index,
    dir_path=RESULTS_CORRELATIONS,
)
# show visualizations
# The index lists one "<measure>.html" entry per calculated measure.
score_pages = [f"{name}.html" for name in measures]
write_html_index(file_names=score_pages, dir=RESULTS_SCORES)
# NOTE(review): "index.html" is presumably the file written by
# write_html_index, and "heatmap.html" the output of the rank-correlation
# visualization -- confirm against the library.
for page in (RESULTS_SCORES / "index.html", RESULTS_CORRELATIONS / "heatmap.html"):
    open_html(page)