"""Example implementation of PyDistintoX as a library.

(Last updated on 2026-03-30)
"""
# standard library imports
from pathlib import Path
# application-specific imports
from pydistintox import *
# Download the spaCy model before anything else runs.
# Caution: the download command installs a Python package into your
# environment. For the new package to be found after installation, you MAY
# need to restart or reload your Python process.
# See https://gitlab.com/leongluesing/PyDistintoX/#troubleshooting
# The same model name is passed to Config as `spacy_model_name` further down.
SPACY_MODEL_NAME = 'en_core_web_sm'
spacy_download(SPACY_MODEL_NAME)
# paths
# Replace these with the locations of your own data where needed.
# As written, everything is resolved relative to the current working
# directory; when the script is executed in the root directory of the
# pydistintox project, the inputs are the example texts by Conan Doyle.
working_dir = Path.cwd()
data_dir = working_dir / 'data'
example_texts_dir = data_dir / 'texts' / 'example_texts'
results_dir = data_dir / 'results'

# input corpora (passed to Config as `target` and `reference`)
INPUT_TAR = example_texts_dir / 'corp_tar'
INPUT_REF = example_texts_dir / 'corp_ref'

# interim data (passed to Config as `save_nlp`)
JSON = data_dir / 'interim' / 'json'

# output directories
RESULTS_SCORES = results_dir / 'scores'
RESULTS_CORRELATIONS = results_dir / 'correlations_of_measures'
# further parameters
# All settings are handed to the library through a single Config object.
config = Config(
    # input corpora
    target=INPUT_TAR,
    reference=INPUT_REF,
    # NLP settings; the model name matches the one given to spacy_download
    spacy_model_name='en_core_web_sm',
    skip_nlp=False,
    save_nlp=JSON,
    # measures
    measures_to_calculate=['zeta_sd2', 'afn'],
    scaling=True,
    # miscellaneous; `source` holds a placeholder text you should replace
    source='YOUR COMMENT HERE',
    verbose=True,
)
# start the application
# The call yields three values, unpacked here and used by the steps below.
td_matrices, scores_tf_idf, terms = compute_td_matrices_and_measures(config)

# calculate scores from non-tf-idf measures, based on the matrices from above
scores_non_tf_idf = calculate_scores(matrices=td_matrices, config=config)
# format scores
# Merge the tf-idf and the non-tf-idf scores into one collection first.
scores_all = scores_tf_idf | scores_non_tf_idf
# The keys are reused later to build the names of the HTML files.
measures = list(scores_all)
scores_processed = format_scores(
    scores_all=scores_all,
    terms=terms,
    header_terms='Term',
    header_scores='Score',
)
# save scores
# One CSV file per entry of `scores_processed` is written to RESULTS_SCORES,
# named after the entry's key. The loop body must be indented; without the
# indentation Python rejects the file with an IndentationError.
for name, score in scores_processed.items():
    save_results(
        result=score,
        path=RESULTS_SCORES / f'{name}.csv',
    )
# calculate rank correlation between scores
correlation_matrix, index = calculate_rank_correlation(scores_all)

# create visualizations
# One visualization per entry of `scores_processed`, titled with its key.
# The loop body must be indented; without the indentation Python rejects
# the file with an IndentationError.
for measure, score in scores_processed.items():
    visualize_score(
        score_processed=score,
        title=measure,
        path=RESULTS_SCORES,
    )

# The correlation plot uses no loop variable, so it is created once,
# after the loop.
visualize_rank_correlations(
    correlation_matrix,
    index,
    dir_path=RESULTS_CORRELATIONS,
)
# show visualizations
# Build an index over one '<measure>.html' file name per computed measure.
html_files = [f'{measure_name}.html' for measure_name in measures]
write_html_index(file_names=html_files, dir=RESULTS_SCORES)

# Open the index page and the correlation heatmap.
# NOTE(review): presumably 'index.html' and 'heatmap.png' are the files
# produced by write_html_index and visualize_rank_correlations - confirm.
# NOTE(review): open_html is called with a PNG file here - confirm that it
# handles non-HTML files as intended.
index_page = RESULTS_SCORES / 'index.html'
heatmap_image = RESULTS_CORRELATIONS / 'heatmap.png'
open_html(index_page)
open_html(heatmap_image)