diff --git a/.gitignore b/.gitignore index cc76b10..d5f232d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,6 @@ build/ __pycache__/ codauto/__pycache__/ codauto/.ipynb_checkpoints/ -pruebas_codifier.py \ No newline at end of file +pruebas_codifier.py +*.venv +.idea/ \ No newline at end of file diff --git a/codauto/__init__.py b/codauto/__init__.py index 4e4faa1..1a8ba5d 100644 --- a/codauto/__init__.py +++ b/codauto/__init__.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- from .utils import * from .codifier import * -from .prompt_maker import * -from .prompt_text import * from .structured import * from .codifier_fasttext import * \ No newline at end of file diff --git a/codauto/codifier.py b/codauto/codifier.py index 6388e43..e2906c2 100644 --- a/codauto/codifier.py +++ b/codauto/codifier.py @@ -40,6 +40,7 @@ class Codifier(ABC): train_df (pd.DataFrame): Cleaned training dataset. test_df (pd.DataFrame): Cleaned testing dataset with hierarchical labels. language (str): The language of your textual data. + preprocess (bool): Specifies whether the text from the training, test, or prediction set should be preprocessed. 
Methods: get_correspondences(corres_df): @@ -96,15 +97,17 @@ class Codifier(ABC): def __init__(self, structure_instance, - train_df, - test_df, root_path, + train_df=None, + test_df=None, corres_df=None, min_lenght_texts=3, - language='es' + language='es', + preprocess = True ): self.root_path = root_path self.language = language + self.preprocess = preprocess os.makedirs(self.root_path, exist_ok=True) self.logger = logging.getLogger(f'CNAECodifier.{id(self)}') self.logger.setLevel(logging.INFO) @@ -115,16 +118,16 @@ def __init__(self, stream_handler.setFormatter(formatter) if not self.logger.hasHandlers(): self.logger.addHandler(stream_handler) - self.logger.propagate = False + self.logger.propagate = True self.min_lenght_texts = min_lenght_texts self.structure = structure_instance self.model = None if corres_df is not None: self.correspondences = self.get_correspondences(corres_df) - else: - self.correspondences = None - self.train_df = self.get_train_dataset(train_df, 'train_df') - self.test_df = self.get_test_dataset(test_df, 'test_df') + if train_df is not None: + self.train_df = self.get_train_dataset(train_df, 'train_df') + if test_df is not None: + self.test_df = self.get_test_dataset(test_df, 'test_df') def get_correspondences(self, correspondences_df): """ @@ -232,10 +235,11 @@ def clean_data(self, data_df): data_df[col_n_0] = data_df[col_n_0].apply( lambda n: self.get_code(str(n).replace('.', '')) ) - data_df[col_n_1] = data_df[col_n_1].apply(preprocess_text, args=(self.language,)) - data_df[col_n_1] = data_df[col_n_1].apply( - lambda desc: np.nan if len(desc) <= self.min_lenght_texts else desc - ) + if self.preprocess: + data_df[col_n_1] = data_df[col_n_1].apply(preprocess_text, args=(self.language,)) + data_df[col_n_1] = data_df[col_n_1].apply( + lambda desc: np.nan if len(desc) <= self.min_lenght_texts else desc + ) return data_df def load_data(self, data_df, data_name='data_df'): @@ -960,7 +964,7 @@ def predict( raw_predictions = 
self.get_pred_for_batch( desc_l_not_direct_recoding, idxs_not_direct_recoding, - True + self.preprocess ) for idx, original_code in zip( diff --git a/codauto/codifier_bert.py b/codauto/codifier_bert.py new file mode 100644 index 0000000..8ff36a1 --- /dev/null +++ b/codauto/codifier_bert.py @@ -0,0 +1,418 @@ +# -*- coding: utf-8 -*- +# ------------------------------------------------------------------------------ +# Copyright (C) [2025] Instituto Nacional de Estadística +# +# Este archivo forma parte del proyecto statcodgen. +# +# Licenciado bajo la Licencia Pública de la Unión Europea (EUPL) v.1.2. +# Puede obtener una copia de la licencia en la raiz de este proyecto o en: +# https://eupl.eu/1.2/es/ +# +# A menos que se indique lo contrario, este software se distribuye +# "TAL CUAL", SIN GARANTÍAS NI CONDICIONES DE NINGÚN TIPO. +# Consulte la licencia para conocer los términos específicos. +# ------------------------------------------------------------------------------ +# Copyright (C) [2025] National Institute of Statistics +# +# This file is part of the statcodgen project. +# +# Licensed under the European Union Public License (EUPL) v.1.2. +# You can obtain a copy of the license at the root of this project or at: +# https://eupl.eu/1.2/es/ +# +# Unless otherwise indicated, this software is distributed +# "AS IS", WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. +# See the license for specific terms. 
+# ------------------------------------------------------------------------------ +""" +Created on Tue Sep 9 13:40:12 2025 + +@author: U853768 +""" +import numpy as np +import torch +from transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, + Trainer, + TrainingArguments, + DataCollatorWithPadding, + pipeline, +) +import os +from datasets import Dataset, ClassLabel, DatasetDict +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, top_k_accuracy_score +from sklearn.preprocessing import LabelEncoder +from codifier import Codifier + + +class CodifierBERT(Codifier): + """ + CodifierBERT is a BERT-based text classification codifier built on the Hugging Face + Transformers ecosystem. It extends the `Codifier` base class to support model + training, evaluation, inference, and persistence for hierarchical or structured + text classification tasks. + + The class automates dataset preparation, tokenizer setup, metric computation, + model fine-tuning, and prediction generation. + + Parameters + ---------- + structure_instance : object + Structured child instance representing label hierarchy or taxonomy information. + train_df : pandas.DataFrame + Training dataset containing two columns, first must be the label and second the text. + test_df : pandas.DataFrame + Test dataset for evaluation purposes. It must have 3 columns: first column must be label, second text and third source (source can be empty) + root_path : str + Path to the directory where models, logs, and checkpoints are stored. + model_id : str + Hugging Face model identifier (e.g., 'bert-base-uncased'). + corres_df : pandas.DataFrame, optional + Correspondence dataframe linking labels or hierarchical mappings with other classifications. + min_lenght_texts : int, default=3 + Minimum number of tokens required for a text to be considered valid. + + Attributes + ---------- + tokenizer : AutoTokenizer + Tokenizer loaded from Hugging Face using `model_id`. 
+ label_encoder : LabelEncoder + Scikit-learn label encoder for mapping textual labels to integers. + id2label : dict of int to str + Mapping from integer class indices to label strings. + label2id : dict of str to int + Reverse mapping from label strings to class indices. + class_label : datasets.ClassLabel + Hugging Face datasets label handler for encoding. + device : torch.device + Active computing device ('cuda' or 'cpu'). + train_data : datasets.DatasetDict + Processed Hugging Face dataset for training and validation. + classifier : transformers.Pipeline + Inference pipeline for batched predictions. + """ + + def __init__(self, + structure_instance, + root_path, + model_id, + train_df=None, + test_df=None, + corres_df=None, + min_lenght_texts=3, + device='cuda:1', + preprocess=True, + language='es' + ): + super().__init__( + structure_instance=structure_instance, + train_df=train_df, + test_df=test_df, + root_path=root_path, + corres_df=corres_df, + min_lenght_texts=min_lenght_texts, + preprocess=preprocess, + language=language + ) + self.model_id = model_id + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) + self.label_encoder = LabelEncoder() + self.label_encoder.fit(list(self.structure.reversed_hierarchy.keys())) + self.id2label = {i: str(label) for i, label in enumerate(self.label_encoder.classes_)} + self.label2id = {label: i for i, label in self.id2label.items()} + self.class_label = ClassLabel(names=self.label_encoder.classes_.tolist()) + self.device = device + self.train_data = None + self.classifier = None + + def tokenize(self, example, max_seq_len=256): + """ + Tokenizes input text examples for model ingestion. + + Parameters + ---------- + example : dict + Input example containing the 'target' text field. + max_seq_len : int, default=256 + Maximum sequence length for padding and truncation. + + Returns + ------- + dict + Tokenized representation of the input suitable for model processing. 
+ """ + return self.tokenizer( + example['target'], + padding='max_length', + truncation=True, + max_length=max_seq_len + ) + + def get_pipeline(self): + """ + Initializes a Hugging Face text-classification pipeline for inference. + + Notes + ----- + Automatically selects the appropriate device (GPU or CPU) based on availability. + """ + device = 0 + if self.device == 'cpu': + device = -1 + self.classifier = pipeline( + "text-classification", + model=self.model, + tokenizer=self.tokenizer, + top_k=None, + device=device + ) + + @staticmethod + def compute_metrics(eval_pred): + """ + Computes performance metrics for model evaluation. + + Parameters + ---------- + eval_pred : tuple + A tuple containing (logits, labels) from the evaluation step. + + Returns + ------- + dict + Dictionary containing the following metrics: + - 'accuracy': float + - 'f1_macro': float + - 'precision_macro': float + - 'recall_macro': float + - 'top_1_accuracy': float + - 'top_2_accuracy': float + """ + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + num_classes = logits.shape[1] + class_labels = np.arange(num_classes) + accuracy = accuracy_score(labels, predictions) + f1 = f1_score(labels, predictions, average='macro', zero_division=0) + precision = precision_score(labels, predictions, average='macro', zero_division=0) + recall = recall_score(labels, predictions, average='macro', zero_division=0) + top_1_acc = top_k_accuracy_score(labels, logits, k=1, labels=class_labels) + top_2_acc = top_k_accuracy_score(labels, logits, k=2, labels=class_labels) + return { + 'accuracy': accuracy, + 'f1_macro': f1, + 'precision_macro': precision, + 'recall_macro': recall, + 'top_1_accuracy': top_1_acc, + 'top_2_accuracy': top_2_acc, + } + + def set_train_data(self, batched, test_size, seed): + """ + Prepares and encodes training and validation datasets using Hugging Face Datasets. + + Parameters + ---------- + batched : bool + Whether to apply transformations in batched mode. 
+ test_size : float + Fraction of data reserved for validation. + seed : int + Random seed for reproducibility. + + Returns + ------- + None + The prepared dataset is stored in `self.train_data`. + """ + train_df = self.train_df.copy() + train_df.columns = ['label', 'target'] + data = DatasetDict({ + 'train': Dataset.from_pandas(train_df[['label', 'target']]) + }) + data = data.map( + lambda x: {'label': self.label_encoder.transform(x['label'])}, batched=batched + ) + data = data.cast_column('label', self.class_label) + data = data['train'].train_test_split( + test_size=test_size, seed=seed#, stratify_by_column='label' + ) + data["validation"] = data.pop("test") + self.train_data = data + + def train(self, train_args={}, batched=True, test_size=0.05, seed=42): + """ + Fine-tunes the BERT model using Hugging Face's Trainer API. + + This method configures and launches fine-tuning for the underlying + transformer model (e.g., BERT, RoBERTa, DistilBERT) on a labeled dataset + for sequence classification tasks. It uses the Hugging Face `Trainer` + and `TrainingArguments` APIs for efficient and configurable training. + + Parameters + ---------- + train_args : dict, optional + Additional or overriding keyword arguments for `TrainingArguments`. + Common keys include: + - output_dir : str + Directory to save logs and model outputs. + - num_train_epochs : int + Number of training epochs. Default is 2. + - learning_rate : float + Learning rate for the optimizer. Default is 5e-5. + - lr_scheduler_type : str + Type of learning rate scheduler (e.g., 'linear', 'cosine'). + - per_device_train_batch_size : int + Training batch size per device. Default is 16. + - per_device_eval_batch_size : int + Evaluation batch size per device. Default is 32. + - gradient_accumulation_steps : int + Number of update steps to accumulate before performing a backward/update pass. + - warmup_ratio : float + Fraction of total steps used for learning rate warmup. 
+ - weight_decay : float + Weight decay (L2 regularization) coefficient. + - logging_dir : str + Directory for training logs. + - logging_strategy : str + When to log metrics ('epoch', 'steps', etc.). Default is 'epoch'. + - evaluation_strategy : str + When to run evaluation ('epoch', 'steps', etc.). Default is 'epoch'. + - save_strategy : str + Model checkpoint saving strategy. Default is 'no'. + - fp16 : bool + Whether to use mixed precision (float16) training. Default is False. + - load_best_model_at_end : bool + Whether to reload the best model after training. Default is False. + - report_to : list + Integrations for logging/reporting (e.g., ['tensorboard', 'wandb']). + - log_level : str + Logging verbosity level. Default is 'warning'. + + batched : bool, default=True + Whether tokenization should be applied in batches during dataset mapping. + + test_size : float, default=0.05 + Fraction of the dataset reserved for validation during training. + + seed : int, default=42 + Random seed used for reproducibility during dataset splitting. + + Returns + ------- + None + Trains and initializes the classification pipeline. 
+ """ + args = dict( + output_dir=self.root_path, + num_train_epochs=2, + learning_rate=5e-5, + lr_scheduler_type='linear', + per_device_train_batch_size=16, + per_device_eval_batch_size=32, + gradient_accumulation_steps=2, + warmup_ratio=0.1, + weight_decay=0.01, + logging_dir='./logs', + logging_strategy='epoch', + eval_strategy='epoch', + save_strategy='no', + fp16=False, + load_best_model_at_end=False, + report_to=[], + log_level='warning', + max_seq_len = 256, + local_rank = -1 + ) + args.update(train_args) + if self.train_data is None: + self.set_train_data(batched, test_size, seed) + model = AutoModelForSequenceClassification.from_pretrained( + self.model_id, + num_labels=len(self.id2label), + id2label=self.id2label, + label2id=self.label2id, + ).to(self.device) + tokenized_data = self.train_data.map(lambda batch: self.tokenize(batch, max_seq_len=args['max_seq_len']), batched=batched) + data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) + args_to_train = args.copy() + args_to_train.pop('max_seq_len', None) + training_args = TrainingArguments(**args_to_train) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_data['train'], + eval_dataset=tokenized_data['validation'], + compute_metrics=self.compute_metrics, + data_collator=data_collator + ) + trainer.train() + self.model = trainer.model + self.get_pipeline() + + def save(self, name='bert_model', save_tok=True): + """ + Saves the trained model and optionally its tokenizer. + + Parameters + ---------- + name : str, default='bert_model' + Directory name for saving model. + save_tok : bool, default=True + Whether to save the tokenizer. + + Returns + ------- + None + """ + path = os.path.join(self.root_path, name) + if save_tok: + self.tokenizer.save_pretrained(path) + self.model.save_pretrained(path) + + def load(self, name, load_tok=True): + """ + Loads a previously trained model and tokenizer from disk. 
+ + Parameters + ---------- + name : str + Name of the saved model directory. + load_tok : bool, default=True + Whether to load the tokenizer. + + Returns + ------- + None + Loads model and initializes the pipeline. + """ + path = os.path.join(self.root_path, name) + if load_tok: + self.tokenizer = AutoTokenizer.from_pretrained(path) + self.model = AutoModelForSequenceClassification.from_pretrained(path) + self.get_pipeline() + + def get_pred_for_batch(self, samples): + """ + Generate predictions for a batch of text samples using Hugging Face pipeline. + + Parameters + ---------- + samples : list of str + Input text samples to classify. + + Returns + ------- + list of tuple + Each element contains: + - labels (list of str): predicted labels sorted by confidence. + - confidences (list of float): corresponding probabilities. + """ + results = self.classifier(samples, truncation=True) + formatted = [None]*len(samples) + for index, sample_result in enumerate(results): + labels = [item["label"] for item in sample_result] + confidences = [item["score"] for item in sample_result] + formatted[index] = (labels, confidences) + return formatted diff --git a/codauto/prompt_maker.py b/codauto/prompt_maker.py deleted file mode 100644 index d5f7dac..0000000 --- a/codauto/prompt_maker.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Jul 31 13:59:16 2025 - -@author: git.metodologia@ine.es -""" -from nltk.corpus import stopwords -from nltk.tokenize import word_tokenize -from string import punctuation - -from codauto.prompt_text import prompt_synt_data -from codauto.prompt_text import prompt_aug_data - - -def fill_prompt_synt_data(title, includes, prompt=prompt_synt_data): - """ - Fill a synthetic prompt template with the provided title and included content. - - This function replaces the placeholders '' and '<includes>' in the - prompt template with the actual `title` and `includes` values supplied. 
- - Parameters: - title (str): The title to insert into the prompt. - includes (str): Additional content or context to insert into the prompt. - prompt (str, optional): The synthetic prompt template to use. Defaults to global `prompt_synt_data`. - - Returns: - str: The prompt text with the placeholders replaced by the provided values. - """ - data_text = prompt.replace( - '<title>', title - ).replace( - '<includes>', includes - ) - return data_text - - -def fill_prompt_aug_data(title, includes, language='spanish', prompt=prompt_aug_data): - """ - Fill a prompt template with keywords extracted from a title and additional content. - - This function tokenizes the input `title`, removes stopwords and punctuation according - to the specified `language`, and replaces the placeholders '<title>' and '<includes>' - in the prompt template with the filtered keywords and the additional content. - - Parameters: - title (str): The input title from which to extract keywords. - includes (str): Additional content or context to insert into the prompt. - language (str, optional): Language for stopword removal (default is 'spanish'). - prompt (str, optional): The prompt template to fill (default is a global variable `prompt_aug_data`). - - Returns: - str: The filled-in prompt with keywords and additional content. 
- """ - stop_words = set(stopwords.words(language)) - stop_words.update(punctuation) - words = word_tokenize(title) - words_list = set([word for word in words if word not in stop_words]) - data_text = prompt.replace( - '<title>', str(words_list) - ).replace( - '<includes>', includes - ) - return data_text diff --git a/codauto/prompt_text.py b/codauto/prompt_text.py deleted file mode 100644 index 35cc374..0000000 --- a/codauto/prompt_text.py +++ /dev/null @@ -1,75 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Jul 31 14:03:15 2025 - -@author: git.metodologia@ine.es -""" - -prompt_synt_data = """ -Eres un experto en generación de datos de entrenamiento para modelos de clasificación. -Tu tarea es tomar una frase raíz de una actividad económica general y generar frases de entrenamiento a partir de ejemplos concretos incluidos en un texto. -Cuando corresponda, procura introducir la palabra 'otras' u 'otros' y también el uso o destino del producto. -Siempre que sea fabricación de un objeto, incluye el material con el que se hace. -Evita generar frases completamente distintas o que cambien el contexto y devuelve ÚNICAMENTE las frases sin presentaciones, comentarios, explicaciones ni frases adicionales. -Evita introducir tu output con frases como 'aquí te dejo las frases generadas' o similares. -Quiero utilizar directamente tu output así que SOLO quiero las frases. -Aquí tienes un ejemplo: -Raiz: Elaboración de productos de panadería y pastelería. -Texto: Esta clase incluye la fabricación de cualquier producto de panadería como: barras de pan, pasteles, otras harinas y pan para pienso. -Salida: -Elaboración de barras de pan. -Elaboración de pan para pienso. -roducción de productos de pasteles. -Elaboración de otras harinas. 
-Ahora hazlo tú con la siguiente raíz y texto: -Raiz: <title> -Texto: <includes> -""" - -prompt_aug_data = """ -Estoy haciendo frases de entrenamiento para entrenar de manera supervisada un modelo de clasificación de actividades económicas en base a un estandar. -Dispongo de un título de clase y de una descripción de cada clase posible del estandar -Necesito tu ayuda para que me generes un diccioniario de sinónimos para poder hacer un set de entrenamiento para dicha clase -Se te va a proporcionar una lista de palabras y la descripción de la clase -Quiero que me devuelvas todo los ejemplos concretos o sinónimos de las palabras incluidas en la lista que NO aparezcan en el texto -Ademas esos ejemplos tienen que tener sentido dentro del contexto de la clase para no generar ruido dentro del modelo -Si no encuentras ninguna palabra de la que puedas dar ejemplos devuelve solo la palabra NADA -NO hagas en tu output ninguna introducción como 'aquí te dejo los sinónimos generadoss' o similares. -Quiero utilizar directamente tu output así que SOLO quiero las respuestas como diccionarios en formato python. -Aquí tienes un ejemplo: -Lista de palabras: [Elaboración, conservas, pescados]. -Texto: Esta clase comprende la elaboración de conservas de pescado como: melva, salmón o sardina -Salida: -{'pescados': ['atun', 'caballa', 'pez espada', 'anchoa' ], -'elaboración': ['fabricación', 'manufactura', 'produccion']} -Ahora hazlo tu: -Lista de palabras: <title> -Texto: <includes> -""" - -promt_data_de = """ -Eres un experto en generación de datos de entrenamiento para modelos de clasificación. -Tu tarea es tomar una frase raíz de una actividad económica general y generar frases de entrenamiento a partir de ejemplos concretos incluidos en un texto. -Cuando corresponda, procura introducir la palabra 'otras' u 'otros' y también el uso o destino del producto. -Siempre que sea fabricación de un objeto, incluye el material con el que se hace. 
-Evita generar frases completamente distintas o que cambien el contexto y devuelve ÚNICAMENTE las frases sin presentaciones, comentarios, explicaciones ni frases adicionales. -Evita introducir tu output con frases como 'aquí te dejo las frases generadas' o similares. -Quiero utilizar directamente tu output así que SOLO quiero las frases. -Aquí tienes un ejemplo: -Raiz: Elaboración de productos de panadería y pastelería. -Texto: Esta clase incluye la fabricación de cualquier producto de panadería como: barras de pan, pasteles, otras harinas y pan para pienso. -Salida: -Elaboración de barras de pan. -Elaboración de pan para pienso. -roducción de productos de pasteles. -Elaboración de otras harinas. -Ahora hazlo tú con la siguiente raíz y texto: -Raiz: <title> -Texto: <includes>. - -Erstelle je Klasse insgesamt genau 50 deutsche Daten. Ein drittel der erzeugten Daten pro Klasse sollen detaillierte Beschreibung aus Sicht des Gewerbetreibenden beinhalten. -Ein Beispiel: Wir sind ein Betrieb und arbeiten im Garten- und Landschaftsbau. Wir pflanzen Schnittblumen zu jeder Jahreszeit für den Verkauf an Floristen an. Beachte, was in der Klasse ein- und auschlossen ist. -Ändere nicht die Tätigkeit der vorgegebenen Klasse. -""" - - diff --git a/synt_data_maker/data_handler.py b/synt_data_maker/data_handler.py new file mode 100644 index 0000000..4c60e22 --- /dev/null +++ b/synt_data_maker/data_handler.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- +# ------------------------------------------------------------------------------ +# Copyright (C) [2025] Instituto Nacional de Estadística +# +# Este archivo forma parte del proyecto statcodgen. +# +# Licenciado bajo la Licencia Pública de la Unión Europea (EUPL) v.1.2. +# Puede obtener una copia de la licencia en la raiz de este proyecto o en: +# https://eupl.eu/1.2/es/ +# +# A menos que se indique lo contrario, este software se distribuye +# "TAL CUAL", SIN GARANTÍAS NI CONDICIONES DE NINGÚN TIPO. 
+# Consulte la licencia para conocer los términos específicos. +# ------------------------------------------------------------------------------ +# Copyright (C) [2025] National Institute of Statistics +# +# This file is part of the statcodgen project. +# +# Licensed under the European Union Public License (EUPL) v.1.2. +# You can obtain a copy of the license at the root of this project or at: +# https://eupl.eu/1.2/es/ +# +# Unless otherwise indicated, this software is distributed +# "AS IS", WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. +# See the license for specific terms. +# ------------------------------------------------------------------------------ + +import pickle +import json +import yaml + + +def read_json(name: str) -> dict: + """ + Read a JSON file from disk. + + Parameters + ---------- + name : str + Path to the JSON file. + + Returns + ------- + dict + Parsed JSON content as a Python dictionary. + + Raises + ------ + FileNotFoundError + If `name` does not exist. + json.JSONDecodeError + If the file content is not valid JSON. + UnicodeDecodeError + If the file cannot be decoded as UTF-8. + """ + with open(name, "r", encoding="utf-8") as j: + data = json.load(j) + return data + + +def save_json(name: str, data: dict): + """ + Save a dictionary as a JSON file. + + Parameters + ---------- + name : str + Output path for the JSON file. + data : dict + Data to serialize as JSON. + + Returns + ------- + None + + Raises + ------ + OSError + If the file cannot be written. + TypeError + If `data` contains non-JSON-serializable objects. + + Notes + ----- + The current implementation uses the default `json.dump` settings (no + indentation, ASCII escaping enabled by default). + """ + with open(name, "w") as j: + json.dump(data, j) + + +def read_pkl(name: str): + """ + Load a Python object from a pickle file. + + Parameters + ---------- + name : str + Path to the pickle file. + + Returns + ------- + object + Deserialized Python object stored in the pickle file. 
+ + Raises + ------ + FileNotFoundError + If `name` does not exist. + pickle.UnpicklingError + If the file is not a valid pickle stream. + Exception + Any exception raised during unpickling (e.g., missing class definitions). + + Warnings + -------- + Loading pickle files can execute arbitrary code. Only unpickle data from + trusted sources. + """ + with open(name, 'rb') as p: + data = pickle.load(p) + return data + + +def save_pkl(name: str, data): + """ + Serialize and save a Python object to a pickle file. + + Parameters + ---------- + name : str + Output path for the pickle file. + data : object + Python object to serialize. + + Returns + ------- + None + + Raises + ------ + OSError + If the file cannot be written. + pickle.PicklingError + If `data` cannot be pickled. + + Notes + ----- + Uses `pickle.HIGHEST_PROTOCOL`. + """ + with open(name, 'wb') as p: + pickle.dump(data, p, protocol=pickle.HIGHEST_PROTOCOL) + + +def read_yaml(name: str): + """ + Read a YAML file from disk using safe loading. + + Parameters + ---------- + name : str + Path to the YAML file. + + Returns + ------- + object + Parsed YAML content (commonly a dict or list), depending on the YAML + structure. + + Raises + ------ + FileNotFoundError + If `name` does not exist. + yaml.YAMLError + If the file content is not valid YAML. + UnicodeDecodeError + If the file cannot be decoded as UTF-8. + """ + with open(name, 'r', encoding="utf-8") as y: + data = yaml.safe_load(y) + return data + + +def save_yaml(name:str, data): + """ + Save data as a YAML file. + + Parameters + ---------- + name : str + Intended output path for the YAML file. + data : object + Data to serialize to YAML (typically dict, list, etc.). + + Returns + ------- + None + + Raises + ------ + OSError + If the file cannot be written. + yaml.YAMLError + If `data` cannot be represented as YAML. + + Notes + ----- + The current implementation ignores the `name` parameter and always writes + to `'data.yml'`. 
+ """ + with open('data.yml', 'w') as outfile: + yaml.dump(data, outfile, default_flow_style=False) + diff --git a/synt_data_maker/example_new_approach/input/poem_body_template.json b/synt_data_maker/example_new_approach/input/poem_body_template.json new file mode 100644 index 0000000..d6e81d6 --- /dev/null +++ b/synt_data_maker/example_new_approach/input/poem_body_template.json @@ -0,0 +1,4 @@ +{ + "en": "The poem must be a **type**.\nMust be about **topic**.\nWrite only the poem, without any introduction.", + "sp": "El poema debe ser **type**.\nDebe tratar sobre **topic**.\nEscribe solo el poema, sin ninguna introducción." +} \ No newline at end of file diff --git a/synt_data_maker/example_new_approach/input/poem_param.json b/synt_data_maker/example_new_approach/input/poem_param.json new file mode 100644 index 0000000..3c551fa --- /dev/null +++ b/synt_data_maker/example_new_approach/input/poem_param.json @@ -0,0 +1,16 @@ +{ + "en": { + "topic": { + "l": "love", + "c": "city", + "w": "war" + } + }, + "sp": { + "topic": { + "l": "amor", + "c": "ciudad", + "w": "guerra" + } + } +} \ No newline at end of file diff --git a/synt_data_maker/example_new_approach/input/structure_template.json b/synt_data_maker/example_new_approach/input/structure_template.json new file mode 100644 index 0000000..1233890 --- /dev/null +++ b/synt_data_maker/example_new_approach/input/structure_template.json @@ -0,0 +1,4 @@ +{ + "en": "**intro**\n[poem_body]", + "sp": "**intro**\n[poem_body]" +} \ No newline at end of file diff --git a/synt_data_maker/input/body_param_template.json b/synt_data_maker/input/body_param_template.json new file mode 100644 index 0000000..724f932 --- /dev/null +++ b/synt_data_maker/input/body_param_template.json @@ -0,0 +1,5 @@ +{ + "en": "Write the examples in **language**.\nWrite **number** examples.\nThe examples must contain **len_constraint** words.\nThe examples must be **output_format**.\nYou must only answer with examples; do not provide an introduction.", 
+ "sp": "Escribe los ejemplos en **language**.\nEscribe **number** ejemplos.\nLos ejemplos deben contener **len_constraint** palabras.\nLos ejemplos deben estar **output_format**.\nSolo debes responder con ejemplos; no proporciones una introducción.", + "de": "Schreibe die Beispiele in **language**.\nSchreibe **number** Beispiele.\nDie Beispiele müssen **len_constraint** Wörter enthalten.\nDie Beispiele müssen **output_format** sein.\nDu darfst nur mit Beispielen antworten; gib keine Einleitung." +} \ No newline at end of file diff --git a/synt_data_maker/input/body_template.json b/synt_data_maker/input/body_template.json new file mode 100644 index 0000000..3eb6937 --- /dev/null +++ b/synt_data_maker/input/body_template.json @@ -0,0 +1,5 @@ +{ + "sp": "**body_intro**\n**task**\n[body_param]", + "en": "**body_intro**\n**task**\n[body_param]", + "de": "**body_intro**\n**task**\n[body_param]" +} \ No newline at end of file diff --git a/synt_data_maker/input/example_template.json b/synt_data_maker/input/example_template.json new file mode 100644 index 0000000..e904dd1 --- /dev/null +++ b/synt_data_maker/input/example_template.json @@ -0,0 +1,5 @@ +{ + "sp": "Aquí tienes un ejemplo:**example**", + "en": "Here is an example:**example**", + "de": "Hier ist ein Beispiel:**example**" +} \ No newline at end of file diff --git a/synt_data_maker/input/objetive_template.json b/synt_data_maker/input/objetive_template.json new file mode 100644 index 0000000..487f21b --- /dev/null +++ b/synt_data_maker/input/objetive_template.json @@ -0,0 +1,5 @@ +{ + "sp": "Ahora es tu turno:Descripción:**description**:\nLista de palabras:**key_words**", + "en": "Now it's your turn:Description:**description**\nKey words:**key_words**", + "de": "Jetzt bist du dran:Beschreibung:**description**\nSchlüsselwörter:**key_words**" +} \ No newline at end of file diff --git a/synt_data_maker/input/param_dict.json b/synt_data_maker/input/param_dict.json new file mode 100644 index 0000000..2108553 --- 
/dev/null +++ b/synt_data_maker/input/param_dict.json @@ -0,0 +1,65 @@ +{ + "en": { + "task": { + "generate": "I want you to generate specific examples of economic activities that fit a given description.", + "process": "I want you to extract all specific examples of economic activities included in a given description.", + "combine": "I am going to give you a list of words and a description of economic activities in Norwegian Bokmål. I would like you to create new examples of economic activities by combining the different words. The examples must fit with the description" + }, + "language": { + "sp": "spanish", + "en": "english", + "nb": "bokmål norwegian", + "de": "german" + }, + "output_format": { + "lines": "separated by line break" + }, + "len_constraint": { + "short": "between 1-5", + "medium": "between 5-15", + "long": "more than 15" + } + }, + "sp": { + "task": { + "generate": "Quiero que generes ejemplos específicos de actividades económicas que encajen con una descripción dada.", + "process": "Quiero que extraigas todos los ejemplos específicos de actividades económicas incluidos en una descripción dada.", + "combine": "Voy a darte una lista de palabras y una descripción de actividades económicas en noruego Bokmål. Me gustaría que crearas nuevos ejemplos de actividades económicas combinando las diferentes palabras. Los ejemplos deben ajustarse a la descripción." 
+ }, + "language": { + "sp": "español", + "en": "inglés", + "nb": "noruego bokmål", + "de": "alemán" + }, + "output_format": { + "lines": "separados por salto de línea" + }, + "len_constraint": { + "short": "entre 1 y 5", + "medium": "entre 5 y 15", + "long": "más de 15" + } + }, + "de": { + "task": { + "generate": "Ich möchte, dass du spezifische Beispiele für wirtschaftliche Aktivitäten generierst, die zu einer gegebenen Beschreibung passen.", + "process": "Ich möchte, dass du alle spezifischen Beispiele für wirtschaftliche Aktivitäten aus einer gegebenen Beschreibung extrahierst.", + "combine": "Ich werde dir eine Liste von Wörtern und eine Beschreibung wirtschaftlicher Aktivitäten auf Norwegisch (Bokmål) geben. Ich möchte, dass du neue Beispiele für wirtschaftliche Aktivitäten erstellst, indem du die verschiedenen Wörter kombinierst. Die Beispiele müssen zur Beschreibung passen." + }, + "language": { + "sp": "Spanisch", + "en": "Englisch", + "nb": "Norwegisch (Bokmål)", + "de": "Deutsch" + }, + "output_format": { + "lines": "durch Zeilenumbruch getrennt" + }, + "len_constraint": { + "short": "zwischen 1 und 5", + "medium": "zwischen 5 und 15", + "long": "mehr als 15" + } + } +} \ No newline at end of file diff --git a/synt_data_maker/input/prompt_template.json b/synt_data_maker/input/prompt_template.json new file mode 100644 index 0000000..afe30e7 --- /dev/null +++ b/synt_data_maker/input/prompt_template.json @@ -0,0 +1,5 @@ +{ + "sp": "[body]\n[example]\n[objetive]", + "en": "[body]\n[example]\n[objetive]", + "de": "[body]\n[example]\n[objetive]" +} \ No newline at end of file diff --git a/synt_data_maker/input/static_examples.json b/synt_data_maker/input/static_examples.json new file mode 100644 index 0000000..20f4ea2 --- /dev/null +++ b/synt_data_maker/input/static_examples.json @@ -0,0 +1,5 @@ +{ + "generate":"Denne klassen omfatter fremstilling av alle bakervarer, som brød, kaker, annet mel og fôrbrød\nYour reply:\nProduksjon av alle typer bakervarer og 
konditorvarer.\nProduksjon av brød og boller med fullkornsmel.\nProduksjon av bakervarer og konditorvarer\nProduksjon av boller, kaker og alle typer konditorvarer\nProduksjon av fullkornsbrød og brød med frø\nProduksjon av brød og raffinert mel\nMalingen av hvete til produksjon av hvete\nProduksjon av mel til dyrefôr\nProduksjon av alle typer mel til brødbakst\nProduksjon av bakervarer med og uten frø\n", + "process":"Description:\nDenne klassen omfatter fremstilling av alle bakervarer, som brød, kaker, annet mel og fôrbrød\nYour reply:\nProduksjon av alle typer bakervarer\nProduksjon av brød og kaker\nProduksjon av brød og andre melvarer\nProduksjon av sandwichbrød og kaker\nProduksjon av sandwichbrød og andre melvarer\nProduksjon av kaker og andre melvarer\nProduksjon av brød og sandwichbrød\n", + "combine":"Description:\nDenne klassen omfatter fremstilling av alle bakervarer, som brød, kaker, annet mel og fôrbrød\nYour reply:\nProduksjon av alle typer bakervarer\nProduksjon av brød og kaker\nProduksjon av brød og andre melvarer\nProduksjon av sandwichbrød og kaker\nProduksjon av sandwichbrød og andre melvarer\nProduksjon av kaker og andre melvarer\nProduksjon av brød og sandwichbrød\n" +} \ No newline at end of file diff --git a/synt_data_maker/probe.py b/synt_data_maker/probe.py new file mode 100644 index 0000000..1a72d29 --- /dev/null +++ b/synt_data_maker/probe.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Nov 19 13:25:01 2025 + +@author: U853768 +""" +from process_notes import ProcessNACENB +import pandas as pd +import os +from synt_data_maker import OnyxiaSyntDataGenerator + +notes_n = "notas_explicativas_noruego.csv" +train_n = "train_norwaydata.csv" +index_n = "NACE Rev. 
2.1 - Index entries - Overview_translated_to_NB_with_deepl_complete.csv" +path = r'C:\Users\U000000\Documents\metodologia y de muestras\Programacion\Generador_de_base_de_datos\Noruego\input' +########## +notes_df = pd.read_csv( + os.path.join(path, notes_n), + dtype='str', + encoding='latin-1', + sep=';', +) +train_df = pd.read_csv( + os.path.join(path, train_n), + dtype='str' +) +index_df = pd.read_csv( + os.path.join(path, index_n), + dtype='str' +) +# Se obtiene el notes_dict ####### +process_nb = ProcessNACENB( + notes_df=notes_df[notes_df['level'] == "4"], + examples_df=index_df, + sample=5 +) + +process_nb.set_descriptions( + col_c='code', + col_d_l=['name', 'notes'] +) + +process_nb.set_key_words( + col_c='CODE', + col_k='KEYWORD_NB' +) + +process_nb.example_df = train_df +process_nb.set_examples( + col_c='nace_21_4digit_code', + col_d='text_for_coding' +) +# Se generan datos sintéticos +body_intro = 'You are an expert Norwegian economist.' +data_dict = process_nb.get_dict_data() +data_maker = OnyxiaSyntDataGenerator( + root=r'C:\Users\U000000\Documents\metodologia y de muestras\Programacion\Gitlab_Projects\statcodgen\synt_data_maker', + api_key="sk-50116763c294421a8e796d17e9fab0bb", + model="gpt-oss:120b", + label_contents_map_dict=data_dict, + body_intro=body_intro, + task='generate', + n_responses='11', + p_lang='en', + o_lang='sp', + output_format='lines', + len_constraint='short', + eg_filename='static_examples.json', + eg_strategy='static' + ) +response_df = data_maker.get_synt_data_df() +#meta_data_dict = data_maker.get_meta_analysis_data_df(reponse_df) +#data_dict_rep = data_maker.data_dict + + +#####_Se_generan_los prompts_#### +# ============================================================================= +# from prompt_synt_data_maker import PromptSyntDataMaker +# +# dict_ = { +# 'body_intro': 'You are an expert Norwegian economist', +# 'task': 'process', +# 'language': 'nb', +# 'longitude': 'short', +# 'rep_format': 'lines', +# 'example': 'This 
is an example', +# 'objetive': 'This is a description' +# } +# +# prompt_maker = PromptSyntDataMaker(prompt_lang='sp') +# example = """Descripción: +# Esta clase incluye la fabricación de todo tipo de productos de panadería, como pan, pasteles, otras harinas y pan de molde. +# Su respuesta: +# Fabricación de todo tipo de productos de panadería y pastelería. +# Fabricación de pan y bollos con harina integral. +# Producción de productos de panadería y pastelería. +# Producción de bollos, pasteles y todo tipo de productos de pastelería. +# Producción de pan integral y pan con semillas. +# Producción de pan y harina refinada. +# Molienda de trigo para la producción de harina. +# Producción de harina para piensos. +# Producción de todo tipo de harina para panadería. +# Producción de productos de panadería con y sin semillas. +# """ +# print(prompt_maker.write_prompt( +# body_intro='Eres un economista español', +# task='generate', +# language='sp', +# format_='lines', +# longitude='medium', +# example=example, +# description='Esto es una descripcion' +# )) +# ============================================================================= diff --git a/synt_data_maker/process_notes.py b/synt_data_maker/process_notes.py new file mode 100644 index 0000000..9a0938d --- /dev/null +++ b/synt_data_maker/process_notes.py @@ -0,0 +1,334 @@ +# -*- coding: utf-8 -*- +# ------------------------------------------------------------------------------ +# Copyright (C) [2025] Instituto Nacional de Estadística +# +# Este archivo forma parte del proyecto statcodgen. +# +# Licenciado bajo la Licencia Pública de la Unión Europea (EUPL) v.1.2. +# Puede obtener una copia de la licencia en la raiz de este proyecto o en: +# https://eupl.eu/1.2/es/ +# +# A menos que se indique lo contrario, este software se distribuye +# "TAL CUAL", SIN GARANTÍAS NI CONDICIONES DE NINGÚN TIPO. +# Consulte la licencia para conocer los términos específicos. 
# ------------------------------------------------------------------------------
# Copyright (C) [2025] National Institute of Statistics
#
# This file is part of the statcodgen project.
#
# Licensed under the European Union Public License (EUPL) v.1.2.
# You can obtain a copy of the license at the root of this project or at:
# https://eupl.eu/1.2/es/
#
# Unless otherwise indicated, this software is distributed
# "AS IS", WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
# See the license for specific terms.
# ------------------------------------------------------------------------------

from abc import ABC
import re
import logging
import random


class NotesProcess(ABC):
    """
    Build a per-code dictionary of descriptions, examples and keywords.

    The initializer stores the notes and examples data, optionally samples
    the notes DataFrame, creates the empty ``notes_dict`` accumulator and
    configures an instance logger.

    Parameters
    ----------
    notes_df : pandas.DataFrame or None, optional
        DataFrame containing the notes to process. If `sample` is truthy, a
        random sample of that many rows is taken from it.
    sample : int or None, optional
        Number of rows to sample from `notes_df`. If falsy, the full
        DataFrame is used.
    random_state : int, default=23
        Seed used for every sampling step (notes, examples, keywords).
    examples_df : pandas.DataFrame or None, optional
        DataFrame containing examples and/or keywords associated with codes.
    n_examples : int, default=10
        Maximum number of examples (or keyword tokens) kept per code.

    Attributes
    ----------
    notes_df : pandas.DataFrame
        Notes data (possibly sampled).
    example_df : pandas.DataFrame or None
        Examples data.
    n_eg : int
        Maximum number of examples per code.
    notes_dict : dict
        Dictionary keyed by code. Each value is a dict that may contain
        ``'description'``, ``'example'`` and ``'key_words'``.
    logger : logging.Logger
        Instance logger created by :meth:`set_logger`.
    """

    def __init__(
        self,
        notes_df=None,
        sample=None,
        random_state=23,
        examples_df=None,
        n_examples=10
    ):
        self.random_state = random_state
        if not sample:
            self.notes_df = notes_df
        else:
            self.notes_df = notes_df.sample(n=sample, random_state=random_state)
        self.example_df = examples_df
        self.n_eg = n_examples
        self.notes_dict = {}
        self.set_logger()

    def set_logger(self):
        """
        Configure an instance-level logger for the class.

        Notes
        -----
        - Propagation is disabled to avoid duplicate logs.
        - A stream handler with a timestamped formatter is attached once.
        - The check looks at this logger's own handlers only.
          ``Logger.hasHandlers()`` also inspects ancestor loggers, so with a
          configured root logger no handler would be attached and, with
          propagation off, INFO records would be silently dropped.
        """
        self.logger = logging.getLogger(f'NotesProcess.{id(self)}')
        self.logger.setLevel(logging.INFO)
        if not self.logger.handlers:
            stream_handler = logging.StreamHandler()
            stream_handler.setFormatter(
                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            )
            self.logger.addHandler(stream_handler)
        self.logger.propagate = False

    @staticmethod
    def preprocess_text(text):
        """
        Preprocess a text string.

        This base implementation returns the input unchanged. Subclasses can
        override it to apply domain-specific normalization.

        Parameters
        ----------
        text : str
            Input text.

        Returns
        -------
        str
            The input text, unmodified.
        """
        return text

    def compare_keys(self, col_c, name):
        """
        Return the codes of ``notes_dict`` missing from the examples DataFrame.

        The keys of ``self.notes_dict`` are compared against the unique
        values of ``self.example_df[col_c]``. If any code is missing, a
        warning is logged.

        Parameters
        ----------
        col_c : str
            Column name in ``self.example_df`` that contains code values.
        name : str
            Human-readable label used in the warning message (e.g.
            "examples", "key words").

        Returns
        -------
        list
            Codes that exist in ``self.notes_dict`` but do not appear in
            ``self.example_df[col_c]``.
        """
        # A set makes each membership test O(1) instead of scanning the array.
        known_codes = set(self.example_df[col_c].unique())
        no_exp_key = [code for code in self.notes_dict if code not in known_codes]
        if no_exp_key:
            self.logger.warning(f"{no_exp_key} labels don't have {name}")
        return no_exp_key

    def set_descriptions(self, col_c: str, col_d_l: list) -> None:
        """
        Populate per-code descriptions in ``notes_dict`` from the notes data.

        For each row in ``self.notes_df`` the values of the columns listed in
        `col_d_l` are joined with newlines, passed through
        :meth:`preprocess_text` and stored as
        ``self.notes_dict[code]['description']``, where `code` is
        ``row[col_c]``. Missing cells (NaN/None) are skipped instead of
        breaking the join.

        Parameters
        ----------
        col_c : str
            Column name in ``self.notes_df`` containing the code used as the
            dictionary key.
        col_d_l : list of str
            Column names in ``self.notes_df`` whose values are concatenated
            to form the description.

        Returns
        -------
        None
            Updates ``self.notes_dict`` in-place. The entry of each code is
            re-created, so previously stored examples/keywords for that code
            are discarded.
        """
        columns = list(col_d_l)
        for _, row in self.notes_df.iterrows():
            parts = row[columns].dropna()
            desc = self.preprocess_text('\n'.join(str(part) for part in parts))
            self.notes_dict[row[col_c]] = {'description': desc}

    def set_examples(self, col_c: str, col_d: str):
        """
        Attach sampled example texts to each code in ``notes_dict``.

        For each code in ``self.notes_dict`` the matching rows of
        ``self.example_df`` are selected, up to ``self.n_eg`` non-missing
        values of `col_d` are sampled (seeded with ``self.random_state``),
        joined with newlines, passed through :meth:`preprocess_text` and
        stored under ``'example'``.

        If a code is missing from the examples DataFrame, ``None`` is stored.

        Parameters
        ----------
        col_c : str
            Column name in ``self.example_df`` containing code values.
        col_d : str
            Column name in ``self.example_df`` containing example texts.

        Returns
        -------
        None
            Updates ``self.notes_dict`` in-place.
        """
        missing_codes = set(self.compare_keys(col_c, "examples"))
        for code, in_dict in self.notes_dict.items():
            if code in missing_codes:
                in_dict['example'] = None
                continue
            texts = self.example_df.loc[self.example_df[col_c] == code, col_d].dropna()
            n = min(self.n_eg, len(texts))
            sampled = texts.sample(n=n, random_state=self.random_state)
            in_dict['example'] = self.preprocess_text(
                '\n'.join(str(text) for text in sampled)
            )

    def set_key_words(self, col_c: str, col_k: str):
        """
        Attach keyword tokens to each code in ``notes_dict``.

        For each code in ``self.notes_dict`` the matching rows of
        ``self.example_df`` are selected, missing and duplicated `col_k`
        values are dropped, every keyword string is split on whitespace and
        each token is passed through :meth:`preprocess_text`. Up to
        ``self.n_eg`` tokens are then sampled with a ``random.Random`` seeded
        with ``self.random_state`` and stored under ``'key_words'``.

        If a code is missing from the examples DataFrame, ``None`` is stored.

        Parameters
        ----------
        col_c : str
            Column name in ``self.example_df`` containing code values.
        col_k : str
            Column name in ``self.example_df`` containing keyword strings.

        Returns
        -------
        None
            Updates ``self.notes_dict`` in-place.
        """
        missing_codes = set(self.compare_keys(col_c, "key words"))
        for code, in_dict in self.notes_dict.items():
            if code in missing_codes:
                in_dict['key_words'] = None
                continue
            keywords = (
                self.example_df.loc[self.example_df[col_c] == code, col_k]
                .dropna()
                .drop_duplicates()
            )
            tokens = [
                self.preprocess_text(word)
                for value in keywords
                for word in str(value).split()
            ]
            n = min(self.n_eg, len(tokens))
            # A fresh generator per code keeps the draw reproducible per code.
            rng = random.Random(self.random_state)
            in_dict['key_words'] = rng.sample(tokens, k=n)

    def get_dict_data(self, col_notes_c=None, col_notes_d_l=None, col_eg_c=None, col_eg_d=None, col_eg_k=None):
        """
        Build and return the per-code dictionary.

        Depending on which column names are provided, this method calls:
        - :meth:`set_descriptions` if `col_notes_c` and `col_notes_d_l` are given,
        - :meth:`set_examples` if `col_eg_c` and `col_eg_d` are given,
        - :meth:`set_key_words` if `col_eg_c` and `col_eg_k` are given.

        Parameters
        ----------
        col_notes_c : str or None, optional
            Code column in ``self.notes_df``.
        col_notes_d_l : list of str or None, optional
            Note columns in ``self.notes_df`` used to build descriptions.
        col_eg_c : str or None, optional
            Code column in ``self.example_df``.
        col_eg_d : str or None, optional
            Example text column in ``self.example_df``.
        col_eg_k : str or None, optional
            Keyword string column in ``self.example_df``.

        Returns
        -------
        dict
            The dictionary mapping ``code -> metadata``.
        """
        if col_notes_c and col_notes_d_l:
            self.set_descriptions(col_notes_c, col_notes_d_l)
        if col_eg_c and col_eg_d:
            self.set_examples(col_eg_c, col_eg_d)
        if col_eg_c and col_eg_k:
            self.set_key_words(col_eg_c, col_eg_k)
        return self.notes_dict


class ProcessNACENB(NotesProcess):
    """Notes processor whose text cleaning targets Norwegian NACE notes."""

    @staticmethod
    def preprocess_text(text: str) -> str:
        """
        Filter characters and normalize spacing before periods.

        Every character outside the allowed set (A-Z/a-z, ÆØÅ/æøå, the
        punctuation ``. , : ; - " '``, space and newline) is removed, then
        each ``' .'`` is replaced with ``'.'``.

        Parameters
        ----------
        text : str
            Input text.

        Returns
        -------
        str
            Cleaned text containing only allowed characters.
        """
        pattern = r"[^A-Za-zÆØÅæøå\.\,\:\;\-\"\' \n]"
        text = re.sub(pattern, "", text)
        return text.replace(' .', '.')


# ---- patch boundary: synt_data_maker/prompt_maker.py (new file) ----
# -*- coding: utf-8 -*-
# ------------------------------------------------------------------------------
# Copyright (C) [2025] Instituto Nacional de Estadística
#
# Este archivo forma parte del proyecto statcodgen.
#
# Licenciado bajo la Licencia Pública de la Unión Europea (EUPL) v.1.2.
# Puede obtener una copia de la licencia en la raiz de este proyecto o en:
# https://eupl.eu/1.2/es/
#
# A menos que se indique lo contrario, este software se distribuye
# "TAL CUAL", SIN GARANTÍAS NI CONDICIONES DE NINGÚN TIPO.
# Consulte la licencia para conocer los términos específicos.
# ------------------------------------------------------------------------------
# Copyright (C) [2025] National Institute of Statistics
#
# This file is part of the statcodgen project.
#
# Licensed under the European Union Public License (EUPL) v.1.2.
# You can obtain a copy of the license at the root of this project or at:
# https://eupl.eu/1.2/es/
#
# Unless otherwise indicated, this software is distributed
# "AS IS", WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
# See the license for specific terms.
# ------------------------------------------------------------------------------

import re
import os
import logging
from abc import ABC


class PromptMaker(ABC):
    """
    Utility class for building prompts from templates with placeholders.

    Placeholder substrings inside a template are replaced using values from
    a dictionary. Placeholders follow a pattern such as ``"**-**"``, where
    the hyphen marks the position of the placeholder key.

    Parameters
    ----------
    prompt_template : str
        The template string containing placeholders to be filled.
    plh_replace_map_dict : dict
        A dictionary mapping placeholder keys to replacement text.
    gap_ph_pattern : str, optional
        The placeholder pattern, by default ``"**-**"``.
    root : str or None, optional
        Root directory path. If ``None``, the current working directory is
        used.

    Attributes
    ----------
    root : str
        The resolved root directory path.
    prompt_template : str
        The stored prompt template.
    plh_replace_map_dict : dict
        Mapping of keys to replacement values.
    gap_ph_pattern : str
        Pattern describing the placeholder format. It must contain a hyphen.
    logger : logging.Logger
        Instance-specific logger for the class.
    """

    def __init__(
        self,
        prompt_template,
        plh_replace_map_dict,
        gap_ph_pattern='**-**',
        root=None
    ):
        self.set_logger()
        if root is None:
            self.root = os.path.abspath('.')
        else:
            self.root = root
        self.prompt_template = prompt_template
        self.plh_replace_map_dict = plh_replace_map_dict
        self.gap_ph_pattern = gap_ph_pattern

    def set_logger(self):
        """
        Configure an instance-level logger for the class.

        Notes
        -----
        - Propagation is disabled to avoid duplicate logs.
        - A stream handler with a timestamped formatter is attached once.
        - The check looks at this logger's own handlers only.
          ``Logger.hasHandlers()`` also inspects ancestor loggers, so with a
          configured root logger no handler would be attached and, with
          propagation off, INFO records would be silently dropped.
        """
        self.logger = logging.getLogger(f'PromptMaker.{id(self)}')
        self.logger.setLevel(logging.INFO)
        if not self.logger.handlers:
            stream_handler = logging.StreamHandler()
            stream_handler.setFormatter(
                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            )
            self.logger.addHandler(stream_handler)
        self.logger.propagate = False

    @staticmethod
    def check_text(string: str, text: str):
        """
        Validate that ``text`` exists within ``string``.

        Parameters
        ----------
        string : str
            The larger string to search.
        text : str
            The substring that must be present.

        Raises
        ------
        ValueError
            If ``text`` is not found within ``string``.
        """
        if text not in string:
            raise ValueError(f"'{text}' not found in {string}.")

    @staticmethod
    def fill_gap(string: str, pattern: str, text: str) -> str:
        """
        Replace every occurrence of a placeholder with a value.

        Parameters
        ----------
        string : str
            The string containing the placeholder.
        pattern : str
            The placeholder text to replace.
        text : str
            The replacement text.

        Returns
        -------
        str
            The string with all occurrences of the placeholder replaced.

        Raises
        ------
        ValueError
            If the placeholder is not found in ``string``.
        """
        PromptMaker.check_text(string, pattern)
        return string.replace(pattern, text)

    @staticmethod
    def get_re_pattern(pattern: str) -> str:
        """
        Convert a placeholder pattern into a regex capturing the key.

        Parameters
        ----------
        pattern : str
            Placeholder pattern, e.g. ``"**-**"``.

        Returns
        -------
        str
            A regex where the (escaped) hyphen is replaced with a non-greedy
            capture group.
        """
        escaped = re.escape(pattern)
        return escaped.replace(r"\-", r"(.*?)")

    @staticmethod
    def get_gap_keys(text: str, pattern: str) -> list:
        """
        Extract all placeholder keys found in a template.

        Parameters
        ----------
        text : str
            The template string.
        pattern : str
            The placeholder pattern.

        Returns
        -------
        list of str
            Keys in order of appearance; a key used several times appears
            several times.

        Raises
        ------
        ValueError
            If the pattern does not contain a hyphen, which marks the key
            position.
        """
        PromptMaker.check_text(pattern, '-')
        re_pattern = PromptMaker.get_re_pattern(pattern)
        return re.findall(re_pattern, text)

    @staticmethod
    def fill_text(template: str, text_dict: dict, pattern="**-**") -> str:
        """
        Replace all placeholders in a template using a dictionary of values.

        Parameters
        ----------
        template : str
            The template containing zero or more placeholders.
        text_dict : dict
            Mapping of placeholder keys to replacement values.
        pattern : str, optional
            Placeholder format, by default ``"**-**"``.

        Returns
        -------
        str
            Text with all placeholders replaced.

        Raises
        ------
        KeyError
            If any placeholder key is missing in ``text_dict``.
        ValueError
            If the pattern has no hyphen or a placeholder cannot be found.
        """
        gap_keys = PromptMaker.get_gap_keys(template, pattern)
        final_text = template
        # One pass per distinct key: ``fill_gap`` replaces every occurrence,
        # so a repeated key would not be found again on a second pass.
        for gap_key in dict.fromkeys(gap_keys):
            gap = PromptMaker.fill_gap(pattern, '-', gap_key)
            final_text = PromptMaker.fill_gap(final_text, gap, text_dict[gap_key])
        return final_text

    def fill_prompt(self):
        """
        Fill the object's prompt template using the instance value dictionary.

        Returns
        -------
        str
            Rendered prompt with all placeholders replaced, every line
            stripped and empty lines removed.

        Notes
        -----
        This is a convenience wrapper around :meth:`fill_text`.
        """
        filled_prompt = self.fill_text(
            self.prompt_template,
            self.plh_replace_map_dict,
            self.gap_ph_pattern
        )
        return "\n".join(
            line.strip() for line in filled_prompt.splitlines() if line.strip()
        )


# ---- patch boundary: synt_data_maker/prompt_synt_data_maker.py (new file) ----
# -*- coding: utf-8 -*-
# ------------------------------------------------------------------------------
# Copyright (C) [2025] Instituto Nacional de Estadística
#
# Este archivo forma parte del proyecto statcodgen.
#
# Licenciado bajo la Licencia Pública de la Unión Europea (EUPL) v.1.2.
# Puede obtener una copia de la licencia en la raiz de este proyecto o en:
# https://eupl.eu/1.2/es/
#
# A menos que se indique lo contrario, este software se distribuye
# "TAL CUAL", SIN GARANTÍAS NI CONDICIONES DE NINGÚN TIPO.
# Consulte la licencia para conocer los términos específicos.
# ------------------------------------------------------------------------------
# Copyright (C) [2025] National Institute of Statistics
#
# This file is part of the statcodgen project.
#
# Licensed under the European Union Public License (EUPL) v.1.2.
# You can obtain a copy of the license at the root of this project or at:
# https://eupl.eu/1.2/es/
#
# Unless otherwise indicated, this software is distributed
# "AS IS", WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
# See the license for specific terms.
# ------------------------------------------------------------------------------

# NOTE: in the patch this class imports ``PromptMaker`` from ``prompt_maker``;
# here it directly follows that class. The project-local ``read_json`` helper
# (module ``data_handler``) is imported where it is used.


class PromptSyntDataMaker(PromptMaker):
    """
    Generate resolved prompts from hierarchical templates and parameters.

    This class extends :class:`PromptMaker` with recursive template
    expansion from several JSON template files and with the selection of
    parameter-specific values from a structured dictionary.

    Parameters
    ----------
    root : str or None, optional
        Root directory containing template and parameter files. If ``None``,
        the current working directory is used. Templates are loaded from
        ``<root>/input``.
    name_root_template : str, optional
        Base name of the root template. Defaults to ``"prompt"``, which
        loads ``input/prompt_template.json``. Every template file must be
        named ``input/<name>_template.json``.
    name_param_dict : str, optional
        Path (relative to ``root``) of the JSON file with the parameter
        dictionaries. Defaults to ``"input/param_dict.json"``. If falsy, no
        parameter file is read.
    prompt_lang : str, optional
        Language key used to pick templates and parameters, i.e. the prompt
        language. Defaults to ``"en"``.
    template_pattern : str, optional
        Placeholder pattern used inside templates to reference other
        templates. Defaults to ``"[-]"``.
    gap_ph_pattern : str, optional
        Placeholder pattern used for parameter/value substitution.
        Defaults to ``"**-**"``.

    Attributes
    ----------
    param_dict : dict
        Parameter values for the chosen language.
    template_dict : dict
        Mapping of template names to template text, recursively loaded.
    prompt_template : str or None
        Current assembled prompt template (updated by :meth:`get_prompt`).
    plh_replace_map_dict : dict
        Resolved placeholder values.
    """

    def __init__(
        self,
        root=None,
        name_root_template='prompt',
        name_param_dict='input/param_dict.json',
        prompt_lang='en',
        template_pattern='[-]',
        gap_ph_pattern='**-**'
    ):
        super().__init__(
            root=root,
            prompt_template=None,
            plh_replace_map_dict={},
            gap_ph_pattern=gap_ph_pattern
        )
        self.param_dict = {}
        if name_param_dict:
            from data_handler import read_json
            self.param_dict = read_json(
                os.path.join(self.root, name_param_dict),
            )[prompt_lang]
        self.template_dict = self.get_dict_template(
            name=name_root_template,
            lang=prompt_lang,
            dict_template={},
            pattern=template_pattern
        )
        self.name_root_template = name_root_template
        self.template_pattern = template_pattern

    def join_templates(self, root_template: str) -> str:
        """
        Recursively expand template references inside a template string.

        References follow ``self.template_pattern`` (e.g. ``"[-]"``), where
        the hyphen stands for the referenced template name: ``"[body]"`` is
        replaced by the text stored under ``"body"`` in ``template_dict``.
        Expansion repeats until no reference is left.

        Parameters
        ----------
        root_template : str
            Template string possibly containing template references.

        Returns
        -------
        str
            Template with all references expanded.

        Raises
        ------
        ValueError
            If the template pattern does not contain a hyphen.
        KeyError
            If a referenced name is not in ``template_dict``.
        """
        PromptMaker.check_text(self.template_pattern, '-')
        if not PromptMaker.get_gap_keys(root_template, self.template_pattern):
            return root_template
        # A single ``fill_text`` call already expands every reference of the
        # current level; recurse for references brought in by the expansion.
        expanded = self.fill_text(
            root_template, self.template_dict, self.template_pattern
        )
        return self.join_templates(expanded)

    def get_dict_template(
        self,
        name,
        lang,
        dict_template,
        pattern
    ) -> dict:
        """
        Recursively load a template and the templates it references.

        The text for `name` is read from ``input/<name>_template.json``
        under ``self.root``, stored in `dict_template`, and scanned for
        references; each referenced template not loaded yet is loaded the
        same way.

        Parameters
        ----------
        name : str
            Template name to load.
        lang : str
            Language key used to pick the template text.
        dict_template : dict
            Dictionary that accumulates name-to-text mappings; it is updated
            in place and returned.
        pattern : str
            Pattern used for referencing other templates.

        Returns
        -------
        dict
            Dictionary mapping template names to template strings.

        Raises
        ------
        KeyError
            If the language key is not present in the template file.
        """
        from data_handler import read_json
        json_dict = read_json(
            os.path.join(self.root, 'input', f"{name}_template.json")
        )
        text_template = json_dict[lang]
        dict_template[name] = text_template
        for key in PromptMaker.get_gap_keys(text_template, pattern=pattern):
            # Skip names already loaded: avoids re-reading shared templates.
            if key not in dict_template:
                self.get_dict_template(
                    name=key,
                    lang=lang,
                    dict_template=dict_template,
                    pattern=pattern
                )
        return dict_template

    def search_keys_dict(self) -> list:
        """
        List the placeholder keys of the template that are parameter entries.

        Returns
        -------
        list of str
            Keys that exist both in ``prompt_template`` and in
            ``param_dict``.
        """
        key_list = self.get_gap_keys(self.prompt_template, self.gap_ph_pattern)
        return [key for key in key_list if key in self.param_dict]

    def check_active_params_dict(self, param_keys, active_params_dict):
        """
        Validate the selected option of every parameter key.

        Parameters
        ----------
        param_keys : list of str
            Placeholder keys backed by ``param_dict``.
        active_params_dict : dict
            Mapping of keys to the selected option; keys that are absent or
            ``None`` are not checked.

        Raises
        ------
        ValueError
            If a selected option has no value in ``param_dict[key]``.
        """
        for p_key in param_keys:
            v_key = active_params_dict.get(p_key)
            if v_key is None:
                continue
            if self.param_dict[p_key].get(v_key) is None:
                raise ValueError(f'Error getting param value "{p_key}": It must be one of the next possible values {list(self.param_dict[p_key].keys())}')

    def get_plh_replace_map_dict(self, active_params_dict: dict):
        """
        Build the dictionary of replacement values for the placeholders.

        The result is stored in ``self.plh_replace_map_dict``.

        Parameters
        ----------
        active_params_dict : dict
            Input dictionary whose values select options from the parameter
            dictionary or provide free-form text.

        Notes
        -----
        - Template keys present in ``param_dict`` are treated as selectors:
          the stored value is ``param_dict[key][active_params_dict[key]]``.
        - Every other entry of ``active_params_dict`` is included as-is.

        Raises
        ------
        KeyError
            If a parameter key of the template is missing in
            ``active_params_dict``.
        ValueError
            If a selected option is not defined for its parameter.
        """
        param_keys = self.search_keys_dict()
        self.check_active_params_dict(param_keys, active_params_dict)
        plh_replace_map_dict = {
            key: self.param_dict[key][active_params_dict[key]]
            for key in param_keys
        }
        for key, item in active_params_dict.items():
            if key not in param_keys:
                plh_replace_map_dict[key] = item
        self.plh_replace_map_dict = plh_replace_map_dict

    def edit_template(self, active_params_dict):
        """
        Remove template lines that reference unused parameters.

        Every line of ``prompt_template`` containing a placeholder whose key
        is absent (or ``None``) in ``active_params_dict`` is dropped.
        Afterwards a line break is inserted after every colon of the
        template.

        Parameters
        ----------
        active_params_dict : dict
            Input dictionary specifying which parameters are active.
        """
        param_keys = self.get_gap_keys(
            self.prompt_template,
            self.gap_ph_pattern
        )
        sentences = self.prompt_template.split('\n')
        for param_key in param_keys:
            if active_params_dict.get(param_key) is None:
                pat = self.fill_gap(self.gap_ph_pattern, '-', param_key)
                sentences = [s for s in sentences if pat not in s]
        self.prompt_template = "\n".join(sentences).replace(':', ':\n')

    def get_prompt(self, active_params_dict: dict) -> str:
        """
        Build the final prompt for the given parameters.

        Parameters
        ----------
        active_params_dict : dict
            Input dictionary selecting parameter options and supplying
            free-form values.

        Returns
        -------
        str
            Rendered prompt.

        Notes
        -----
        Workflow:
        1. Expand the template hierarchy.
        2. Remove lines of unused placeholders.
        3. Resolve parameter values.
        4. Fill placeholders and return the final text.
        """
        self.prompt_template = self.join_templates(
            self.template_dict[self.name_root_template]
        )
        self.edit_template(active_params_dict)
        self.get_plh_replace_map_dict(active_params_dict)
        return self.fill_prompt()

    def write_prompt(self, **kwargs):
        """
        Build a prompt from keyword arguments.

        Parameters
        ----------
        **kwargs
            Placeholder keys and their values, forwarded to
            :meth:`get_prompt` as a dictionary.

        Returns
        -------
        str
            Rendered prompt.
        """
        return self.get_prompt(dict(kwargs))


# ---- patch boundary: synt_data_maker/synt_data_maker.py (new file) ----
# -*- coding: utf-8 -*-
# ------------------------------------------------------------------------------
# Copyright (C) [2025] Instituto Nacional de Estadística
#
# Este archivo forma parte del proyecto statcodgen.
#
# Licenciado bajo la Licencia Pública de la Unión Europea (EUPL) v.1.2.
class SyntDataGenerator(ABC):
    """
    Synthetic data generator backed by an LLM and a prompt builder.

    The constructor stores configuration, loads static examples from disk,
    instantiates a prompt builder, and enriches each entry of
    `label_contents_map_dict` with examples according to the example strategy.

    Parameters
    ----------
    root : str or os.PathLike
        Project root directory. Used to locate inputs (e.g., example files).
    api_key : str or None
        API key used by the concrete backend implementation.
    model : str
        Model identifier or deployment name used by the backend.
    label_contents_map_dict : dict
        Mapping of label identifiers to dictionaries holding at least a
        ``'description'`` entry. The mapping is mutated in place (``'example'``,
        ``'prompt'`` and ``'response'`` are added).
    body_intro : str, optional
        Introductory text included in prompts.
    task : str, optional
        Task identifier used to select examples from the examples file.
    n_responses : int, optional
        Number of responses requested from the model (prompt field ``number``).
    p_lang : str, default='en'
        Prompt language passed to the prompt builder.
    o_lang : str, optional
        Output language requested in the prompt.
    output_format : str, optional
        Output format requested in the prompt (prompt field ``format``).
    len_constraint : str or int, optional
        Length constraint used inside prompts.
    eg_filename : str, default='static_examples.json'
        Filename (under ``{root}/input``) with static examples in JSON format.
    eg_strategy : {'static', 'mixed', None}, default='static'
        Example strategy:
        - ``'static'``: always inject the static example of the selected task.
        - ``'mixed'``: inject the static example only when an entry has none.
        - falsy: no ``example`` field is passed to the prompt builder.

    Attributes
    ----------
    date : str
        Current date formatted as ``'%d%m%Y'``.
    synth_data_df : pandas.DataFrame or None
        Cached synthetic dataset produced by :meth:`get_synt_data_df`.
    prompt_fields : dict
        Dictionary of prompt fields passed to the prompt builder.
    eg_filename : dict
        Parsed content of the examples JSON file (not the file name).

    Notes
    -----
    Subclasses must implement :meth:`api_call`.
    """

    def __init__(
            self,
            root,
            api_key,
            model,
            label_contents_map_dict,
            body_intro=None,
            task=None,
            n_responses=None,
            p_lang='en',
            o_lang=None,
            output_format=None,
            len_constraint=None,
            eg_filename='static_examples.json',
            eg_strategy='static'
    ):
        self.date = datetime.now().strftime('%d%m%Y')
        self.root = root
        # data
        self.label_contents_map_dict = label_contents_map_dict
        self.synth_data_df = None
        # prompt data
        self.prompt_fields = {
            'body_intro': body_intro,
            'task': task,
            'number': n_responses,
            'language': o_lang,
            'format': output_format,
            'len_constraint': len_constraint
        }
        self.prompt_maker = PromptSyntDataMaker(root, prompt_lang=p_lang)
        # NOTE: despite its name, this attribute holds the parsed JSON content.
        self.eg_filename = read_json(
            os.path.join(self.root, 'input', eg_filename)
        )
        self.eg_strategy = eg_strategy
        self.model = model
        self.api_key = api_key
        self.get_examples()

    def get_examples(self):
        """
        Populate ``'example'`` in each entry of `label_contents_map_dict`.

        Nothing happens unless a `task` is set and the strategy is ``'static'``
        or ``'mixed'``:

        - ``'static'``: every entry receives the example stored for the task.
        - ``'mixed'``: only entries without a truthy ``'example'`` receive it.

        Returns
        -------
        None
            `self.label_contents_map_dict` is updated in place.

        Raises
        ------
        KeyError
            If an example is needed and the task is absent from the examples.
        """
        task = self.prompt_fields.get('task')
        if not task or self.eg_strategy not in ('static', 'mixed'):
            return
        only_missing = self.eg_strategy == 'mixed'
        for data_d in self.label_contents_map_dict.values():
            if only_missing and data_d.get('example'):
                continue
            # Looked up lazily so a missing task only fails when actually used.
            data_d['example'] = self.eg_filename[task]

    def api_call(self, prompt):
        """
        Call the underlying LLM backend to generate text from a prompt.

        Parameters
        ----------
        prompt : str
            Fully rendered prompt to send to the model.

        Returns
        -------
        str
            Model-generated content as plain text.

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses must override the method.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement api_call(prompt)."
        )

    def _render_prompt(self, data_d):
        """Build the prompt for one entry, store it under ``'prompt'`` and return it."""
        self.prompt_fields['description'] = data_d.get('description')
        if self.eg_strategy:
            self.prompt_fields['example'] = data_d.get('example')
        prompt = self.prompt_maker.get_prompt(self.prompt_fields)
        data_d['prompt'] = prompt
        return prompt

    def set_api_responses(self):
        """
        Generate a prompt for each entry and store the LLM response.

        For each value in `self.label_contents_map_dict` this method:
        1) injects the entry description (and, when a strategy is set, its
           example) into `self.prompt_fields`,
        2) builds the prompt with the prompt builder,
        3) stores it under ``data_d['prompt']``,
        4) calls :meth:`api_call` and stores the result under
           ``data_d['response']``.

        Returns
        -------
        None
            Results are stored in place.
        """
        self.prompt_maker.logger.info('Setting api responses...')
        entries = self.label_contents_map_dict.values()
        for data_d in tqdm(entries, total=len(entries)):
            prompt = self._render_prompt(data_d)
            data_d['response'] = self.api_call(prompt)

    def get_synt_data_df(self):
        """
        Query the model for every label and return the responses as a DataFrame.

        Calls :meth:`set_api_responses`, then splits each stored response on
        newlines and creates one row per line (blank lines included; they can
        be removed later with :meth:`clean_data_df`).

        Returns
        -------
        pandas.DataFrame
            One row per generated line with columns ``'label'`` (key of
            `label_contents_map_dict`) and ``'literal'`` (the line).

        Side Effects
        ------------
        Updates `self.synth_data_df` and performs the backend calls.
        """
        index_ = 0
        row_dict = {}
        self.set_api_responses()
        self.prompt_maker.logger.info('Getting response df...')
        labelled_entries = self.label_contents_map_dict.items()
        for label, data_d in tqdm(labelled_entries, total=len(labelled_entries)):
            for sentence in data_d['response'].split('\n'):
                row_dict[index_] = {
                    'label': label,
                    'literal': sentence
                }
                index_ += 1
        self.synth_data_df = pd.DataFrame.from_dict(row_dict, orient='index')
        return self.synth_data_df

    @staticmethod
    def clean_data_df(data_df, label_col, text_col):
        """
        Keep only rows with a non-empty label and a non-empty text.

        Rows with missing values in either column are dropped, both columns
        are cast to ``str`` and stripped, and rows left with an empty string
        are dropped as well. The input DataFrame is not modified.

        Parameters
        ----------
        data_df : pandas.DataFrame
            Input data.
        label_col : str
            Name of the label column.
        text_col : str
            Name of the text column.

        Returns
        -------
        tuple of (pandas.DataFrame, int)
            The cleaned two-column DataFrame and the number of rows removed.

        Raises
        ------
        ValueError
            If a required column is missing or no valid row remains.
        """
        if label_col not in data_df.columns or text_col not in data_df.columns:
            raise ValueError(f"DataFrame must have columns '{label_col}' and '{text_col}'")

        pruned_df = data_df[[label_col, text_col]].copy()
        pruned_df = pruned_df.dropna(subset=[label_col, text_col])

        pruned_df[text_col] = pruned_df[text_col].astype(str).str.strip()
        pruned_df[label_col] = pruned_df[label_col].astype(str).str.strip()

        pruned_df = pruned_df[(pruned_df[text_col] != "") & (pruned_df[label_col] != "")]
        if pruned_df.empty:
            raise ValueError("No valid examples found in the DataFrame")
        return pruned_df, len(data_df) - len(pruned_df)

    @staticmethod
    def get_class_distribution(label_l):
        """
        Compute label frequency statistics for a classification dataset.

        Parameters
        ----------
        label_l : list
            List of labels (one per example).

        Returns
        -------
        examples_per_label : collections.Counter
            Counts per label.
        n_labels : int
            Number of unique labels.
        min_label : int
            Minimum examples among labels.
        max_label : int
            Maximum examples among labels.
        ratio_max_min : float or None
            Ratio ``max_label / min_label``. None if `min_label` is 0.
        sub_rep : list
            Labels whose count is below the first of the 25-quantile cut
            points (under-represented).
        over_rep : list
            Labels whose count is above the last of the 25-quantile cut
            points (over-represented).

        Raises
        ------
        ValueError
            If `label_l` is empty.

        Notes
        -----
        Cut points come from ``statistics.quantiles(..., n=25)``. With fewer
        than two distinct labels no cut points can be computed on every
        supported Python version, so both lists are empty in that case.
        """
        examples_per_label = Counter(label_l)
        if not examples_per_label:
            raise ValueError("label_l must contain at least one label")
        counts = list(examples_per_label.values())
        n_labels = len(counts)
        min_label = min(counts)
        max_label = max(counts)
        if n_labels >= 2:
            q_list = stats.quantiles(counts, n=25)
            low_cut, high_cut = q_list[0], q_list[23]
            sub_rep = [key for key, value in examples_per_label.items() if value < low_cut]
            over_rep = [key for key, value in examples_per_label.items() if value > high_cut]
        else:
            sub_rep, over_rep = [], []
        ratio_max_min = max_label / min_label if min_label > 0 else None

        return examples_per_label, n_labels, min_label, max_label, ratio_max_min, sub_rep, over_rep

    @staticmethod
    def count_words(t: str) -> int:
        """
        Count whitespace-separated words in a string.

        Parameters
        ----------
        t : str
            Input text.

        Returns
        -------
        int
            Number of tokens obtained via ``t.split()``.
        """
        return len(t.split())

    @staticmethod
    def _length_summary(lengths):
        """Return min/max/mean/quartiles/population-std of a non-empty list of lengths."""
        if len(lengths) > 1:
            q1, q2, q3 = stats.quantiles(lengths, n=4)
            std_dev = stats.pstdev(lengths)
        else:
            # Quartiles are undefined for a single observation.
            q1 = q2 = q3 = None
            std_dev = 0.0
        return {
            "min": min(lengths),
            "max": max(lengths),
            "mean": stats.mean(lengths),
            "q1": q1,
            "q2": q2,
            "q3": q3,
            "std_dev": std_dev,
        }

    @staticmethod
    def get_length_global_stats(text_l):
        """
        Compute global descriptive statistics of text length (in words).

        Parameters
        ----------
        text_l : list of str
            List of texts.

        Returns
        -------
        dict
            Dictionary with keys: ``min``, ``max``, ``mean``, ``q1``, ``q2``,
            ``q3``, and ``std_dev`` (population std). Quartiles are None when
            only one text is given, as in :meth:`get_length_class_stats`.

        Raises
        ------
        ValueError
            If `text_l` is empty.
        """
        lengths = [SyntDataGenerator.count_words(t) for t in text_l]
        if not lengths:
            raise ValueError("text_l must contain at least one text")
        return SyntDataGenerator._length_summary(lengths)

    @staticmethod
    def get_length_class_stats(text_l, label_l):
        """
        Compute per-label descriptive statistics of text length (in words).

        Parameters
        ----------
        text_l : list of str
            List of texts.
        label_l : list
            Labels aligned with `text_l`.

        Returns
        -------
        dict
            Mapping ``label -> stats`` where stats includes: ``n``, ``min``,
            ``max``, ``mean``, ``q1``, ``q2``, ``q3``, and ``std_dev``.
            Quartiles are None when a label has only one example.
        """
        lengths_per_label = defaultdict(list)
        for lab, txt in zip(label_l, text_l):
            lengths_per_label[lab].append(SyntDataGenerator.count_words(txt))
        return {
            lab: {"n": len(lengths), **SyntDataGenerator._length_summary(lengths)}
            for lab, lengths in lengths_per_label.items()
        }

    @staticmethod
    def get_tokens_stats(text_l, label_l, n_top_vocab):
        """
        Compute vocabulary statistics globally and per label.

        Texts are lowercased and tokenized with the regex pattern ``\\w+``.

        Parameters
        ----------
        text_l : list of str
            List of texts.
        label_l : list
            Labels aligned with `text_l`.
        n_top_vocab : int
            Number of top global tokens to return.

        Returns
        -------
        vocab_size : int
            Number of unique tokens in the full corpus.
        lexical_diversity : float
            Ratio ``vocab_size / n_tokens`` (0.0 if no tokens).
        top_vocab : list of tuple
            ``(token, count)`` pairs for the `n_top_vocab` most frequent tokens.
        vocab_per_label : dict
            Mapping ``label -> {'vocab_size': int, 'lexical_diversity': float}``.
        """
        token_pattern = re.compile(r"\w+", re.UNICODE)
        all_tokens = []
        tokens_per_label = defaultdict(list)

        for lab, txt in zip(label_l, text_l):
            tokens = token_pattern.findall(txt.lower())
            all_tokens.extend(tokens)
            tokens_per_label[lab].extend(tokens)

        if all_tokens:
            vocab_counter = Counter(all_tokens)
            vocab_size = len(vocab_counter)
            lexical_diversity = vocab_size / len(all_tokens)
            top_vocab = vocab_counter.most_common(n_top_vocab)
        else:
            vocab_size = 0
            lexical_diversity = 0.0
            top_vocab = []

        vocab_per_label = {}
        for lab, toks in tokens_per_label.items():
            n_unique = len(set(toks))
            vocab_per_label[lab] = {
                "vocab_size": n_unique,
                "lexical_diversity": n_unique / len(toks) if toks else 0.0,
            }
        return vocab_size, lexical_diversity, top_vocab, vocab_per_label

    @staticmethod
    def duplicated_text_stat(texts_l, labels_l, n_top_duplicates, n_examples):
        """
        Compute duplicate-text statistics for a labeled dataset.

        Texts are normalized using ``strip().lower()`` to detect duplicates.

        Parameters
        ----------
        texts_l : list of str
            Text examples.
        labels_l : list
            Labels aligned with `texts_l`.
        n_top_duplicates : int
            Maximum number of duplicated texts to include in the detailed output.
        n_examples : int
            Total number of examples (used to compute the repeated ratio).

        Returns
        -------
        n_duplicated_texts : int
            Number of *distinct* normalized texts that appear more than once.
        n_examples_in_duplicated_texts : int
            Total number of examples that belong to duplicated texts.
        n_repeated_examples : int
            Number of repeated examples beyond the first occurrence of each
            duplicated text.
        ratio_repeated_examples : float
            Ratio ``n_repeated_examples / n_examples`` (0.0 when `n_examples`
            is 0).
        top_duplicates : list of dict
            Dictionaries with keys ``text_norm``, ``frequency`` and ``labels``
            (sorted list of labels where the text appears), most frequent first.
        """
        texts_norm = [t.strip().lower() for t in texts_l]
        text_freq = Counter(texts_norm)

        duplicated_counts = [c for c in text_freq.values() if c > 1]
        n_duplicated_texts = len(duplicated_counts)
        n_examples_in_duplicated_texts = sum(duplicated_counts)
        n_repeated_examples = n_examples_in_duplicated_texts - n_duplicated_texts
        ratio_repeated_examples = n_repeated_examples / n_examples if n_examples else 0.0

        # For each normalized text, which labels does it appear in?
        labels_per_text = defaultdict(set)
        for lab, key in zip(labels_l, texts_norm):
            labels_per_text[key].add(lab)

        top_duplicates = []
        for text_n, c in text_freq.most_common():
            # most_common is sorted by frequency, so the first singleton ends the scan.
            if c <= 1:
                break
            top_duplicates.append(
                {
                    "text_norm": text_n,
                    "frequency": c,
                    "labels": sorted(labels_per_text[text_n]),
                }
            )
            if len(top_duplicates) >= n_top_duplicates:
                break
        return n_duplicated_texts, n_examples_in_duplicated_texts, n_repeated_examples, ratio_repeated_examples, top_duplicates

    @staticmethod
    def get_meta_analysis_data_df(
            df,
            label_col="label",
            text_col="literal",
            n_top_vocab=30,
            n_top_duplicates=20,
    ):
        """
        Run a meta-analysis on a labeled text dataset.

        Computes class distribution, length statistics (global and per label),
        vocabulary statistics (global and per label) and duplicate-text
        statistics on the cleaned data.

        Parameters
        ----------
        df : pandas.DataFrame
            Input DataFrame containing at least `label_col` and `text_col`.
        label_col : str, default='label'
            Column name containing labels.
        text_col : str, default='literal'
            Column name containing text samples.
        n_top_vocab : int, default=30
            Number of most frequent tokens in the global vocabulary summary.
        n_top_duplicates : int, default=20
            Maximum number of duplicated texts in the detailed duplicates list.

        Returns
        -------
        summary : dict
            Nested dictionary with the top-level keys ``dataset``, ``lengths``,
            ``vocabulary`` and ``duplicates``. ``dataset['n_empy_text']`` is
            the number of rows removed by :meth:`clean_data_df`.

        Raises
        ------
        ValueError
            If the DataFrame lacks the required columns or becomes empty after
            cleaning.
        """
        df, n_dropped_rows = SyntDataGenerator.clean_data_df(df, label_col, text_col)
        labels = df[label_col].tolist()
        texts = df[text_col].tolist()
        n_examples = len(texts)
        (
            examples_per_label, n_labels, min_label, max_label,
            ratio_max_min, sub_rep, over_rep
        ) = SyntDataGenerator.get_class_distribution(labels)
        length_stats_global = SyntDataGenerator.get_length_global_stats(texts)
        length_stats_per_label = SyntDataGenerator.get_length_class_stats(texts, labels)
        vocab_size, lexical_diversity, top_vocab, vocab_per_label = SyntDataGenerator.get_tokens_stats(
            texts,
            labels,
            n_top_vocab
        )
        (
            n_duplicated_texts, n_examples_in_duplicated_texts,
            n_repeated_examples, ratio_repeated_examples, top_duplicates
        ) = SyntDataGenerator.duplicated_text_stat(
            texts,
            labels,
            n_top_duplicates,
            n_examples
        )
        summary = {
            "dataset": {
                "n_empy_text": n_dropped_rows,
                "n_examples": n_examples,
                "n_labels": n_labels,
                "examples_per_label": dict(examples_per_label),
                'n_max_label': max_label,
                'n_min_label': min_label,
                "ratio_max_min_label": ratio_max_min,
                # Thresholds are the first/last cut points of a 25-quantile split.
                "classes_under_5%_data": sub_rep,
                "classes_over_95%_data": over_rep
            },
            "lengths": {
                "global": length_stats_global,
                "per_label": length_stats_per_label,
            },
            "vocabulary": {
                "vocab_size_global": vocab_size,
                "lexical_diversity_global": lexical_diversity,
                "top_vocab_global": top_vocab,
                "per_label": vocab_per_label,
            },
            "duplicates": {
                "n_duplicated_texts": n_duplicated_texts,
                "n_examples_in_duplicated_texts": n_examples_in_duplicated_texts,
                "n_repeated_examples": n_repeated_examples,
                "ratio_repeated_examples": ratio_repeated_examples,
                "top_duplicates": top_duplicates,
            },
        }
        return summary


class OnyxiaSyntDataGenerator(SyntDataGenerator):
    """Synthetic data generator that queries the Onyxia (SSP Cloud) chat-completions API."""

    def api_call(self, prompt):
        """
        Call the Onyxia LLM chat completion endpoint.

        Parameters
        ----------
        prompt : str
            User prompt to send to the remote chat-completions API.

        Returns
        -------
        str
            The model reply content extracted from the JSON response.

        Raises
        ------
        requests.RequestException
            If the HTTP request fails or the server answers with an error
            status code.
        KeyError
            If the expected JSON fields are missing in the response payload.
        """
        url = "https://llm.lab.sspcloud.fr/api/chat/completions"
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json'
        }
        data = {
            "model": f"{self.model}",
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        }
        response = requests.post(url, headers=headers, json=data)
        # Surface HTTP errors as such instead of a confusing KeyError below.
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]


class GroqSyntDataGenerator(SyntDataGenerator):
    """
    Groq-based synthetic data generator with API-key rotation.

    Extends :class:`SyntDataGenerator` by accepting a list of API keys
    (`api_key_l`) that are rotated when rate limits are reached.

    Parameters
    ----------
    root, model, label_contents_map_dict, body_intro, task, n_responses, p_lang, o_lang, output_format, len_constraint, eg_filename, eg_strategy
        See :class:`SyntDataGenerator`.
    api_key_l : list of str
        List of Groq API keys used for key rotation.
    api_key : str or None, optional
        Initial key passed to the parent. It is replaced by elements of
        `api_key_l` when :meth:`set_api_responses` runs.
    """

    def __init__(
            self,
            root,
            api_key_l,
            model,
            label_contents_map_dict,
            body_intro,
            api_key=None,
            task=None,
            n_responses=None,
            p_lang='en',
            o_lang=None,
            output_format=None,
            len_constraint=None,
            eg_filename='static_examples.json',
            eg_strategy='static'
    ):
        super().__init__(
            root=root,
            model=model,
            label_contents_map_dict=label_contents_map_dict,
            body_intro=body_intro,
            api_key=api_key,
            task=task,
            n_responses=n_responses,
            p_lang=p_lang,
            o_lang=o_lang,
            output_format=output_format,
            len_constraint=len_constraint,
            eg_filename=eg_filename,
            eg_strategy=eg_strategy
        )
        self.api_key_l = api_key_l

    def api_call(self, prompt):
        """
        Call the Groq chat completion API with the current ``self.api_key``.

        Parameters
        ----------
        prompt : str
            User prompt to send to Groq.

        Returns
        -------
        str
            Model-generated text, stripped of leading/trailing whitespace.

        Raises
        ------
        Exception
            Propagates exceptions raised by the Groq client (e.g.,
            authentication, connectivity, or rate limiting).
        """
        client = Groq(api_key=self.api_key)
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=self.model,
            temperature=0.4
        )
        return chat_completion.choices[0].message.content.strip()

    def set_api_responses(self):
        """
        Generate prompts and store Groq responses, rotating keys on rate limits.

        Behaves like :meth:`SyntDataGenerator.set_api_responses`, but when a
        ``RateLimitError`` with status code 429 is raised it switches to the
        next key of `self.api_key_l` and retries the same prompt.

        Returns
        -------
        None
            Responses are stored in place in `self.label_contents_map_dict`.

        Raises
        ------
        ValueError
            If `api_key_l` is empty.
        RuntimeError
            If all API keys in `api_key_l` have reached their limits.
        RateLimitError
            If a rate-limit error with a status code other than 429 occurs.
        """
        if not self.api_key_l:
            raise ValueError("api_key_l must contain at least one API key.")
        self.prompt_maker.logger.info('Setting api responses...')
        x = 0
        self.api_key = self.api_key_l[x]
        entries = self.label_contents_map_dict.values()
        for data_d in tqdm(entries, total=len(entries)):
            prompt = self._render_prompt(data_d)
            while True:
                try:
                    data_d['response'] = self.api_call(prompt)
                    break
                except RateLimitError as e:
                    if e.status_code != 429:
                        # Not a quota problem: retrying with another key will not help.
                        raise
                    print(f"Key {x} limit reached, switching key...")
                    x += 1
                    # Check bounds BEFORE indexing so exhaustion is reported clearly.
                    if x >= len(self.api_key_l):
                        raise RuntimeError("All API keys have reached their limits.") from e
                    self.api_key = self.api_key_l[x]


class AzureSyntDataGenerator(SyntDataGenerator):
    """
    Synthetic data generator that queries an Azure OpenAI deployment.

    Attributes
    ----------
    azure_endpoint : str
        Azure OpenAI resource endpoint. Override in a subclass or on an
        instance to target another resource.
    api_version : str
        Azure OpenAI API version sent with each request.
    """

    azure_endpoint = "https://open-ai-ine.cognitiveservices.azure.com/"
    api_version = "2024-12-01-preview"

    def api_call(self, prompt):
        """
        Call an Azure OpenAI chat completion deployment.

        ``self.model`` is used as the deployment name and ``self.api_key`` as
        the subscription key.

        Parameters
        ----------
        prompt : str
            User prompt to send to the Azure OpenAI chat completion endpoint.

        Returns
        -------
        str
            The model reply content.

        Raises
        ------
        Exception
            Propagates exceptions raised by the Azure OpenAI client (e.g.,
            authentication, connectivity, quota/rate limits).
        """
        client = AzureOpenAI(
            api_version=self.api_version,
            azure_endpoint=self.azure_endpoint,
            api_key=self.api_key,
        )

        response = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert in generation of sentences for NLP model trainning",
                },
                {
                    "role": "user",
                    "content": f"{prompt}",
                }
            ],
            max_tokens=4096,
            temperature=1.0,
            top_p=1.0,
            model=self.model
        )
        return response.choices[0].message.content
+ ], + "id": "b77e2226a35e6965" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:40:59.203901Z", + "start_time": "2026-02-16T11:40:59.194183Z" + } + }, + "cell_type": "code", + "source": [ + "from prompt_maker import PromptMaker\n", + "# help(PromptMaker)" + ], + "id": "4d9ef3f89b340861", + "outputs": [], + "execution_count": 2 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T12:45:27.108404Z", + "start_time": "2026-02-16T12:45:27.104243Z" + } + }, + "cell_type": "code", + "source": [ + "prompt_template = 'Hi. [intro], [question]' # The class will examine patterns such as '-' or '[-]', as in the example.\n", + "values = {\n", + " \"intro\": \"Who are you?\",\n", + " \"question\": \"Do you know where the nearest supermarket is?\"\n", + "}\n", + "\n" + ], + "id": "e73abfb0a6f72d01", + "outputs": [], + "execution_count": 55 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T12:45:28.678636Z", + "start_time": "2026-02-16T12:45:28.658947Z" + } + }, + "cell_type": "code", + "source": [ + "prompt_basic = PromptMaker(\n", + " prompt_template=prompt_template,\n", + " plh_replace_map_dict=values,\n", + " gap_ph_pattern='[-]' # Patterns must be specified using the open delimiter - close delimiter format.\n", + ")" + ], + "id": "d606c6a0c3aa6c41", + "outputs": [], + "execution_count": 56 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T12:45:30.437767Z", + "start_time": "2026-02-16T12:45:30.431077Z" + } + }, + "cell_type": "code", + "source": [ + "#help(prompt_basic)\n", + "prompt_basic.fill_prompt() # This main method fills the placeholders using the provided dictionary." + ], + "id": "2430a0708ee54f09", + "outputs": [ + { + "data": { + "text/plain": [ + "'Hi. 
Who are you?, Do you know where the nearest supermarket is?'" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 57 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:11.457313Z", + "start_time": "2026-02-16T11:41:11.452881Z" + } + }, + "cell_type": "code", + "source": [ + "prompt_template = 'Hi, *intro*, *question*'\n", + "values = {\n", + " 'question': 'Who are you?',\n", + " 'intro': \"Do you know where the nearest supermarket is?\"\n", + "}\n", + "\n", + "prompt_basic = PromptMaker(\n", + " prompt_template=prompt_template,\n", + " plh_replace_map_dict=values, # new pattern\n", + " gap_ph_pattern='*-*'\n", + ")" + ], + "id": "e955b84b759db9b9", + "outputs": [], + "execution_count": 6 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:12.300381Z", + "start_time": "2026-02-16T11:41:12.293805Z" + } + }, + "cell_type": "code", + "source": "prompt_basic.fill_prompt()", + "id": "abb46121c659511b", + "outputs": [ + { + "data": { + "text/plain": [ + "'Hi, Do you know where the nearest supermarket is?, Who are you?'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 7 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:15.513570Z", + "start_time": "2026-02-16T11:41:14.650649Z" + } + }, + "cell_type": "code", + "source": [ + "# All keys specified in the prompt template must be provided (in this case, question and introduction).\n", + "prompt_basic.plh_replace_map_dict = {\n", + " 'question': 'Who are you?',\n", + "}\n", + "\n", + "print(prompt_basic.fill_prompt())" + ], + "id": "aae9b7344bce4f97", + "outputs": [ + { + "ename": "KeyError", + "evalue": "'intro'", + "output_type": "error", + "traceback": [ + "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", 
+ "Cell \u001B[1;32mIn[8], line 6\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;66;03m# All keys specified in the prompt template must be provided (in this case, question and introduction).\u001B[39;00m\n\u001B[0;32m 2\u001B[0m prompt_basic\u001B[38;5;241m.\u001B[39mplh_replace_map_dict \u001B[38;5;241m=\u001B[39m {\n\u001B[0;32m 3\u001B[0m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mquestion\u001B[39m\u001B[38;5;124m'\u001B[39m: \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mWho are you?\u001B[39m\u001B[38;5;124m'\u001B[39m,\n\u001B[0;32m 4\u001B[0m }\n\u001B[1;32m----> 6\u001B[0m \u001B[38;5;28mprint\u001B[39m(prompt_basic\u001B[38;5;241m.\u001B[39mfill_prompt())\n", + "File \u001B[1;32m~\\Documents\\metodologia y de muestras\\Programacion\\Gitlab_Projects\\statcodgen\\synt_data_maker\\prompt_maker.py:240\u001B[0m, in \u001B[0;36mPromptMaker.fill_prompt\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m 228\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfill_prompt\u001B[39m(\u001B[38;5;28mself\u001B[39m):\n\u001B[0;32m 229\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[0;32m 230\u001B[0m \u001B[38;5;124;03m Fill the object's prompt template using the instance value dictionary.\u001B[39;00m\n\u001B[0;32m 231\u001B[0m \u001B[38;5;124;03m Returns\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 238\u001B[0m \u001B[38;5;124;03m This is a convenience wrapper around :meth:`fill_text`.\u001B[39;00m\n\u001B[0;32m 239\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m--> 240\u001B[0m filled_prompt \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mfill_text(\n\u001B[0;32m 241\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mprompt_template,\n\u001B[0;32m 242\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mplh_replace_map_dict,\n\u001B[0;32m 243\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgap_ph_pattern\n\u001B[0;32m 244\u001B[0m 
)\n\u001B[0;32m 245\u001B[0m final_prompt \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;241m.\u001B[39mjoin(line\u001B[38;5;241m.\u001B[39mstrip() \u001B[38;5;28;01mfor\u001B[39;00m line \u001B[38;5;129;01min\u001B[39;00m filled_prompt\u001B[38;5;241m.\u001B[39msplitlines() \u001B[38;5;28;01mif\u001B[39;00m line\u001B[38;5;241m.\u001B[39mstrip())\n\u001B[0;32m 246\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m final_prompt\n", + "File \u001B[1;32m~\\Documents\\metodologia y de muestras\\Programacion\\Gitlab_Projects\\statcodgen\\synt_data_maker\\prompt_maker.py:225\u001B[0m, in \u001B[0;36mPromptMaker.fill_text\u001B[1;34m(template, text_dict, pattern)\u001B[0m\n\u001B[0;32m 223\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m gap_key \u001B[38;5;129;01min\u001B[39;00m gap_keys:\n\u001B[0;32m 224\u001B[0m gap \u001B[38;5;241m=\u001B[39m PromptMaker\u001B[38;5;241m.\u001B[39mfill_gap(pattern, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m-\u001B[39m\u001B[38;5;124m'\u001B[39m, gap_key)\n\u001B[1;32m--> 225\u001B[0m final_text \u001B[38;5;241m=\u001B[39m PromptMaker\u001B[38;5;241m.\u001B[39mfill_gap(final_text, gap, text_dict[gap_key])\n\u001B[0;32m 226\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m final_text\n", + "\u001B[1;31mKeyError\u001B[0m: 'intro'" + ] + } + ], + "execution_count": 8 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:15.808788Z", + "start_time": "2026-02-16T11:41:15.800332Z" + } + }, + "cell_type": "code", + "source": [ + "# But you can provide extra keys\n", + "prompt_basic.plh_replace_map_dict = {\n", + " 'question': 'Who are you?',\n", + " 'intro': \"Do you know where the nearest supermarket is?\",\n", + " 'extra': 'extra'\n", + "}\n", + "print(prompt_basic.fill_prompt())" + ], + "id": "34277d0e1043f21a", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hi, Do you know where the nearest supermarket is?, Who 
are you?\n" + ] + } + ], + "execution_count": 9 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T10:01:02.331723Z", + "start_time": "2026-02-05T10:01:02.327872Z" + } + }, + "cell_type": "markdown", + "source": "## PromptSyntDataMaker", + "id": "2a007bd9e244707e" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:19.377983Z", + "start_time": "2026-02-16T11:41:19.306804Z" + } + }, + "cell_type": "code", + "source": [ + "from prompt_synt_data_maker import PromptSyntDataMaker\n", + "# help(PromptSyntDataMaker)" + ], + "id": "3cff368811d68843", + "outputs": [], + "execution_count": 10 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-01-22T18:02:11.201163Z", + "start_time": "2026-01-22T18:02:11.195480Z" + } + }, + "cell_type": "markdown", + "source": [ + "### Our strategy\n", + "This class implements the generation of synthetic data to train an AI model that classifies Norwegian Bokmål (NB) text according to the NACE standard. The prompt-making strategy uses nested templates and parameter dictionaries to generate synthetic data based on the explanatory notes of the standard.\n", + "\n", + "To run the code below, your project root directory must have the following structure:\n", + "```\n", + " root (dir):\n", + " input (dir):\n", + " *all of the repository's templates from synt_data_maker/input*\n", + "```\n", + "Our approach divides the prompt into three parts:\n", + "- **body**: where the role (body_intro), task (task), and output parameters (body_param) are defined.\n", + " - input/body_template.json: basic template for writing the body.\n", + " - input/body_param_template.json: basic template of the body parameters.\n", + "- **example**: where an example of the task is added.\n", + " - input/example_template.json: basic template for writing the example.\n", + "- **objetive**: where the explanatory note is added.\n", + " - input/objetive_template.json: basic template for writing the objective."
+ ], + "id": "12946a255b974b" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:21.273190Z", + "start_time": "2026-02-16T11:41:21.227382Z" + } + }, + "cell_type": "code", + "source": [ + "root = os.path.abspath('') # Add the root directory where the `input` folder is located.\n", + "# print(root)\n", + "prompt_synt = PromptSyntDataMaker(\n", + " root = root\n", + ")" + ], + "id": "46c4f3cafe7a5c25", + "outputs": [], + "execution_count": 11 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:22.150632Z", + "start_time": "2026-02-16T11:41:22.146606Z" + } + }, + "cell_type": "code", + "source": [ + "# The full `prompt_template` is structured as follows:\n", + "prompt_template = prompt_synt.join_templates('[body]\\n[example]\\n[objetive]')\n", + "print('\\nPrompt template\\n',prompt_template)" + ], + "id": "29d53b67d5076316", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Prompt template\n", + " **body_intro**\n", + "**task**\n", + "Write the examples in **language**.\n", + "Write **number** examples.\n", + "The examples must contain **len_constraint** words.\n", + "The examples must be **output_format**.\n", + "You must only answer with examples; do not provide an introduction.\n", + "Here is an example:**example**\n", + "Now it's your turn:Description:**description**\n", + "Key words:**key_words**\n" + ] + } + ], + "execution_count": 12 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:23.125787Z", + "start_time": "2026-02-16T11:41:23.120452Z" + } + }, + "cell_type": "code", + "source": [ + "# PromptSyntDataMaker.write_prompt() parameters\n", + "print(prompt_basic.get_gap_keys(prompt_template, '**-**'))" + ], + "id": "26c23b0d394ea4cb", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['body_intro', 'task', 'language', 'number', 'len_constraint', 'output_format', 'example', 'description', 'key_words']\n" + ] + } + ], + 
"execution_count": 13 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "There are nine placeholders to fill in the template. Five must be provided by the user, and four are parameterized.\n", + "\n", + "1) **body_intro** (free text): Here, you must define the role of the model.\n", + "\n", + "2) **task** (parameterized):\n", + " - *generate*: Instructs the model to generate new sentences that fit a given description.\n", + " - *process*: Instructs the model to extract all examples of economic activities included in a description.\n", + " - *combine*: Instructs the model to generate new sentences that fit the description using specific keywords.\n", + "3) **language** (parameterized): Indicates the language in which the synthetic output data must be written.\n", + " - *sp*: Generate sentences in Spanish.\n", + " - *en*: Generate sentences in English.\n", + " - *nb*: Generate sentences in Norwegian Bokmål.\n", + "4) **number** (free text): Number of examples the model must generate.\n", + "5) **len_constraint** (parameterized): Sentence length constraint.\n", + " - *short*: Between 1–5 words.\n", + " - *medium*: Between 5–15 words.\n", + " - *long*: More than 15 words.\n", + "6) **output_format** (parameterized): Output format.\n", + " - *lines*: Sentences must be separated by line breaks.\n", + "7) **example** (free text): An example of the expected output.\n", + "8) **description** (free text): Explanatory note.\n", + "9) **key_words** (free text, list format): A list of keywords. Do not include this parameter unless the task is *combine*." 
+ ], + "id": "143008b0e0812596" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:24.698819Z", + "start_time": "2026-02-16T11:41:24.695414Z" + } + }, + "cell_type": "code", + "source": [ + "example = '''\n", + "Description:\n", + "Rice cultivation.\n", + "\n", + "Key words:\n", + "['plantation', 'brown', 'rice', 'basmati']\n", + "\n", + "Output:\n", + "Rice plantation.\n", + "Rice cultivation.\n", + "Brown rice plantation.\n", + "Basmati rice plantation.\n", + "Brown rice cultivation.\n", + "'''\n" + ], + "id": "4b4d8ba83dfe08c3", + "outputs": [], + "execution_count": 14 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:25.520387Z", + "start_time": "2026-02-16T11:41:25.515310Z" + } + }, + "cell_type": "code", + "source": [ + "print(\n", + " prompt_synt.write_prompt(\n", + " body_intro = 'You are an expert in NACE classification',\n", + " task = 'combine',\n", + " language = 'sp',\n", + " len_constraint = 'short',\n", + " number = '5',\n", + " output_format = 'lines',\n", + " example = example,\n", + " description = 'banana cultivation',\n", + " key_words = '[banana, cultivation, plantation]'\n", + " )\n", + ")" + ], + "id": "b124eb8b5a234be3", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You are an expert in NACE classification\n", + "I am going to give you a list of words and a description of economic activities in Norwegian Bokmål. I would like you to create new examples of economic activities by combining the different words. 
The examples must fit with the description\n", + "Write the examples in spanish.\n", + "Write 5 examples.\n", + "The examples must contain between 1-5 words.\n", + "The examples must be separated by line break.\n", + "You must only answer with examples; do not provide an introduction.\n", + "Here is an example:\n", + "Description:\n", + "Rice cultivation.\n", + "Key words:\n", + "['plantation', 'brown', 'rice', 'basmati']\n", + "Output:\n", + "Rice plantation.\n", + "Rice cultivation.\n", + "Brown rice plantation.\n", + "Basmati rice plantation.\n", + "Brown rice cultivation.\n", + "Now it's your turn:\n", + "Description:\n", + "banana cultivation\n", + "Key words:\n", + "[banana, cultivation, plantation]\n" + ] + } + ], + "execution_count": 15 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:26.415272Z", + "start_time": "2026-02-16T11:41:26.409767Z" + } + }, + "cell_type": "code", + "source": [ + "# You can remove any parameter by setting it to `None`, which will remove the corresponding sentences from the prompt.\n", + "print(\n", + " prompt_synt.write_prompt(\n", + " body_intro = None,\n", + " task = 'process',\n", + " language = None,\n", + " len_constraint = None,\n", + " number = None,\n", + " output_format = 'lines',\n", + " example = None,\n", + " description = 'banana cultivation',\n", + " key_words = None\n", + " )\n", + ")" + ], + "id": "e1e9064a3471a163", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I want you to extract all specific examples of economic activities included in a given description.\n", + "The examples must be separated by line break.\n", + "You must only answer with examples; do not provide an introduction.\n", + "Now it's your turn:\n", + "Description:\n", + "banana cultivation\n" + ] + } + ], + "execution_count": 16 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T09:22:50.490546Z", + "start_time": "2026-02-05T09:22:50.487826Z" + } + }, + "cell_type": 
"markdown", + "source": "### Customize your strategy", + "id": "963f9f9227976d45" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-04T13:53:22.327983Z", + "start_time": "2026-02-04T13:53:22.323364Z" + } + }, + "cell_type": "markdown", + "source": "#### Make all gaps free text", + "id": "15b444c39006bce" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:28.732817Z", + "start_time": "2026-02-16T11:41:28.722928Z" + } + }, + "cell_type": "code", + "source": [ + "prompt_synt = PromptSyntDataMaker(\n", + " root = root,\n", + " name_param_dict=None # Not load the param dict\n", + ")" + ], + "id": "2de6f4c182199de5", + "outputs": [], + "execution_count": 17 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:29.550379Z", + "start_time": "2026-02-16T11:41:29.545445Z" + } + }, + "cell_type": "code", + "source": [ + "print(\n", + " prompt_synt.write_prompt(\n", + " body_intro = 'You are an expert in ISCO classification',\n", + " task = 'Give five jobs examples related to an economic activity',\n", + " language = 'german',\n", + " len_constraint = 'between 3 and 10',\n", + " number = None,\n", + " output_format = 'json',\n", + " example = None,\n", + " description = 'banana raise',\n", + " key_words = None\n", + " )\n", + ")" + ], + "id": "cd93e1cf4181a329", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You are an expert in ISCO classification\n", + "Give five jobs examples related to an economic activity\n", + "Write the examples in german.\n", + "The examples must contain between 3 and 10 words.\n", + "The examples must be json.\n", + "You must only answer with examples; do not provide an introduction.\n", + "Now it's your turn:\n", + "Description:\n", + "banana raise\n" + ] + } + ], + "execution_count": 18 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "#### Make your own approach\n", + "All {name}_template.json documents in the input directory share 
the same structure:\n", + "``` python\n", + "{\n", + "\"en\": \"This is an example [gap_to_fill_with_another_template], **gap_to_fill_with_str**\",\n", + "\"sp\": \"Esto es un ejemplo [gap_to_fill_with_another_template], **gap_to_fill_with_str**\",\n", + "\"{lang}\": \"{prompt_template}\"\n", + "}\n", + "```\n", + "For example, PromptSyntDataMaker requires the following inputs:\n", + "```python\n", + "prompt_synt = PromptSyntDataMaker(\n", + " root=None,\n", + " name_root_template='prompt',\n", + " name_param_dict='input/param_dict.json',\n", + " prompt_lang='en', # Indicates the language of the prompt\n", + " template_pattern='[-]', # Pattern used to identify nested templates\n", + " gap_ph_pattern='**-**' # Pattern used to identify text placeholders\n", + ")\n", + "```\n", + "In this case, the class retrieves the 'en' key from prompt_template.json, which must be located in the root/input directory. It then starts an iterative process to identify all text enclosed in [] and loads the corresponding 'en' templates from JSON files named {text}_template.json.\n", + "\n", + "Let's look at an example. Suppose we want to automate a prompt-writing strategy to ask a model to write a poem. The prompt will have two parts: an introduction and a body, where the poem parameters are defined. The prompt can be written in both Spanish and English." 
+ ], + "id": "2af61ccc740aa243" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:31.432993Z", + "start_time": "2026-02-16T11:41:31.426954Z" + } + }, + "cell_type": "code", + "source": [ + "# Step 1: Create the root directory with the required folder structure.\n", + "os.makedirs('example_new_approach/input', exist_ok=True)\n", + "new_root = os.path.abspath('example_new_approach')\n", + "print(os.path.exists(new_root))" + ], + "id": "133daae89966371a", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + } + ], + "execution_count": 19 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:32.262300Z", + "start_time": "2026-02-16T11:41:32.250991Z" + } + }, + "cell_type": "code", + "source": [ + "# Step 2: Create the root `structure_template.json` file and the nested `poem_body_template.json` file.\n", + "\n", + "structure = {\n", + " \"en\": \"**intro**\\n[poem_body]\",\n", + " \"sp\": \"**intro**\\n[poem_body]\",\n", + "}\n", + "\n", + "poem_body = {\n", + " \"en\": \"The poem must be a **type**.\\nMust be about **topic**.\\nWrite only the poem, without any introduction.\",\n", + " \"sp\": \"El poema debe ser **type**.\\nDebe tratar sobre **topic**.\\nEscribe solo el poema, sin ninguna introducción.\",\n", + "}\n", + "\n", + "with open(\n", + " os.path.join(new_root, 'input', 'structure_template.json'),\n", + " 'w',\n", + " encoding='utf-8'\n", + ") as f: # REMEMBER: all template files must end with `_template.json`\n", + " json.dump(structure, f, ensure_ascii=False, indent=4)\n", + "\n", + "with open(\n", + " os.path.join(new_root, 'input', 'poem_body_template.json'),\n", + " 'w',\n", + " encoding='utf-8'\n", + ") as f: # REMEMBER: the filename must match the gap name inside `[]` in `structure_template.json`\n", + " json.dump(poem_body, f, ensure_ascii=False, indent=4)\n" + ], + "id": "4ac72e64bec1cd3e", + "outputs": [], + "execution_count": 20 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": 
"2026-02-16T11:41:33.122941Z", + "start_time": "2026-02-16T11:41:33.108865Z" + } + }, + "cell_type": "code", + "source": [ + "# Now everything is ready to generate the instance\n", + "poem_prompt_maker = PromptSyntDataMaker(\n", + " root=new_root,\n", + " name_root_template='structure', # The name of the root prompt template\n", + " name_param_dict=None, # No parameter dictionary is used\n", + " prompt_lang='en', # Indicates the language of the prompt\n", + " template_pattern='[-]', # Pattern used to identify nested templates\n", + " gap_ph_pattern='**-**' # Pattern used to identify text placeholders\n", + ")\n" + ], + "id": "2213ebe44505f705", + "outputs": [], + "execution_count": 21 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:33.969641Z", + "start_time": "2026-02-16T11:41:33.965484Z" + } + }, + "cell_type": "code", + "source": [ + "# Fill the gaps using the parameter names defined in `**-**`\n", + "print(\n", + " poem_prompt_maker.write_prompt(\n", + " intro='You must write a poem.',\n", + " type='romance',\n", + " topic='love'\n", + " )\n", + ")" + ], + "id": "c6cb8953b8c9d09a", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You must write a poem.\n", + "The poem must be a romance.\n", + "Must be about love.\n", + "Write only the poem, without any introduction.\n" + ] + } + ], + "execution_count": 22 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:35.141056Z", + "start_time": "2026-02-16T11:41:35.136228Z" + } + }, + "cell_type": "code", + "source": [ + "# If a parameter is not provided (set to None), the corresponding sentence will be removed\n", + "print(\n", + " poem_prompt_maker.write_prompt(\n", + " intro='You must write a poem.',\n", + " type=None,\n", + " topic=None\n", + " )\n", + ")\n" + ], + "id": "df90773a3aec85d3", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You must write a poem.\n", + "Write only the poem, without any 
introduction.\n" + ] + } + ], + "execution_count": 23 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:35.971767Z", + "start_time": "2026-02-16T11:41:35.942024Z" + } + }, + "cell_type": "code", + "source": [ + "# Finally, you can also parameterize text by defining another .json file\n", + "poem_param_dict = {\n", + " \"en\": {\n", + " \"topic\": {\n", + " \"l\": \"love\",\n", + " \"c\": \"city\",\n", + " \"w\": \"war\"\n", + " },\n", + " },\n", + " \"sp\": {\n", + " \"topic\": {\n", + " \"l\": \"amor\",\n", + " \"c\": \"ciudad\",\n", + " \"w\": \"guerra\"\n", + " },\n", + " },\n", + "}\n", + "\n", + "with open(os.path.join(new_root, 'input', 'poem_param.json'), 'w', encoding='utf-8') as f:\n", + " json.dump(poem_param_dict, f, ensure_ascii=False, indent=4)\n", + "\n", + "# Now use it to easily fill the text placeholders\n", + "poem_prompt_maker = PromptSyntDataMaker(\n", + " root=new_root,\n", + " name_root_template='structure', # The name of the root prompt template\n", + " name_param_dict='input/poem_param.json', # Relative path to the parameter dictionary\n", + " prompt_lang='sp', # Indicates the language of the prompt\n", + " template_pattern='[-]', # Pattern used to identify nested templates\n", + " gap_ph_pattern='**-**' # Pattern used to identify text placeholders\n", + ")\n", + "\n", + "print('Example with love\\n')\n", + "print(\n", + " poem_prompt_maker.write_prompt(\n", + " intro='Tienes que escribir un poema.',\n", + " type='romance',\n", + " topic='l'\n", + " )\n", + ")\n", + "\n", + "print('\\nExample with war\\n')\n", + "print(\n", + " poem_prompt_maker.write_prompt(\n", + " intro='Tienes que escribir un poema.',\n", + " type='romance',\n", + " topic='w'\n", + " )\n", + ")\n" + ], + "id": "4eeb0a2ebb7c65c6", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Example with love\n", + "\n", + "Tienes que escribir un poema.\n", + "El poema debe ser romance.\n", + "Debe tratar sobre amor.\n", + 
"Escribe solo el poema, sin ninguna introducción.\n", + "\n", + "Example with war\n", + "\n", + "Tienes que escribir un poema.\n", + "El poema debe ser romance.\n", + "Debe tratar sobre guerra.\n", + "Escribe solo el poema, sin ninguna introducción.\n" + ] + } + ], + "execution_count": 24 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-04T14:04:14.458237800Z", + "start_time": "2026-01-21T08:49:40.422173Z" + } + }, + "cell_type": "markdown", + "source": [ + "# Process explanatory notes from a standard\n", + "The aim of this class is to prepare the explanatory notes of a standard in order to generate a dict with the notes, examples, and keywords for each code. First, we show how we worked with the Norwegian NACE standard." + ], + "id": "d231d3cda8bb4b88" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## ProcessNACENB\n", + "\n", + "In this tutorial, three documents are used:\n", + "\n", + "- **klass-version-3218-codes.csv**: The Norwegian Standard Industrial Classification (NACEnb). You can download it [here](https://www.ssb.no/klass/klassifikasjoner/6).\n", + "\n", + "- **train_norwaydata.csv**: A NACEnb-labeled training dataset created by [Statistics Norway (Statistisk sentralbyrå)](https://www.ssb.no/en). It was shared within Standard Working Group 5.\n", + "\n", + "- **NACE Rev. 2.1 - Index entries - Overview_translated_to_NB_with_deepl_complete.csv**: The NACE index published by Eurostat and translated into Norwegian Bokmål using [DeepL](https://www.deepl.com/es/translator). 
Please contact us if you would like access to this file.\n" + ], + "id": "fc4d26921b35bf14" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:41:38.529432Z", + "start_time": "2026-02-16T11:41:38.520625Z" + } + }, + "cell_type": "code", + "source": [ + "from process_notes import ProcessNACENB\n", + "# help(ProcessNACENB)" + ], + "id": "d95e578d059b9083", + "outputs": [], + "execution_count": 25 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:42:33.120668Z", + "start_time": "2026-02-16T11:42:29.843136Z" + } + }, + "cell_type": "code", + "source": [ + "# Load docs\n", + "doc_path = r\"\" #path with the docs\n", + "nb_notes = 'klass-version-3218-codes.csv'\n", + "organic_train_set_nb = 'train_norwaydata.csv'\n", + "index_nb = 'NACE Rev. 2.1 - Index entries - Overview_translated_to_NB_with_deepl_complete.csv'\n", + "\n", + "notes_df = pd.read_csv(\n", + " os.path.join(doc_path, nb_notes),\n", + " dtype='str',\n", + " encoding='latin-1',\n", + " sep=';',\n", + ")\n", + "train_df = pd.read_csv(\n", + " os.path.join(doc_path, organic_train_set_nb),\n", + " dtype='str'\n", + ")\n", + "index_df = pd.read_csv(\n", + " os.path.join(doc_path, index_nb),\n", + " dtype='str'\n", + ")\n" + ], + "id": "a297792be3927f8b", + "outputs": [], + "execution_count": 27 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:42:34.559689Z", + "start_time": "2026-02-16T11:42:34.455070Z" + } + }, + "cell_type": "code", + "source": [ + "# Now generate a sample dictionary to create synthetic data\n", + "process_nb = ProcessNACENB(\n", + " notes_df=notes_df[notes_df['level'] == \"4\"],\n", + " sample=3, # Take only three classes\n", + " random_state=23, # For reproducibility\n", + " examples_df=index_df, # Initially use the index to extract keywords\n", + " n_examples=10 # Number of real examples and keyword-based examples\n", + ")\n", + "\n", + "process_nb.set_descriptions(\n", + " col_c='code',\n", + " col_d_l=['name', 
'notes']\n", + ")\n", + "\n", + "process_nb.set_key_words(\n", + " col_c='CODE',\n", + " col_k='KEYWORD_NB'\n", + ")\n", + "\n", + "process_nb.example_df = train_df # Now use the real labeled examples\n", + "\n", + "process_nb.set_examples(\n", + " col_c='nace_21_4digit_code',\n", + " col_d='text_for_coding'\n", + ")\n", + "\n", + "nace_nb_dict = process_nb.get_dict_data()" + ], + "id": "ff5a37ed20505782", + "outputs": [], + "execution_count": 28 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:42:37.486529Z", + "start_time": "2026-02-16T11:42:37.480334Z" + } + }, + "cell_type": "code", + "source": "pprint(nace_nb_dict)", + "id": "7ba86c86b71686f9", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'20.14': {'description': 'Produksjon av andre organiske kjemiske råvarer\\n'\n", + " 'Omfatter: Produksjon av kjemikalier ved bruk av '\n", + " 'grunnleggende prosesser, f.eks. termisk spalting '\n", + " 'cracking, destillasjon. Produktene som fremkommer '\n", + " 'ved disse prosessene er vanligvis separate '\n", + " 'kjemiskdefinerte organiske forbindelser. Produksjon '\n", + " 'av organiske basiskjemikalier; asykliske '\n", + " 'hydrokarboner, mettede og umettede, sykliske '\n", + " 'hydrokarboner, mettede og umettede, asykliske og '\n", + " 'sykliske alkoholer, mono- og polykarboksylsyrer, '\n", + " 'inkludert eddiksyre, andre forbindelser med oksygen '\n", + " 'i den funksjonelle gruppen, inkludert aldehyder, '\n", + " 'ketoner, kinoner og forbindelser med oksygen i '\n", + " 'flere forskjellige funksjonelle grupper, syntetisk '\n", + " 'glyserol, organiske forbindelser med nitrogen i den '\n", + " 'funksjonelle gruppen, inkludert aminer, gjæring av '\n", + " 'sukkerrør, mais eller lignende for å produsere '\n", + " 'alkohol og estere, andre organiske forbindelser, '\n", + " 'inkludert destillasjonsprodukter av tre f.eks. '\n", + " 'trekull osv. Produksjon av syntetiske aromatiske '\n", + " 'produkter. 
Produksjon av metan, ikke for forsyning '\n", + " 'som gassformig brensel gjennom et permanent '\n", + " 'forsyningsnett. Destillasjon av kulltjære. '\n", + " 'Produksjon av etylalkohol, annet enn som drivstoff '\n", + " 'av bioetanol. Produksjon av kjølegasser. '\n", + " 'Ekskluderer: Utvinning av naturgass som '\n", + " 'hovedsakelig består av metan grupperes under. '\n", + " 'Utvinning av naturgass. Produksjon av plast i '\n", + " 'primærform grupperes under. Produksjon av '\n", + " 'basisplast. Produksjon av syntetisk gummi i '\n", + " 'primærform grupperes under. Produksjon av syntetisk '\n", + " 'gummi. Produksjon av rå glyserol grupperes under. '\n", + " 'Produksjon av såpe og vaske-, rense- og '\n", + " 'poleringsmidler. Produksjon av flytende '\n", + " 'biodrivstoff, f.eks. drivstoff av bioetanol, '\n", + " 'grupperes under. Produksjon av flytende biobrensel. '\n", + " 'Produksjon av naturlige eteriske oljer grupperes '\n", + " 'under. Produksjon av andre kjemiske produkter ikke '\n", + " 'nevnt annet sted. Produksjon av salisylsyre og '\n", + " 'O-acetylsalisylsyre grupperes under. Produksjon av '\n", + " 'farmasøytiske råvarer. Produksjon av etanol fra '\n", + " 'biomasse grupperes under. Produksjon av flytende '\n", + " 'biobrensel. Produksjon av metan for forsyning som '\n", + " 'gassformig brensel gjennom et permanent '\n", + " 'forsyningsnett grupperes under. Produksjon av gass.',\n", + " 'example': 'Produksjon av biokarbon og bioenergi. Produksjon og '\n", + " 'salg av biokarbon og bioenergi. Selskapet skal utvikle, '\n", + " 'eie og drifte denne type produksjonsanlegg på egenhånd '\n", + " 'eller i partnerskap.\\n'\n", + " 'Forskning, utvikling, produksjon, markedsføring og salg '\n", + " 'av aktive immunmodulerende ingredienser til bruk i '\n", + " 'farmasøytiske spesialpreparater, medisinsk-teknisk '\n", + " 'utstyr, helsekost, kosmetikk, functional food og '\n", + " 'fr-produkter. 
Forskning, utvikling, produksjon, '\n", + " 'markedsføring og salg av aktive immunmodulerende '\n", + " 'ingredienser til bruk i farmasøytiske '\n", + " 'spesialpreparater, medisinsk-teknisk utstyr, helsekost, '\n", + " 'kosmetikk, functional food og fr-produkter. Selskapet '\n", + " 'skal fremme utvikling av produkter og teknologisk viten '\n", + " 'innenfor sine virksomhetsområder, og skal kunne gjøre '\n", + " 'investeringer i andre selskaper innenfor disse '\n", + " 'områdene.\\n'\n", + " 'Foredling av biologisk materiale og salg av aktuelle '\n", + " 'produkter. Foredling av biologisk materiale og salg av '\n", + " 'aktuelle produkter samt annen næringsvirksomhet, '\n", + " 'herunder gjennom aksjetegning eller på annen måte delta '\n", + " 'i andre selskap eller foretak.\\n'\n", + " 'Selskapet skal drive virksomhet tilknyttet produksjon '\n", + " 'av etylen og propylen ved selskapets fabrikk på Rafnes, '\n", + " 'produksjon av hjelpeprodukter og ytelse av tjenester i '\n", + " 'tilknytning til annen industriproduksjon på Rafnes og '\n", + " 'Rønningen, deltakelse og investeringer i andre '\n", + " 'selskaper og virksomheter, samt annen beslektet '\n", + " 'virksomhet.\\n'\n", + " 'Forretningsvirksomhet med hovedvekt på utvikling og '\n", + " 'produksjon og salg av spesialisert cellulose og i '\n", + " 'tillegg en rekke andre kjemikaler, utleie av fast '\n", + " 'eiendom, og enhver virksomhet som står i naturlig '\n", + " 'forbindelse med dette. Utvikling, produksjon og salg av '\n", + " 'biokjemikalier, biobaserte spesialprodukter og andre '\n", + " 'kjemikalier, og annen virksomhet som står i naturlig '\n", + " 'forbindelse med dette.\\n'\n", + " 'Fremstilling og salg av finkjemikalier og samhør- ende '\n", + " 'produkter, med tilhørende produktutvikling og andre '\n", + " 'tilknyttede funksjoner, samt eventuell del- tagelse i '\n", + " 'andre selskaper i Norge eller utlandet innenfor det her '\n", + " 'nevnte virksomhetsområde. 
Fremstilling og salg av '\n", + " 'industrielle produkter, med tilhørende produktutvikling '\n", + " 'og andre tilknyttede funksjoner, samt eventuell '\n", + " 'deltagelse i andre selskaper i Norge eller utlandet.\\n'\n", + " 'Investere, eie og administrere industriell produksjon '\n", + " 'og infrastrukturprosjekter i Norge og andre aktiviteter '\n", + " 'som står i forbindelse med dette. Investere, eie og '\n", + " 'administrere infrastrukturprosjekter i Norge og andre '\n", + " 'aktiviteter som står i forbindelse med dette.\\n'\n", + " 'Industri- og handelsvirksomhet. Industri- og '\n", + " 'handelsvirksomhet og enhver annen virksomhet som står i '\n", + " 'forbindelse dermed, forvaltning av fast eiendom, samt '\n", + " 'deltagelse i andre selskaper uansett virksomhetstype.\\n'\n", + " 'Videreforedling og omsetning av rekeskall til '\n", + " 'industriell formål, samt annen virksomhet i denne '\n", + " 'forbindelse.\\n'\n", + " 'Utvikling, produksjon og salg av syntetiske '\n", + " 'homopolymerer, samt tjenester og produkter knyttet til '\n", + " 'slike. Utvikle, produsere og selge syntetiske '\n", + " 'homopolymerer, samt produkter og tjenester knyttet til '\n", + " 'slike, herunder lisensiering av teknologi og deltakelse '\n", + " 'i andre selskaper, kjøp, salg og pantsettelse av fast '\n", + " 'eiendom og investeringer i finansielle instrumenter.',\n", + " 'key_words': ['bioetanol',\n", + " 'aldehyder',\n", + " 'hydrokarboner,',\n", + " 'med',\n", + " 'drivstoff',\n", + " 'produsere',\n", + " 'estere',\n", + " 'gjennom',\n", + " 'produsere',\n", + " 'trekull']},\n", + " '22.21': {'description': 'Produksjon av plater, film, rør og profiler av '\n", + " 'plast\\n'\n", + " 'Omfatter: Produksjon av halvfabrikater av '\n", + " 'plastprodukter; plater, blokker, film, folie, bånd '\n", + " 'osv. Produksjon av ferdige plastprodukter; rør, '\n", + " 'sylindre og slanger; slange- og rørkoblinge,. '\n", + " 'cellofanfilm eller ark. 
Produksjon av fleksible '\n", + " 'eller stive plater og ark av cellulært polyuretan. '\n", + " 'Produksjon av plastplater, -folie og -bånd, '\n", + " 'forsterket, laminert, støttet eller på lignende '\n", + " 'måte kombinert med andre materialer. Produksjon av '\n", + " 'selvklebende plastplater, -ark, -film, -folie, '\n", + " '-tape og -bånd. Ekskluderer: Produksjon av plast i '\n", + " 'primærform grupperes under. Produksjon av '\n", + " 'basisplast. Produksjon av artikler av syntetisk '\n", + " 'eller naturlig gummi grupperes under. Produksjon av '\n", + " 'gummiprodukter. Innkapsling av metallrør med '\n", + " 'polyuretanskum og påfølgende innkapsling i et '\n", + " 'plastrør, f.eks. for damp- og '\n", + " 'varmtvannsforsyningstjenester grupperes under. '\n", + " 'Bearbeiding og etterbehandling av plastprodukter.',\n", + " 'example': 'Produksjon og omsetning av plastprodukter. Selskapets '\n", + " 'formål er produksjon og omsetning av plastprodukter og '\n", + " 'hva hermed står i forbindelse, herunder deltagelse i '\n", + " 'annen virksomhet ved aksjetegning eller på annen måte '\n", + " 'gjøre seg interessert i andre foretagender innen '\n", + " 'bransjen.\\n'\n", + " 'Produksjon av halvfabrikater i plast herunder sveising '\n", + " 'og bøying av rørbend. Handel, produksjon, '\n", + " 'tjenesteytende virksomhet, deltakelse i andre selskaper '\n", + " 'samt hermed står i forbindelse.\\n'\n", + " 'Industriell produksjon av plastprodukter. Resirkulering '\n", + " 'av tre- og plastavfall.\\n'\n", + " 'Forvaltning og utleie av eiendom og driftsutstyr samt '\n", + " 'alt som derved står i forbindelse. 
Videre skal '\n", + " 'selskapet kunne investere i fast eiendom og '\n", + " 'driftsutstyr, samt kjøpe aksjer i andre selskaper.\\n'\n", + " 'Forvaltning av eierinteresse i andre næringsdrivende '\n", + " 'selskaper, samt hva hermed står i forbindelse, herunder '\n", + " 'å yte administrative og teknologiske tjenester.\\n'\n", + " 'Produksjon og salg av plastprodukter og dertil '\n", + " 'forbundne varer, samt ved aksjetegning eller på annen '\n", + " 'måte å gjøre seg interessert i beslektet virksomhet.\\n'\n", + " 'Produksjon og handel og alt hva dermed er for- bundet, '\n", + " 'herunder deltagelse i andre selskaper.\\n'\n", + " 'Kjøp og salg av varer og tjenester, fast eiendom, '\n", + " 'aksjer samt produktutvikling. I tillegg alt som '\n", + " 'naturlig hører innunder ovennevnte.\\n'\n", + " 'Holdingselskap. Salgsselskap. Eierselskap og '\n", + " 'salgsselskap for Industriplast AS. Selskapet kan i '\n", + " 'tillegg investere i aksjer og selskapsandeler og det '\n", + " 'som faller naturlig inn under dette.\\n'\n", + " 'Utvikling, produksjon og sal av plastprodukt med det '\n", + " 'som naturleg høyrer til, her også deltaking i andre '\n", + " 'selskap.',\n", + " 'key_words': ['av',\n", + " 'plast',\n", + " 'plast',\n", + " 'av',\n", + " 'av',\n", + " 'slanger',\n", + " 'plast',\n", + " 'plast',\n", + " 'plater',\n", + " 'rørdeler']},\n", + " '25.21': {'description': 'Produksjon av radiatorer til sentralvarmeanlegg og '\n", + " 'dampgeneratorer og -kjeler\\n'\n", + " 'Omfatter: Produksjon av radiatorer til '\n", + " 'sentralvarmeanlegg som ikke er oppvarmet med '\n", + " 'elektrisitet. Produksjon av kjeler til å danne '\n", + " 'vanndamp eller annen damp. Produksjon av '\n", + " 'hjelpeanlegg for bruk sammen med dampgeneratorer, '\n", + " 'f.eks. varmevekslere, overhetere, dampsamlere og '\n", + " 'akkumulatorer. Produksjon av kondensatorer for '\n", + " 'dampgeneratorer eller andre dampproduserende '\n", + " 'enheter. 
Produksjon av kjernekraftreaktorer, '\n", + " 'unntatt isotopseparatorer. Produksjon av dampkjeler '\n", + " 'til skip og dampkjeler for elektrisitetsproduksjon. '\n", + " 'Ekskluderer: Produksjon av elektriske ovner og '\n", + " 'elektriske varmtvannsberedere grupperes under. '\n", + " 'Produksjon av elektriske husholdningsmaskiner og '\n", + " '-apparater. Produksjon av dampturbinanlegg '\n", + " 'grupperes under. Produksjon av motorer og turbiner, '\n", + " 'unntatt motorer til luftfartøy og motorvogner. '\n", + " 'Produksjon av utstyr for oppvarming av '\n", + " 'husholdninger grupperes under. Produksjon av ovner '\n", + " 'og permanente varmeapparater til bruk i '\n", + " 'husholdninger. Produksjon av isotopseparatorer '\n", + " 'grupperes under. Produksjon av andre '\n", + " 'spesialmaskiner ikke nevnt annet sted.',\n", + " 'example': 'Engineering, produksjon og salg av varmekjeler. '\n", + " 'Hovedsaklig tilbud av servicetjenester særlig knyttet '\n", + " 'til produksjonsvirksomhet, montasje, engineering, '\n", + " 'handel samt deltakelse i andre selskaper.\\n'\n", + " 'Hovedsaklig tilbud av servicetjenester i tilknytning '\n", + " 'til produksjonsvirksomhet, montasje, engineering, '\n", + " 'handel samt deltakelse i andre selskap. 
Hovedsaklig '\n", + " 'tilbud av servicetjenester særlig i tilknytning til '\n", + " 'produksjonsvirksomhet, montasje, engineering, handel '\n", + " 'samt deltakelse i andre selskap.\\n'\n", + " 'Produksjon, handel og salg av mekaniske varer og '\n", + " 'tjenester og alt som hermed står i forbindelse samt å '\n", + " 'stablere virksomhet som har dette formål.\\n'\n", + " 'Produksjon, sveise og mekanisk verksted, investering, '\n", + " 'eie fast eiendom, utleie og annen virksomhet som '\n", + " 'naturlig hører inn under det samme.\\n'\n", + " 'Eie aksjer og investere i annen virksomhet av enhver '\n", + " 'art, samt andre aktiviteter som står i naturlig '\n", + " 'forbindelse med dette.\\n'\n", + " 'Handel med og produksjon av utstyr for mekanisk '\n", + " 'industri, fiskeforedlingsvirksomhet og annen '\n", + " 'produksjonsvirksomhet. Selskapet kan som aksjonær eller '\n", + " 'interessent på annen måte delta i andre selskaper med '\n", + " 'økonomisk virksomhet som formål.\\n'\n", + " 'Produsere og montere containerbaserte fyringsanlegg og '\n", + " 'siloer for pellets og andre produkter som står i '\n", + " 'naturlig tilknytning til dette, samt delta på eiersiden '\n", + " 'i andre selskap.',\n", + " 'key_words': ['kraftkjeler',\n", + " 'damp-',\n", + " 'for',\n", + " 'radiatorer',\n", + " 'dampakkumulatorer',\n", + " 'er',\n", + " 'som',\n", + " 'med',\n", + " 'ikke',\n", + " 'og']}}\n" + ] + } + ], + "execution_count": 29 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Customize Your Strategy\n", + "\n", + "`NotesProcess` is designed so that advanced users only need to override one method:\n", + "\n", + "`preprocess_text(text: str) -> str`" + ], + "id": "647810e39d660533" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:42:41.486229Z", + "start_time": "2026-02-16T11:42:41.482060Z" + } + }, + "cell_type": "code", + "source": [ + "from process_notes import NotesProcess\n", + "\n", + "class 
MyStandardProcessor(NotesProcess):\n", + " @staticmethod\n", + " def preprocess_text(text: str) -> str:\n", + " text = text.lower()\n", + " text = re.sub(r\"\\s+\", \" \", text).strip()\n", + " return text" + ], + "id": "62b1cf959c17cfe9", + "outputs": [], + "execution_count": 30 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:42:44.289199Z", + "start_time": "2026-02-16T11:42:44.279479Z" + } + }, + "cell_type": "code", + "source": [ + "notes_df = pd.DataFrame({\n", + " \"code\": [\"A\", \"B\"],\n", + " \"title\": [\"Agriculture\", \"Mining\"],\n", + " \"notes\": [\"Explanatory note A...\", \"Explanatory note B...\"]\n", + "})\n", + "\n", + "examples_df = pd.DataFrame({\n", + " \"code\": [\"A\", \"A\", \"B\"],\n", + " \"example\": [\"Crop farming\", \"Animal breeding\", \"Coal extraction\"],\n", + " \"keywords\": [\"farm crops\", \"cattle breeding\", \"coal mine\"]\n", + "})\n", + "\n", + "processor = MyStandardProcessor(\n", + " notes_df=notes_df,\n", + " examples_df=examples_df,\n", + " n_examples=10 # Maximum number of examples and keywords to extract from examples_df\n", + ")\n", + "\n", + "notes_dict = processor.get_dict_data(\n", + " col_notes_c=\"code\",\n", + " col_notes_d_l=[\"title\", \"notes\"],\n", + " col_eg_c=\"code\", # Column in examples_df containing the code\n", + " col_eg_d=\"example\", # Column in examples_df containing the examples\n", + " col_eg_k=\"keywords\" # Column in examples_df containing the keywords\n", + ")" + ], + "id": "b38469be0b7a88cb", + "outputs": [], + "execution_count": 31 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:42:46.524567Z", + "start_time": "2026-02-16T11:42:46.520418Z" + } + }, + "cell_type": "code", + "source": "pprint(notes_dict)", + "id": "450ac6756c219949", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'A': {'description': 'agriculture explanatory note a...',\n", + " 'example': 'crop farming animal breeding',\n", + " 
'key_words': ['cattle', 'farm', 'breeding', 'crops']},\n", + " 'B': {'description': 'mining explanatory note b...',\n", + " 'example': 'coal extraction',\n", + " 'key_words': ['mine', 'coal']}}\n" + ] + } + ], + "execution_count": 32 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Generate Synthetic Data for Your Classification\n", + "\n", + "This class encapsulates the full pipeline for generating synthetic data following our strategy. It is designed to receive a data dictionary (such as one created by `ProcessNACENB`) and iteratively query a model via the API using `PromptSyntDataMaker`.\n", + "\n", + "It is optimized to work with our templates and can be adapted to any classification task.\n", + "\n" + ], + "id": "b2aad2637b23ccf2" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:46:17.787607Z", + "start_time": "2026-02-16T11:46:17.784468Z" + } + }, + "cell_type": "code", + "source": [ + "model = \"gpt-oss:120b\"\n", + "YOUR_API_KEY = \"\"#" + ], + "id": "ffa711ace23bcc46", + "outputs": [], + "execution_count": 34 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T11:46:23.432844Z", + "start_time": "2026-02-16T11:46:19.840037Z" + } + }, + "cell_type": "code", + "source": "from synt_data_maker import OnyxiaSyntDataGenerator", + "id": "b1d25be7802754d3", + "outputs": [], + "execution_count": 35 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T12:28:53.271563Z", + "start_time": "2026-02-16T12:28:53.228351Z" + } + }, + "cell_type": "code", + "source": [ + "generator = OnyxiaSyntDataGenerator(\n", + " root=root, # Project root (must contain the ./input/ directory with all .json templates)\n", + " api_key=YOUR_API_KEY, # Onyxia API key\n", + " model=model, # Available Onyxia model name\n", + " label_contents_map_dict=nace_nb_dict, # Output from\n", + "\n", + " body_intro=\"You are an expert in statistical classification.\", # Free text\n", + " task=\"generate\", # Task type: 
generate (create new data from notes), process (extract examples from text), combine (generate examples using notes and keywords)\n", + " n_responses='10', # Number of generated examples per label\n", + "\n", + " p_lang=\"en\", # Prompt template language (en or sp)\n", + " o_lang=\"nb\", # Requested output language (en, sp, or nb)\n", + " output_format=\"lines\", # Output format: examples separated by line breaks\n", + " len_constraint=\"medium\", # Output length constraint (short, medium, or long)\n", + "\n", + " eg_strategy=\"dynamic\", # Example strategy: static, dynamic, or mixed\n", + " eg_filename=\"static_examples.json\" # Filename for static examples (if used)\n", + ")" + ], + "id": "d77cdd67df3dbeaa", + "outputs": [], + "execution_count": 41 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "`eg_strategy` tells the class how the example slot in the prompt template must be filled.\n", + "- **dynamic**: Include only the examples found in `label_contents_map_dict`. If there are none, include nothing.\n", + "- **static**: Load the same example for each task from `static_examples.json` (the file given by `eg_filename`).\n", + "- **mixed**: Take an organic example if possible; otherwise take the example from the prompt template." 
+ ], + "id": "ba2013ed749d061b" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T12:29:23.626626Z", + "start_time": "2026-02-16T12:29:00.013918Z" + } + }, + "cell_type": "code", + "source": "df = generator.get_synt_data_df()", + "id": "cea80669bb4778db", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-02-16 13:29:00,019 - INFO - Setting api responses...\n", + "100%|██████████| 3/3 [00:23<00:00, 7.86s/it]\n", + "2026-02-16 13:29:23,597 - INFO - Getting response df...\n", + "100%|██████████| 3/3 [00:00<?, ?it/s]\n" + ] + } + ], + "execution_count": 42 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-16T12:29:28.946698Z", + "start_time": "2026-02-16T12:29:28.887115Z" + } + }, + "cell_type": "code", + "source": "df", + "id": "1a1512f86c2c6a7e", + "outputs": [ + { + "data": { + "text/plain": [ + " label literal\n", + "0 22.21 Produksjon av polypropylenplater, laminert med...\n", + "1 22.21 Fabrikk som lager fleksibel PVC-film til embal...\n", + "2 22.21 Produksjon av PVC-rør og fittings til vanninst...\n", + "3 22.21 Fremstilling av selvklebende polyetylenfilm fo...\n", + "4 22.21 Produksjon av UV-bestandig polykarbonatplater ...\n", + "5 22.21 Produksjon av polyuretan-slaber og -ark for is...\n", + "6 22.21 Produksjon av fleksible gummifrie slanger av n...\n", + "7 22.21 Produksjon av laminert PET-film med metalliser...\n", + "8 22.21 Produksjon av stålforsterkede plastprofiler fo...\n", + "9 22.21 Produksjon av plastbånd med klebende overflate...\n", + "10 25.21 Produksjon og montering av stålradiatorer til ...\n", + "11 25.21 Fremstilling av høyeffektiv dampkjeler til kra...\n", + "12 25.21 Produksjon av varmevekslere for kombinerte dam...\n", + "13 25.21 Montering av overhetere til dampgeneratorer i ...\n", + "14 25.21 Fabrikasjon av dampsamlere og trykkakkumulator...\n", + "15 25.21 Produksjon av kondensatorer til stasjonære dam...\n", + "16 25.21 Stålproduksjon av radiatorer for store 
byvarme...\n", + "17 25.21 Design og bygging av industrielle dampkjeler f...\n", + "18 25.21 Produksjon av integrerte varmesystemer med rad...\n", + "19 25.21 Fremstilling av korrosjonsbestandige overheter...\n", + "20 20.14 Produksjon av metanol ved dampreformering av n...\n", + "21 20.14 Destillasjon og raffinement av propan fra petr...\n", + "22 20.14 Fremstilling av aceton gjennom oksidativ dehyd...\n", + "23 20.14 Produksjon av tereftalsyre via oksidativ krakk...\n", + "24 20.14 Syntese av etylenglykol ved hydrolyse av etyle...\n", + "25 20.14 Fremstilling av benzosyre fra benzen gjennom n...\n", + "26 20.14 Produksjon av anisole ved alkylasjon av fenol ...\n", + "27 20.14 Destillasjon av trekulltjære for å isolere phe...\n", + "28 20.14 Fremstilling av kapronsyre fra butadien via ok...\n", + "29 20.14 Produksjon av glykoler ved katalytisk hydrogen..." + ], + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>label</th>\n", + " <th>literal</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>22.21</td>\n", + " <td>Produksjon av polypropylenplater, laminert med...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>22.21</td>\n", + " <td>Fabrikk som lager fleksibel PVC-film til embal...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>22.21</td>\n", + " <td>Produksjon av PVC-rør og fittings til vanninst...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>22.21</td>\n", + " <td>Fremstilling av selvklebende polyetylenfilm fo...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " 
<td>22.21</td>\n", + " <td>Produksjon av UV-bestandig polykarbonatplater ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>22.21</td>\n", + " <td>Produksjon av polyuretan-slaber og -ark for is...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>22.21</td>\n", + " <td>Produksjon av fleksible gummifrie slanger av n...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>22.21</td>\n", + " <td>Produksjon av laminert PET-film med metalliser...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>22.21</td>\n", + " <td>Produksjon av stålforsterkede plastprofiler fo...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>22.21</td>\n", + " <td>Produksjon av plastbånd med klebende overflate...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>25.21</td>\n", + " <td>Produksjon og montering av stålradiatorer til ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>25.21</td>\n", + " <td>Fremstilling av høyeffektiv dampkjeler til kra...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>25.21</td>\n", + " <td>Produksjon av varmevekslere for kombinerte dam...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>25.21</td>\n", + " <td>Montering av overhetere til dampgeneratorer i ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>25.21</td>\n", + " <td>Fabrikasjon av dampsamlere og trykkakkumulator...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>25.21</td>\n", + " <td>Produksjon av kondensatorer til stasjonære dam...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>25.21</td>\n", + " <td>Stålproduksjon av radiatorer for store byvarme...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>25.21</td>\n", + " <td>Design og bygging av industrielle dampkjeler f...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>25.21</td>\n", + " <td>Produksjon av integrerte varmesystemer med rad...</td>\n", 
+ " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>25.21</td>\n", + " <td>Fremstilling av korrosjonsbestandige overheter...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>20.14</td>\n", + " <td>Produksjon av metanol ved dampreformering av n...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>20.14</td>\n", + " <td>Destillasjon og raffinement av propan fra petr...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>20.14</td>\n", + " <td>Fremstilling av aceton gjennom oksidativ dehyd...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>20.14</td>\n", + " <td>Produksjon av tereftalsyre via oksidativ krakk...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>20.14</td>\n", + " <td>Syntese av etylenglykol ved hydrolyse av etyle...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>20.14</td>\n", + " <td>Fremstilling av benzosyre fra benzen gjennom n...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>20.14</td>\n", + " <td>Produksjon av anisole ved alkylasjon av fenol ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>20.14</td>\n", + " <td>Destillasjon av trekulltjære for å isolere phe...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>20.14</td>\n", + " <td>Fremstilling av kapronsyre fra butadien via ok...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>20.14</td>\n", + " <td>Produksjon av glykoler ved katalytisk hydrogen...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 43 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
"python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}