From 251e610f0e577db58734e02a2f5717430d01d779 Mon Sep 17 00:00:00 2001 From: henryruhs Date: Fri, 21 Feb 2025 09:07:58 +0100 Subject: [PATCH] Follow the lightning naming and call this dataset, Improve config and types --- embedding_converter/README.md | 2 +- embedding_converter/config.ini | 2 +- .../src/{data_loader.py => dataset.py} | 5 +++-- embedding_converter/src/training.py | 18 +++++++++--------- embedding_converter/src/types.py | 4 +++- 5 files changed, 17 insertions(+), 14 deletions(-) rename embedding_converter/src/{data_loader.py => dataset.py} (93%) diff --git a/embedding_converter/README.md b/embedding_converter/README.md index 9c1dbc6..addc7b3 100644 --- a/embedding_converter/README.md +++ b/embedding_converter/README.md @@ -27,7 +27,7 @@ This `config.ini` utilizes the MegaFace dataset to train the Embedding Converter ``` [training.dataset] -dataset_file_pattern = .datasets/images/{}/*.*g +file_pattern = .datasets/images/{}/*.*g ``` ``` diff --git a/embedding_converter/config.ini b/embedding_converter/config.ini index 62504ef..88e01a1 100644 --- a/embedding_converter/config.ini +++ b/embedding_converter/config.ini @@ -1,5 +1,5 @@ [training.dataset] -dataset_file_pattern = +file_pattern = [training.loader] split_ratio = diff --git a/embedding_converter/src/data_loader.py b/embedding_converter/src/dataset.py similarity index 93% rename from embedding_converter/src/data_loader.py rename to embedding_converter/src/dataset.py index 55eeed8..4aae867 100644 --- a/embedding_converter/src/data_loader.py +++ b/embedding_converter/src/dataset.py @@ -2,14 +2,15 @@ import glob import random import cv2 -import torch +from torch import Tensor + from torch.utils.data import Dataset from torchvision import transforms from .types import Batch -class DataLoaderRecognition(Dataset[torch.Tensor]): +class DynamicDataset(Dataset[Tensor]): def __init__(self, dataset_file_pattern : str) -> None: self.image_paths = glob.glob(dataset_file_pattern) self.transforms = self.compose_transforms() diff --git a/embedding_converter/src/training.py b/embedding_converter/src/training.py index 2decc64..2f90890 100644 --- a/embedding_converter/src/training.py +++ b/embedding_converter/src/training.py @@ -1,6 +1,6 @@ import configparser import os -from typing import Any, Tuple +from typing import Tuple import lightning import torch @@ -11,9 +11,9 @@ from lightning.pytorch.tuner import Tuner from torch import Tensor, nn from torch.utils.data import DataLoader, Dataset, random_split -from .data_loader import DataLoaderRecognition +from .dataset import DynamicDataset from .models.embedding_converter import EmbeddingConverter -from .types import Batch, Embedding +from .types import Batch, Embedding, OptimizerConfig CONFIG = configparser.ConfigParser() CONFIG.read('config.ini') @@ -53,7 +53,7 @@ class EmbeddingConverterTrainer(lightning.LightningModule): self.log('validation', validation, prog_bar = True) return validation - def configure_optimizers(self) -> Any: + def configure_optimizers(self) -> OptimizerConfig: learning_rate = CONFIG.getfloat('training.trainer', 'learning_rate') optimizer = torch.optim.Adam(self.parameters(), lr = learning_rate) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer) @@ -71,17 +71,17 @@ class EmbeddingConverterTrainer(lightning.LightningModule): } -def create_loaders(dataset : Dataset[Any]) -> Tuple[DataLoader[Any], DataLoader[Any]]: +def create_loaders(dataset : Dataset[Tensor]) -> Tuple[DataLoader[Tensor], DataLoader[Tensor]]: batch_size = CONFIG.getint('training.loader', 'batch_size') num_workers = CONFIG.getint('training.loader', 'num_workers') training_dataset, validate_dataset = split_dataset(dataset) training_loader = DataLoader(training_dataset, batch_size = batch_size, shuffle = True, num_workers = num_workers, drop_last = True, pin_memory = True, persistent_workers = True) - validation_loader = DataLoader(validate_dataset, batch_size = batch_size, shuffle = False, num_workers = num_workers, drop_last = False, pin_memory = True, persistent_workers = True) + validation_loader = DataLoader(validate_dataset, batch_size = batch_size, shuffle = False, num_workers = num_workers, pin_memory = True, persistent_workers = True) return training_loader, validation_loader -def split_dataset(dataset : Dataset[Any]) -> Tuple[Dataset[Any], Dataset[Any]]: +def split_dataset(dataset : Dataset[Tensor]) -> Tuple[Dataset[Tensor], Dataset[Tensor]]: loader_split_ratio = CONFIG.getfloat('training.loader', 'split_ratio') training_size = int(loader_split_ratio * len(dataset)) # type:ignore[operator, arg-type] validation_size = len(dataset) - training_size # type:ignore[arg-type] @@ -115,10 +115,10 @@ def create_trainer() -> Trainer: def train() -> None: - dataset_file_pattern = CONFIG.get('training.dataset', 'image_pattern') + dataset_file_pattern = CONFIG.get('training.dataset', 'file_pattern') resume_file_path = CONFIG.get('training.output', 'resume_file_path') - dataset = DataLoaderRecognition(dataset_file_pattern) + dataset = DynamicDataset(dataset_file_pattern) training_loader, validation_loader = create_loaders(dataset) embedding_converter_trainer = EmbeddingConverterTrainer() trainer = create_trainer() diff --git a/embedding_converter/src/types.py b/embedding_converter/src/types.py index 73fcd3a..62db931 100644 --- a/embedding_converter/src/types.py +++ b/embedding_converter/src/types.py @@ -1,4 +1,4 @@ -from typing import Any, TypeAlias +from typing import Any, Dict, TypeAlias from numpy.typing import NDArray from torch import Tensor @@ -6,3 +6,5 @@ from torch import Tensor Batch : TypeAlias = Tensor Embedding : TypeAlias = Tensor VisionFrame : TypeAlias = NDArray[Any] + +OptimizerConfig : TypeAlias = Dict[str, Any]