Mirror of https://github.com/huggingface/transformers.git
Update examples with image processors (#21155)
* Update examples to use image processors
* Small fixes
* Resolve conflicts
This commit is contained in:
parent fc8a93507c
commit 4bc18e7a83
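
The diff below systematically swaps the example scripts' `AutoFeatureExtractor`/`*FeatureExtractor` usage for the image-processor equivalents. As a minimal sketch of the preprocessing call the examples end up with (the checkpoint name is only illustrative, taken from the contrastive-image-text README changed further down; it is not part of this commit):

```python
# Minimal sketch of the pattern this commit moves the examples to:
# AutoImageProcessor replaces AutoFeatureExtractor for image preprocessing.
from PIL import Image

from transformers import AutoImageProcessor

# Illustrative checkpoint only (it appears in the README hunk below); any vision
# checkpoint with a preprocessor config should behave the same way.
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.new("RGB", (224, 224))  # placeholder image instead of a real file
inputs = image_processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # e.g. torch.Size([1, 3, 224, 224])
```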
@@ -22,13 +22,7 @@ The cross-attention will be randomly initialized.
 from dataclasses import dataclass, field
 from typing import Optional
 
-from transformers import (
-    AutoConfig,
-    AutoFeatureExtractor,
-    AutoTokenizer,
-    FlaxVisionEncoderDecoderModel,
-    HfArgumentParser,
-)
+from transformers import AutoConfig, AutoImageProcessor, AutoTokenizer, FlaxVisionEncoderDecoderModel, HfArgumentParser
 
 
 @dataclass
@@ -108,13 +102,13 @@ def main():
     model.config.decoder_start_token_id = decoder_start_token_id
     model.config.pad_token_id = pad_token_id
 
-    feature_extractor = AutoFeatureExtractor.from_pretrained(model_args.encoder_model_name_or_path)
+    image_processor = AutoImageProcessor.from_pretrained(model_args.encoder_model_name_or_path)
 
     tokenizer = AutoTokenizer.from_pretrained(model_args.decoder_model_name_or_path)
     tokenizer.pad_token = tokenizer.convert_ids_to_tokens(model.config.pad_token_id)
 
     model.save_pretrained(model_args.output_dir)
-    feature_extractor.save_pretrained(model_args.output_dir)
+    image_processor.save_pretrained(model_args.output_dir)
     tokenizer.save_pretrained(model_args.output_dir)
 
 
@@ -47,7 +47,7 @@ from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository, create_repo
 from transformers import (
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoTokenizer,
     FlaxVisionEncoderDecoderModel,
     HfArgumentParser,
@@ -106,12 +106,12 @@ class TrainingArguments:
         default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
     )
     _block_size_doc = """
-    The default value `0` will preprocess (tokenization + feature extraction) the whole dataset before training and
+    The default value `0` will preprocess (tokenization + image processing) the whole dataset before training and
     cache the results. This uses more disk space, but avoids (repeated) processing time during training. This is a
     good option if your disk space is large enough to store the whole processed dataset.
     If a positive value is given, the captions in the dataset will be tokenized before training and the results are
     cached. During training, it iterates the dataset in chunks of size `block_size`. On each block, images are
-    transformed by the feature extractor with the results being kept in memory (no cache), and batches of size
+    transformed by the image processor with the results being kept in memory (no cache), and batches of size
     `batch_size` are yielded before processing the next block. This could avoid the heavy disk usage when the
     dataset is large.
     """
@@ -477,7 +477,7 @@ def main():
         dtype=getattr(jnp, model_args.dtype),
         use_auth_token=True if model_args.use_auth_token else None,
     )
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
+    image_processor = AutoImageProcessor.from_pretrained(
         model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_auth_token=True if model_args.use_auth_token else None,
@@ -546,7 +546,7 @@ def main():
         for image_file in examples[image_column]:
             try:
                 image = Image.open(image_file)
-                feature_extractor(images=image, return_tensors="np")
+                image_processor(images=image, return_tensors="np")
                 bools.append(True)
             except Exception:
                 bools.append(False)
@@ -582,9 +582,9 @@ def main():
 
         return model_inputs
 
-    def feature_extraction_fn(examples, check_image=True):
+    def image_processing_fn(examples, check_image=True):
         """
-        Run feature extraction on images
+        Run preprocessing on images
 
         If `check_image` is `True`, the examples that fails during `Image.open()` will be caught and discarded.
         Otherwise, an exception will be thrown.
@@ -609,18 +609,18 @@ def main():
         else:
             images = [Image.open(image_file) for image_file in examples[image_column]]
 
-        encoder_inputs = feature_extractor(images=images, return_tensors="np")
+        encoder_inputs = image_processor(images=images, return_tensors="np")
         model_inputs["pixel_values"] = encoder_inputs.pixel_values
 
         return model_inputs
 
     def preprocess_fn(examples, max_target_length, check_image=True):
-        """Run tokenization + image feature extraction"""
+        """Run tokenization + image processing"""
 
         model_inputs = {}
         # This contains image path column
         model_inputs.update(tokenization_fn(examples, max_target_length))
-        model_inputs.update(feature_extraction_fn(model_inputs, check_image=check_image))
+        model_inputs.update(image_processing_fn(model_inputs, check_image=check_image))
         # Remove image path column
         model_inputs.pop(image_column)
 
@@ -644,15 +644,15 @@ def main():
         }
     )
 
-    # If `block_size` is `0`, tokenization & image feature extraction is done at the beginning
-    run_feat_ext_at_beginning = training_args.block_size == 0
+    # If `block_size` is `0`, tokenization & image processing is done at the beginning
+    run_img_proc_at_beginning = training_args.block_size == 0
     # Used in .map() below
-    function_kwarg = preprocess_fn if run_feat_ext_at_beginning else tokenization_fn
+    function_kwarg = preprocess_fn if run_img_proc_at_beginning else tokenization_fn
     # `features` is used only for the final preprocessed dataset (for the performance purpose).
-    features_kwarg = features if run_feat_ext_at_beginning else None
-    # Keep `image_column` if the feature extraction is done during training
-    remove_columns_kwarg = [x for x in column_names if x != image_column or run_feat_ext_at_beginning]
-    processor_names = "tokenizer and feature extractor" if run_feat_ext_at_beginning else "tokenizer"
+    features_kwarg = features if run_img_proc_at_beginning else None
+    # Keep `image_column` if the image processing is done during training
+    remove_columns_kwarg = [x for x in column_names if x != image_column or run_img_proc_at_beginning]
+    processor_names = "tokenizer and image processor" if run_img_proc_at_beginning else "tokenizer"
 
     # Store some constant
     train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
@@ -671,9 +671,9 @@ def main():
             max_train_samples = min(len(train_dataset), data_args.max_train_samples)
             train_dataset = train_dataset.select(range(max_train_samples))
         # remove problematic examples
-        # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
+        # (if image processing is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
-        if not run_feat_ext_at_beginning:
+        if not run_img_proc_at_beginning:
             train_dataset = train_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
         train_dataset = train_dataset.map(
             function=function_kwarg,
@@ -686,7 +686,7 @@ def main():
             fn_kwargs={"max_target_length": data_args.max_target_length},
             features=features_kwarg,
         )
-        if run_feat_ext_at_beginning:
+        if run_img_proc_at_beginning:
             # set format (for performance) since the dataset is ready to be used
             train_dataset = train_dataset.with_format("numpy")
 
@@ -705,9 +705,9 @@ def main():
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))
        # remove problematic examples
-        # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
+        # (if image processing is performed at the beginning, the filtering is done during preprocessing below
        # instead here.)
-        if not run_feat_ext_at_beginning:
+        if not run_img_proc_at_beginning:
            eval_dataset = eval_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
        eval_dataset = eval_dataset.map(
            function=function_kwarg,
@@ -720,7 +720,7 @@ def main():
            fn_kwargs={"max_target_length": data_args.val_max_target_length},
            features=features_kwarg,
        )
-        if run_feat_ext_at_beginning:
+        if run_img_proc_at_beginning:
            # set format (for performance) since the dataset is ready to be used
            eval_dataset = eval_dataset.with_format("numpy")
 
@@ -735,9 +735,9 @@ def main():
            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
            predict_dataset = predict_dataset.select(range(max_predict_samples))
        # remove problematic examples
-        # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
+        # (if image processing is performed at the beginning, the filtering is done during preprocessing below
        # instead here.)
-        if not run_feat_ext_at_beginning:
+        if not run_img_proc_at_beginning:
            predict_dataset = predict_dataset.filter(
                filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers
            )
@@ -752,7 +752,7 @@ def main():
            fn_kwargs={"max_target_length": data_args.val_max_target_length},
            features=features_kwarg,
        )
-        if run_feat_ext_at_beginning:
+        if run_img_proc_at_beginning:
            # set format (for performance) since the dataset is ready to be used
            predict_dataset = predict_dataset.with_format("numpy")
 
@@ -771,8 +771,8 @@ def main():
         """
         Wrap the simple `data_loader` in a block-wise way if `block_size` > 0, else it's the same as `data_loader`.
 
-        If `block_size` > 0, it requires `ds` to have a column that gives image paths in order to perform image feature
-        extraction (with the column name being specified by `image_column`). The tokenization should be done before
+        If `block_size` > 0, it requires `ds` to have a column that gives image paths in order to perform image
+        processing (with the column name being specified by `image_column`). The tokenization should be done before
         training in this case.
         """
 
@@ -804,7 +804,7 @@ def main():
             _ds = ds.select(selected_indices)
 
             _ds = _ds.map(
-                feature_extraction_fn,
+                image_processing_fn,
                 batched=True,
                 num_proc=data_args.preprocessing_num_workers,
                 remove_columns=[image_column],
@@ -813,7 +813,7 @@ def main():
                 keep_in_memory=keep_in_memory,
                 # The images are already checked either in `.filter()` or in `preprocess_fn()`
                 fn_kwargs={"check_image": False},
-                desc=f"Running feature extraction on {split} dataset".replace(" ", " "),
+                desc=f"Running image processing on {split} dataset".replace(" ", " "),
             )
             _ds = _ds.with_format("numpy")
 
@@ -52,15 +52,15 @@ ds = datasets.load_dataset("ydshieh/coco_dataset_script", "2017", data_dir=COCO_
 
 ### Create a model from a vision encoder model and a text decoder model
 Next, we create a [VisionTextDualEncoderModel](https://huggingface.co/docs/transformers/model_doc/vision-text-dual-encoder#visiontextdualencoder).
 The `VisionTextDualEncoderModel` class let's you load any vision and text encoder model to create a dual encoder.
 Here is an example of how to load the model using pre-trained vision and text models.
 
 ```python3
 from transformers import (
     VisionTextDualEncoderModel,
     VisionTextDualEncoderProcessor,
     AutoTokenizer,
-    AutoFeatureExtractor
+    AutoImageProcessor
 )
 
 model = VisionTextDualEncoderModel.from_vision_text_pretrained(
@@ -68,8 +68,8 @@ model = VisionTextDualEncoderModel.from_vision_text_pretrained(
 )
 
 tokenizer = AutoTokenizer.from_pretrained("roberta-base")
-feat_ext = AutoFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")
-processor = VisionTextDualEncoderProcessor(feat_ext, tokenizer)
+image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
+processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
 
 # save the model and processor
 model.save_pretrained("clip-roberta")
@@ -38,7 +38,7 @@ from torchvision.transforms.functional import InterpolationMode
 
 import transformers
 from transformers import (
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModel,
     AutoTokenizer,
     HfArgumentParser,
@@ -74,7 +74,7 @@ class ModelArguments:
     tokenizer_name: Optional[str] = field(
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
     cache_dir: Optional[str] = field(
         default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
     )
@@ -308,7 +308,7 @@ def main():
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
 
-    # 5. Load pretrained model, tokenizer, and feature extractor
+    # 5. Load pretrained model, tokenizer, and image processor
     if model_args.tokenizer_name:
         tokenizer = AutoTokenizer.from_pretrained(
             model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
@@ -323,9 +323,9 @@ def main():
             "You can do it from another script, save it, and load it from here, using --tokenizer_name."
         )
 
-    # Load feature_extractor, in this script we only use this to get the mean and std for normalization.
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name or model_args.model_name_or_path,
+    # Load image_processor, in this script we only use this to get the mean and std for normalization.
+    image_processor = AutoImageProcessor.from_pretrained(
+        model_args.image_processor_name or model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
@@ -386,7 +386,7 @@ def main():
     # 7. Preprocessing the datasets.
     # Initialize torchvision transforms and jit it for faster processing.
     image_transformations = Transform(
-        config.vision_config.image_size, feature_extractor.image_mean, feature_extractor.image_std
+        config.vision_config.image_size, image_processor.image_mean, image_processor.image_std
     )
     image_transformations = torch.jit.script(image_transformations)
 
@@ -38,7 +38,7 @@ import transformers
 from transformers import (
     MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
     AutoConfig,
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModelForImageClassification,
     HfArgumentParser,
     Trainer,
@@ -141,7 +141,7 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
     use_auth_token: bool = field(
         default=False,
         metadata={
@@ -283,19 +283,19 @@ def main():
         use_auth_token=True if model_args.use_auth_token else None,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
     )
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name or model_args.model_name_or_path,
+    image_processor = AutoImageProcessor.from_pretrained(
+        model_args.image_processor_name or model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
 
     # Define torchvision transforms to be applied to each image.
-    if "shortest_edge" in feature_extractor.size:
-        size = feature_extractor.size["shortest_edge"]
+    if "shortest_edge" in image_processor.size:
+        size = image_processor.size["shortest_edge"]
     else:
-        size = (feature_extractor.size["height"], feature_extractor.size["width"])
-    normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+        size = (image_processor.size["height"], image_processor.size["width"])
+    normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
     _train_transforms = Compose(
         [
             RandomResizedCrop(size),
@@ -352,7 +352,7 @@ def main():
         train_dataset=dataset["train"] if training_args.do_train else None,
         eval_dataset=dataset["validation"] if training_args.do_eval else None,
         compute_metrics=compute_metrics,
-        tokenizer=feature_extractor,
+        tokenizer=image_processor,
         data_collator=collate_fn,
     )
 
@@ -41,13 +41,7 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
 from huggingface_hub import Repository, create_repo
-from transformers import (
-    AutoConfig,
-    AutoFeatureExtractor,
-    AutoModelForImageClassification,
-    SchedulerType,
-    get_scheduler,
-)
+from transformers import AutoConfig, AutoImageProcessor, AutoModelForImageClassification, SchedulerType, get_scheduler
 from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
 from transformers.utils.versions import require_version
 
@@ -294,7 +288,7 @@ def main():
     label2id = {label: str(i) for i, label in enumerate(labels)}
     id2label = {str(i): label for i, label in enumerate(labels)}
 
-    # Load pretrained model and feature extractor
+    # Load pretrained model and image processor
     #
     # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
     # download model & vocab.
@@ -305,7 +299,7 @@ def main():
         label2id=label2id,
         finetuning_task="image-classification",
     )
-    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_name_or_path)
+    image_processor = AutoImageProcessor.from_pretrained(args.model_name_or_path)
     model = AutoModelForImageClassification.from_pretrained(
         args.model_name_or_path,
         from_tf=bool(".ckpt" in args.model_name_or_path),
@@ -316,11 +310,11 @@ def main():
     # Preprocessing the datasets
 
     # Define torchvision transforms to be applied to each image.
-    if "shortest_edge" in feature_extractor.size:
-        size = feature_extractor.size["shortest_edge"]
+    if "shortest_edge" in image_processor.size:
+        size = image_processor.size["shortest_edge"]
     else:
-        size = (feature_extractor.size["height"], feature_extractor.size["width"])
-    normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+        size = (image_processor.size["height"], image_processor.size["width"])
+    normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
     train_transforms = Compose(
         [
             RandomResizedCrop(size),
@@ -505,7 +499,7 @@ def main():
                     save_function=accelerator.save,
                 )
                 if accelerator.is_main_process:
-                    feature_extractor.save_pretrained(args.output_dir)
+                    image_processor.save_pretrained(args.output_dir)
                     repo.push_to_hub(
                         commit_message=f"Training in progress {completed_steps} steps",
                         blocking=False,
@@ -547,7 +541,7 @@ def main():
                 args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
             )
             if accelerator.is_main_process:
-                feature_extractor.save_pretrained(args.output_dir)
+                image_processor.save_pretrained(args.output_dir)
                 repo.push_to_hub(
                     commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                 )
@@ -568,7 +562,7 @@ def main():
             args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
         )
         if accelerator.is_main_process:
-            feature_extractor.save_pretrained(args.output_dir)
+            image_processor.save_pretrained(args.output_dir)
             if args.push_to_hub:
                 repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
 
@@ -29,7 +29,7 @@ from transformers import (
     HfArgumentParser,
     Trainer,
     TrainingArguments,
-    ViTFeatureExtractor,
+    ViTImageProcessor,
     ViTMAEConfig,
     ViTMAEForPreTraining,
 )
@@ -102,7 +102,7 @@ class DataTrainingArguments:
 @dataclass
 class ModelArguments:
     """
-    Arguments pertaining to which model/config/feature extractor we are going to pre-train.
+    Arguments pertaining to which model/config/image processor we are going to pre-train.
     """
 
     model_name_or_path: str = field(
@@ -132,7 +132,7 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
     use_auth_token: bool = field(
         default=False,
         metadata={
@@ -230,7 +230,7 @@ def main():
         ds["train"] = split["train"]
         ds["validation"] = split["test"]
 
-    # Load pretrained model and feature extractor
+    # Load pretrained model and image processor
     #
     # Distributed training:
     # The .from_pretrained methods guarantee that only one local process can concurrently
@@ -260,13 +260,13 @@ def main():
         }
     )
 
-    # create feature extractor
-    if model_args.feature_extractor_name:
-        feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.feature_extractor_name, **config_kwargs)
+    # create image processor
+    if model_args.image_processor_name:
+        image_processor = ViTImageProcessor.from_pretrained(model_args.image_processor_name, **config_kwargs)
     elif model_args.model_name_or_path:
-        feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.model_name_or_path, **config_kwargs)
+        image_processor = ViTImageProcessor.from_pretrained(model_args.model_name_or_path, **config_kwargs)
     else:
-        feature_extractor = ViTFeatureExtractor()
+        image_processor = ViTImageProcessor()
 
     # create model
     if model_args.model_name_or_path:
@@ -298,17 +298,17 @@ def main():
 
     # transformations as done in original MAE paper
     # source: https://github.com/facebookresearch/mae/blob/main/main_pretrain.py
-    if "shortest_edge" in feature_extractor.size:
-        size = feature_extractor.size["shortest_edge"]
+    if "shortest_edge" in image_processor.size:
+        size = image_processor.size["shortest_edge"]
     else:
-        size = (feature_extractor.size["height"], feature_extractor.size["width"])
+        size = (image_processor.size["height"], image_processor.size["width"])
     transforms = Compose(
         [
             Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
             RandomResizedCrop(size, scale=(0.2, 1.0), interpolation=InterpolationMode.BICUBIC),
             RandomHorizontalFlip(),
             ToTensor(),
-            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
+            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
         ]
     )
 
@@ -349,7 +349,7 @@ def main():
         args=training_args,
         train_dataset=ds["train"] if training_args.do_train else None,
         eval_dataset=ds["validation"] if training_args.do_eval else None,
-        tokenizer=feature_extractor,
+        tokenizer=image_processor,
         data_collator=collate_fn,
     )
 
@@ -27,10 +27,10 @@ from torchvision.transforms import Compose, Lambda, Normalize, RandomHorizontalF
 import transformers
 from transformers import (
     CONFIG_MAPPING,
-    FEATURE_EXTRACTOR_MAPPING,
+    IMAGE_PROCESSOR_MAPPING,
     MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
     AutoConfig,
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModelForMaskedImageModeling,
     HfArgumentParser,
     Trainer,
@@ -115,7 +115,7 @@ class DataTrainingArguments:
 @dataclass
 class ModelArguments:
     """
-    Arguments pertaining to which model/config/feature extractor we are going to pre-train.
+    Arguments pertaining to which model/config/image processor we are going to pre-train.
     """
 
     model_name_or_path: str = field(
@@ -152,7 +152,7 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
     use_auth_token: bool = field(
         default=False,
         metadata={
@@ -334,17 +334,16 @@ def main():
         }
     )
 
-    # create feature extractor
-    if model_args.feature_extractor_name:
-        feature_extractor = AutoFeatureExtractor.from_pretrained(model_args.feature_extractor_name, **config_kwargs)
+    # create image processor
+    if model_args.image_processor_name:
+        image_processor = AutoImageProcessor.from_pretrained(model_args.image_processor_name, **config_kwargs)
     elif model_args.model_name_or_path:
-        feature_extractor = AutoFeatureExtractor.from_pretrained(model_args.model_name_or_path, **config_kwargs)
+        image_processor = AutoImageProcessor.from_pretrained(model_args.model_name_or_path, **config_kwargs)
     else:
-        FEATURE_EXTRACTOR_TYPES = {
-            conf.model_type: feature_extractor_class
-            for conf, feature_extractor_class in FEATURE_EXTRACTOR_MAPPING.items()
+        IMAGE_PROCESSOR_TYPES = {
+            conf.model_type: image_processor_class for conf, image_processor_class in IMAGE_PROCESSOR_MAPPING.items()
         }
-        feature_extractor = FEATURE_EXTRACTOR_TYPES[model_args.model_type]()
+        image_processor = IMAGE_PROCESSOR_TYPES[model_args.model_type]()
 
     # create model
     if model_args.model_name_or_path:
@@ -382,7 +381,7 @@ def main():
             RandomResizedCrop(model_args.image_size, scale=(0.67, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0)),
             RandomHorizontalFlip(),
             ToTensor(),
-            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
+            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
         ]
     )
 
@@ -427,7 +426,7 @@ def main():
         args=training_args,
         train_dataset=ds["train"] if training_args.do_train else None,
         eval_dataset=ds["validation"] if training_args.do_eval else None,
-        tokenizer=feature_extractor,
+        tokenizer=image_processor,
         data_collator=collate_fn,
     )
 
@@ -40,7 +40,7 @@ from datasets import Dataset, DatasetDict, Image
 
 # your images can of course have a different extension
 # semantic segmentation maps are typically stored in the png format
 image_paths_train = ["path/to/image_1.jpg/jpg", "path/to/image_2.jpg/jpg", ..., "path/to/image_n.jpg/jpg"]
 label_paths_train = ["path/to/annotation_1.png", "path/to/annotation_2.png", ..., "path/to/annotation_n.png"]
 
 # same for validation
@@ -52,7 +52,7 @@ def create_dataset(image_paths, label_paths):
                                 "label": sorted(label_paths)})
     dataset = dataset.cast_column("image", Image())
     dataset = dataset.cast_column("label", Image())
 
     return dataset
 
 # step 1: create Dataset objects
@@ -91,7 +91,7 @@ You can easily upload this by clicking on "Add file" in the "Files and versions"
 
 ## PyTorch version, Trainer
 
 Based on the script [`run_semantic_segmentation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py).
 
 The script leverages the [🤗 Trainer API](https://huggingface.co/docs/transformers/main_classes/trainer) to automatically take care of the training for you, running on distributed environments right away.
 
@@ -130,7 +130,7 @@ Note that you can replace the model and dataset by simply setting the `model_nam
 
 Based on the script [`run_semantic_segmentation_no_trainer.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py).
 
 The script leverages [🤗 `Accelerate`](https://github.com/huggingface/accelerate), which allows to write your own training loop in PyTorch, but have it run instantly on any (distributed) environment, including CPU, multi-CPU, GPU, multi-GPU and TPU. It also supports mixed precision.
 
 First, run:
 
@@ -161,11 +161,11 @@ The resulting model can be seen here: https://huggingface.co/nielsr/segformer-fi
 This means that after training, you can easily load your trained model as follows:
 
 ```python
-from transformers import AutoFeatureExtractor, AutoModelForSemanticSegmentation
+from transformers import AutoImageProcessor, AutoModelForSemanticSegmentation
 
 model_name = "name_of_repo_on_the_hub_or_path_to_local_folder"
 
-feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
+image_processor = AutoImageProcessor.from_pretrained(model_name)
 model = AutoModelForSemanticSegmentation.from_pretrained(model_name)
 ```
 
@@ -180,7 +180,7 @@ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
 
 # prepare image for the model
-inputs = feature_extractor(images=image, return_tensors="pt")
+inputs = image_processor(images=image, return_tensors="pt")
 
 with torch.no_grad():
     outputs = model(**inputs)
@@ -201,4 +201,4 @@ For visualization of the segmentation maps, we refer to the [example notebook](h
 
 Some datasets, like [`scene_parse_150`](https://huggingface.co/datasets/scene_parse_150), contain a "background" label that is not part of the classes. The Scene Parse 150 dataset for instance contains labels between 0 and 150, with 0 being the background class, and 1 to 150 being actual class names (like "tree", "person", etc.). For these kind of datasets, one replaces the background label (0) by 255, which is the `ignore_index` of the PyTorch model's loss function, and reduces all labels by 1. This way, the `labels` are PyTorch tensors containing values between 0 and 149, and 255 for all background/padding.
 
 In case you're training on such a dataset, make sure to set the ``reduce_labels`` flag, which will take care of this.
@@ -34,7 +34,7 @@ import transformers
 from huggingface_hub import hf_hub_download
 from transformers import (
     AutoConfig,
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModelForSemanticSegmentation,
     HfArgumentParser,
     Trainer,
@@ -240,7 +240,7 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
     use_auth_token: bool = field(
         default=False,
         metadata={
@@ -358,7 +358,7 @@ def main():
             references=labels,
             num_labels=len(id2label),
             ignore_index=0,
-            reduce_labels=feature_extractor.do_reduce_labels,
+            reduce_labels=image_processor.do_reduce_labels,
         )
         # add per category metrics as individual key-value pairs
         per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
@@ -385,8 +385,8 @@ def main():
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name or model_args.model_name_or_path,
+    image_processor = AutoImageProcessor.from_pretrained(
+        model_args.image_processor_name or model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
@@ -395,11 +395,11 @@ def main():
     # Define torchvision transforms to be applied to each image + target.
     # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9
     # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py
-    if "shortest_edge" in feature_extractor.size:
+    if "shortest_edge" in image_processor.size:
         # We instead set the target size as (shortest_edge, shortest_edge) to here to ensure all images are batchable.
-        size = (feature_extractor.size["shortest_edge"], feature_extractor.size["shortest_edge"])
+        size = (image_processor.size["shortest_edge"], image_processor.size["shortest_edge"])
     else:
-        size = (feature_extractor.size["height"], feature_extractor.size["width"])
+        size = (image_processor.size["height"], image_processor.size["width"])
     train_transforms = Compose(
         [
             ReduceLabels() if data_args.reduce_labels else Identity(),
@@ -407,7 +407,7 @@ def main():
             RandomHorizontalFlip(flip_prob=0.5),
             PILToTensor(),
             ConvertImageDtype(torch.float),
-            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
+            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
         ]
     )
     # Define torchvision transform to be applied to each image.
@@ -418,7 +418,7 @@ def main():
             Resize(size=size),
             PILToTensor(),
             ConvertImageDtype(torch.float),
-            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
+            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
         ]
     )
 
@@ -477,7 +477,7 @@ def main():
         train_dataset=dataset["train"] if training_args.do_train else None,
         eval_dataset=dataset["validation"] if training_args.do_eval else None,
         compute_metrics=compute_metrics,
-        tokenizer=feature_extractor,
+        tokenizer=image_processor,
         data_collator=default_data_collator,
     )
 
@@ -39,7 +39,7 @@ from accelerate.utils import set_seed
 from huggingface_hub import Repository, create_repo, hf_hub_download
 from transformers import (
     AutoConfig,
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModelForSemanticSegmentation,
     SchedulerType,
     default_data_collator,
@@ -397,20 +397,20 @@ def main():
     id2label = {int(k): v for k, v in id2label.items()}
     label2id = {v: k for k, v in id2label.items()}
 
-    # Load pretrained model and feature extractor
+    # Load pretrained model and image processor
     config = AutoConfig.from_pretrained(args.model_name_or_path, id2label=id2label, label2id=label2id)
-    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_name_or_path)
+    image_processor = AutoImageProcessor.from_pretrained(args.model_name_or_path)
     model = AutoModelForSemanticSegmentation.from_pretrained(args.model_name_or_path, config=config)
 
     # Preprocessing the datasets
     # Define torchvision transforms to be applied to each image + target.
     # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9
     # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py
-    if "shortest_edge" in feature_extractor.size:
+    if "shortest_edge" in image_processor.size:
         # We instead set the target size as (shortest_edge, shortest_edge) to here to ensure all images are batchable.
-        size = (feature_extractor.size["shortest_edge"], feature_extractor.size["shortest_edge"])
+        size = (image_processor.size["shortest_edge"], image_processor.size["shortest_edge"])
     else:
-        size = (feature_extractor.size["height"], feature_extractor.size["width"])
+        size = (image_processor.size["height"], image_processor.size["width"])
     train_transforms = Compose(
         [
             ReduceLabels() if args.reduce_labels else Identity(),
@@ -418,7 +418,7 @@ def main():
             RandomHorizontalFlip(flip_prob=0.5),
             PILToTensor(),
             ConvertImageDtype(torch.float),
-            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
+            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
         ]
     )
     # Define torchvision transform to be applied to each image.
@@ -429,7 +429,7 @@ def main():
             Resize(size=size),
             PILToTensor(),
             ConvertImageDtype(torch.float),
-            Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
+            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
         ]
     )
 
@@ -602,7 +602,7 @@ def main():
                     save_function=accelerator.save,
                 )
                 if accelerator.is_main_process:
-                    feature_extractor.save_pretrained(args.output_dir)
+                    image_processor.save_pretrained(args.output_dir)
                     repo.push_to_hub(
                         commit_message=f"Training in progress {completed_steps} steps",
                         blocking=False,
@@ -657,7 +657,7 @@ def main():
                 args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
             )
             if accelerator.is_main_process:
-                feature_extractor.save_pretrained(args.output_dir)
+                image_processor.save_pretrained(args.output_dir)
                 repo.push_to_hub(
                     commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                 )
@@ -678,7 +678,7 @@ def main():
             args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
         )
         if accelerator.is_main_process:
-            feature_extractor.save_pretrained(args.output_dir)
+            image_processor.save_pretrained(args.output_dir)
             if args.push_to_hub:
                 repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
 
@@ -340,7 +340,7 @@ def main():
     model.config.id2label = id2label
 
     # Preprocessing the dataset
-    # The processor does everything for us (prepare the image using LayoutLMv3FeatureExtractor
+    # The processor does everything for us (prepare the image using LayoutLMv3ImageProcessor
     # and prepare the words, boxes and word-level labels using LayoutLMv3TokenizerFast)
     def prepare_examples(examples):
         images = examples[image_column_name]
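
Several hunks above also switch the `tokenizer=feature_extractor` argument passed to `Trainer` over to `tokenizer=image_processor`. A minimal sketch of that usage, with an illustrative checkpoint that is not part of this commit:

```python
# Minimal sketch: handing the image processor to Trainer via the `tokenizer`
# argument, as the updated examples do, so it gets saved next to the model.
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    Trainer,
    TrainingArguments,
)

checkpoint = "google/vit-base-patch16-224-in21k"  # illustrative checkpoint
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForImageClassification.from_pretrained(checkpoint)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="out"),
    tokenizer=image_processor,  # saved alongside the model on save/push
)
trainer.save_model("out")  # writes config, weights, and preprocessor_config.json
```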