from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class OneVisionEncoderConfig(PretrainedConfig):
| r""" |
| This is the configuration class to store the configuration of a [`OneVisionEncoderModel`]. It is used to instantiate a |
| OneVision Encoder model according to the specified arguments, defining the model architecture. Instantiating a configuration |
| with the defaults will yield a similar configuration to that of the OneVision Encoder architecture. |
| |
| Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the |
| documentation from [`PretrainedConfig`] for more information. |
| |
| Args: |
| hidden_size (`int`, *optional*, defaults to 1024): |
| Dimensionality of the encoder layers and the pooler layer. |
| intermediate_size (`int`, *optional*, defaults to 4096): |
| Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. |
| num_hidden_layers (`int`, *optional*, defaults to 24): |
| Number of hidden layers in the Transformer encoder. |
| num_attention_heads (`int`, *optional*, defaults to 16): |
| Number of attention heads for each attention layer in the Transformer encoder. |
| num_channels (`int`, *optional*, defaults to 3): |
| The number of input channels. |
        image_size (`int`, *optional*, defaults to 448):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        layer_norm_type (`str`, *optional*, defaults to `"layer_norm"`):
            The type of layer normalization to use. Supported values: `"layer_norm"`, `"rms_norm"`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_temporal_size (`int`, *optional*, defaults to 64):
            The temporal size used by the rotary position embeddings.
        use_head (`bool`, *optional*, defaults to `True`):
            Whether to use the pooling head.

    Example:

    ```python
    >>> from configuration_onevision_encoder import OneVisionEncoderConfig
    >>> from modeling_onevision_encoder import OneVisionEncoderModel

    >>> # Initializing a OneVisionEncoder configuration
    >>> configuration = OneVisionEncoderConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = OneVisionEncoderModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
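
    A configuration can also be created with non-default values; the values below are illustrative only and are not
    taken from any released checkpoint, and the patch-count arithmetic assumes standard non-overlapping ViT-style
    patchification:

    ```python
    >>> # Illustrative, non-default values (not tied to a released checkpoint)
    >>> custom_configuration = OneVisionEncoderConfig(
    ...     hidden_size=768,
    ...     intermediate_size=3072,
    ...     num_hidden_layers=12,
    ...     num_attention_heads=12,
    ...     layer_norm_type="rms_norm",
    ... )
    >>> custom_model = OneVisionEncoderModel(custom_configuration)

    >>> # Assuming non-overlapping ViT-style patchification, the default
    >>> # image_size=448 and patch_size=14 correspond to a 32 x 32 grid of patches
    >>> default_configuration = OneVisionEncoderConfig()
    >>> (default_configuration.image_size // default_configuration.patch_size) ** 2
    1024
    ```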
| """ |
|
|
| model_type = "onevision_encoder" |
|
|
| def __init__( |
        self,
        hidden_size=1024,
        intermediate_size=4096,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_channels=3,
        image_size=448,
        patch_size=14,
        hidden_act="gelu",
        layer_norm_eps=1e-6,
        layer_norm_type="layer_norm",
        attention_dropout=0.0,
        initializer_range=0.02,
        rope_theta=10000.0,
        rope_temporal_size=64,
        use_head=True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.layer_norm_type = layer_norm_type
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.rope_theta = rope_theta
        self.rope_temporal_size = rope_temporal_size
        self.use_head = use_head