Class `swarmauri_embedding_mlm.MlmEmbedding.MlmEmbedding`

swarmauri_embedding_mlm.MlmEmbedding.MlmEmbedding

MlmEmbedding(**kwargs)

Bases: EmbeddingBase

EmbeddingBase implementation that fine-tunes a Masked Language Model (MLM).

Source code in swarmauri_embedding_mlm/MlmEmbedding.py

def __init__(self, **kwargs):
    """
    Load the pretrained tokenizer and masked-LM named by ``embedding_name``.

    Keyword arguments are forwarded unchanged to the parent initializer. The
    model is moved to CUDA when ``torch.cuda.is_available()`` is true and to the
    CPU otherwise, and the id of the tokenizer's mask token is cached on the
    instance.
    """
    super().__init__(**kwargs)

    pretrained_name = self.embedding_name
    self._tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
    self._model = AutoModelForMaskedLM.from_pretrained(pretrained_name)

    # Pick the device once; the same device is reused for every later batch.
    device_kind = "cuda" if torch.cuda.is_available() else "cpu"
    self._device = torch.device(device_kind)
    self._model.to(self._device)

    # Look the mask token up through the tokenizer so the id matches its vocab.
    mask_ids = self._tokenizer.convert_tokens_to_ids([self._tokenizer.mask_token])
    self._mask_token_id = mask_ids[0]

embedding_name `class-attribute` `instance-attribute`

embedding_name = 'bert-base-uncased'

batch_size `class-attribute` `instance-attribute`

batch_size = 32

learning_rate `class-attribute` `instance-attribute`

learning_rate = 5e-05

masking_ratio `class-attribute` `instance-attribute`

masking_ratio = 0.15

randomness_ratio `class-attribute` `instance-attribute`

randomness_ratio = 0.1

epochs `class-attribute` `instance-attribute`

epochs = 0

add_new_tokens `class-attribute` `instance-attribute`

add_new_tokens = False

type `class-attribute` `instance-attribute`

type = 'MlmEmbedding'

model_config `class-attribute` `instance-attribute`

model_config = ConfigDict(
    extra="allow", arbitrary_types_allowed=True
)

id `class-attribute` `instance-attribute`

id = Field(default_factory=generate_id)

members `class-attribute` `instance-attribute`

members = None

owners `class-attribute` `instance-attribute`

owners = None

host `class-attribute` `instance-attribute`

host = None

default_logger `class-attribute`

default_logger = None

logger `class-attribute` `instance-attribute`

logger = None

name `class-attribute` `instance-attribute`

name = None

resource `class-attribute` `instance-attribute`

resource = Field(default=EMBEDDING.value, frozen=True)

version `class-attribute` `instance-attribute`

version = '0.1.0'

extract_features

extract_features()

Extracts the tokens from the vocabulary of the fine-tuned MLM.

Returns: `List[str]` — a list of token strings in the model's vocabulary.

Source code in swarmauri_embedding_mlm/MlmEmbedding.py

def extract_features(self) -> List[str]:
    """
    List the token string for every id the tokenizer knows about.

    Returns:
    - List[str]: One token per id in ``range(len(tokenizer))``, in id order.
    """
    every_id = range(len(self._tokenizer))
    # Each id is converted on its own, so the result is a flat list of tokens.
    return list(map(lambda token_id: self._tokenizer.convert_ids_to_tokens(token_id), every_id))

fit

fit(documents)

Source code in swarmauri_embedding_mlm/MlmEmbedding.py

def fit(self, documents: List[Union[str, Any]]):
    """
    Run one masked-language-model training pass over ``documents``.

    When ``add_new_tokens`` is set, the words reported by ``find_new_tokens``
    are added to the tokenizer and the model's token embeddings are resized to
    the new tokenizer length. The documents are then tokenized (padded, truncated
    to 512 tokens), masked through ``_mask_tokens`` and fed to the model in
    shuffled mini-batches of ``batch_size`` using a fresh AdamW optimizer at
    ``learning_rate``. ``epochs`` is incremented once per call.

    Parameters:
    - documents (List[Union[str, Any]]): Texts to fine-tune on.
    """
    # Check if we need to add new tokens
    if self.add_new_tokens:
        new_tokens = self.find_new_tokens(documents)
        if new_tokens:
            num_added_toks = self._tokenizer.add_tokens(new_tokens)
            if num_added_toks > 0:
                logging.info(f"Added {num_added_toks} new tokens.")
                # The model is stored on ``_model`` (see every other use in this
                # method); there is no public ``model`` attribute to resize.
                self._model.resize_token_embeddings(len(self._tokenizer))

    encodings = self._tokenizer(
        documents,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    input_ids, attention_mask, labels = self._mask_tokens(encodings)
    optimizer = AdamW(self._model.parameters(), lr=self.learning_rate)
    dataset = TensorDataset(input_ids, attention_mask, labels)
    data_loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

    self._model.train()
    last_loss = None  # stays None when the loader yields no batches
    for batch in data_loader:
        batch = {
            k: v.to(self._device)
            for k, v in zip(["input_ids", "attention_mask", "labels"], batch)
        }
        outputs = self._model(**batch)
        last_loss = outputs.loss
        optimizer.zero_grad()
        last_loss.backward()
        optimizer.step()
    self.epochs += 1
    if last_loss is None:
        # Nothing was trained on; avoid referencing an unbound loss.
        logging.info(f"Epoch {self.epochs} complete. No batches were processed.")
    else:
        logging.info(f"Epoch {self.epochs} complete. Loss {last_loss.item()}")

find_new_tokens

find_new_tokens(documents)

Source code in swarmauri_embedding_mlm/MlmEmbedding.py

def find_new_tokens(self, documents):
    """
    Collect whitespace-separated words that the tokenizer vocabulary lacks.

    Parameters:
    - documents: Iterable of strings; each is split on whitespace.

    Returns:
    - A list of the unseen words (in no particular order), or ``None`` when
      every word is already in the vocabulary.
    """
    seen_words = set()
    for text in documents:
        # Plain ``str.split`` is the only tokenization applied here.
        seen_words |= set(text.split())

    unseen = seen_words - set(self._tokenizer.get_vocab())
    return list(unseen) or None

transform

transform(documents)

Generates embeddings for a list of documents using the fine-tuned MLM.

Source code in swarmauri_embedding_mlm/MlmEmbedding.py

def transform(self, documents: List[Union[str, Any]]) -> List[Vector]:
    """
    Embed each document with the current model, one document at a time.

    The model is put in eval mode. Every document is tokenized on its own
    (padded, truncated to 512 tokens), run without gradient tracking, and the
    output tensor is averaged over dimension 1 (presumably the token axis).
    ``last_hidden_state`` is used when the output exposes it; otherwise the
    ``logits`` entry is used.

    Parameters:
    - documents (List[Union[str, Any]]): Texts to embed.

    Returns:
    - List[Vector]: One vector per input document, in input order.
    """
    self._model.eval()
    vectors = []

    for text in documents:
        encoded = self._tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        on_device = {name: tensor.to(self._device) for name, tensor in encoded.items()}

        with torch.no_grad():
            model_out = self._model(**on_device)

        # Prefer hidden states; fall back to logits when they are not exposed.
        if hasattr(model_out, "last_hidden_state"):
            pooled = model_out.last_hidden_state.mean(1)
        else:
            pooled = model_out["logits"].mean(1)

        as_list = pooled.cpu().numpy().squeeze().tolist()
        vectors.append(Vector(value=as_list))

    return vectors

fit_transform

fit_transform(documents, **kwargs)

Fine-tunes the MLM and generates embeddings for the provided documents.

Source code in swarmauri_embedding_mlm/MlmEmbedding.py

def fit_transform(self, documents: List[Union[str, Any]], **kwargs) -> List[Vector]:
    """
    Train on ``documents`` with ``fit`` and then embed the same documents.

    Parameters:
    - documents (List[Union[str, Any]]): Texts used for both steps.
    - **kwargs: Passed through to ``fit`` unchanged.

    Returns:
    - List[Vector]: The result of ``transform(documents)`` after fitting.
    """
    self.fit(documents, **kwargs)
    vectors = self.transform(documents)
    return vectors

infer_vector

infer_vector(data, *args, **kwargs)

Generates an embedding for the input data.

Parameters: `data` (Union[str, Any]) — the input data, expected to be a textual representation. It may be a single string or a batch of strings.

Source code in swarmauri_embedding_mlm/MlmEmbedding.py

def infer_vector(self, data: Union[str, Any], *args, **kwargs) -> Vector:
    """
    Embed ``data`` with the current model in a single forward pass.

    Parameters:
    - data (Union[str, Any]): Text handed directly to the tokenizer; a single
                              string or a batch of strings.
    - *args, **kwargs: Accepted but not used by this method.

    Returns:
    - Vector: The output tensor averaged over dimension 1, squeezed and
              converted to (possibly nested) Python lists.
    """
    self._model.eval()

    encoded = self._tokenizer(
        data, return_tensors="pt", padding=True, truncation=True, max_length=512
    )
    # Inputs must live on the same device as the model.
    on_device = {name: tensor.to(self._device) for name, tensor in encoded.items()}

    with torch.no_grad():
        model_out = self._model(**on_device)

    # Prefer hidden states; fall back to logits when they are not exposed.
    if hasattr(model_out, "last_hidden_state"):
        pooled = model_out.last_hidden_state.mean(dim=1)
    else:
        pooled = model_out["logits"].mean(1)

    # Bring the result back to the CPU before turning it into plain lists.
    as_list = pooled.cpu().numpy().squeeze().tolist()
    return Vector(value=as_list)

save_model

save_model(path)

Saves the model and tokenizer to the specified directory.

Source code in swarmauri_embedding_mlm/MlmEmbedding.py

def save_model(self, path: str) -> None:
    """
    Write the model, then the tokenizer, to ``path`` via ``save_pretrained``.

    Parameters:
    - path (str): Destination handed to both ``save_pretrained`` calls.
    """
    # Model first, tokenizer second.
    for attribute in ("_model", "_tokenizer"):
        getattr(self, attribute).save_pretrained(path)

load_model

load_model(path)

Loads the model and tokenizer from the specified directory.

Source code in swarmauri_embedding_mlm/MlmEmbedding.py

def load_model(self, path: str) -> None:
    """
    Loads the model and tokenizer from the specified directory.

    The loaded model is moved to the device already stored on the instance, and
    the cached mask-token id is recomputed from the newly loaded tokenizer.

    Parameters:
    - path (str): Location passed to both ``from_pretrained`` calls.
    """
    self._model = AutoModelForMaskedLM.from_pretrained(path)
    self._tokenizer = AutoTokenizer.from_pretrained(path)
    self._model.to(self._device)  # Ensure the model is loaded to the correct device

    # The constructor caches the mask-token id from its tokenizer; a tokenizer
    # loaded from disk may map the mask token to a different id, so refresh it.
    self._mask_token_id = self._tokenizer.convert_tokens_to_ids(
        [self._tokenizer.mask_token]
    )[0]

register_model `classmethod`

register_model()

Decorator to register a base model in the unified registry.

RETURNS	DESCRIPTION
`Callable`	A decorator function that registers the model class. TYPE: `Callable[[Type[BaseModel]], Type[BaseModel]]`

Source code in swarmauri_base/DynamicBase.py

@classmethod
def register_model(cls) -> Callable[[Type[BaseModel]], Type[BaseModel]]:
    """
    Build a decorator that records a class as a base model in the registry.

    Returns:
        Callable: The decorator. It returns the decorated class unchanged; a
        class whose name is already registered is skipped with a warning.
    """

    def decorator(model_cls: Type[BaseModel]):
        """Add ``model_cls`` to the registry unless its name is already taken."""
        model_name = model_cls.__name__

        if model_name not in cls._registry:
            # New entry starts with no subtypes; models are rebuilt afterwards.
            cls._registry[model_name] = {"model_cls": model_cls, "subtypes": {}}
            glogger.debug("Registered base model '%s'.", model_name)
            DynamicBase._recreate_models()
        else:
            glogger.warning(
                "Model '%s' is already registered; skipping duplicate.", model_name
            )

        return model_cls

    return decorator

register_type `classmethod`

register_type(resource_type=None, type_name=None)

Decorator to register a subtype under one or more base models in the unified registry.

PARAMETER	DESCRIPTION
`resource_type`	The base model(s) under which to register the subtype. If None, all direct base classes (except DynamicBase) are used. TYPE: `Optional[Union[Type[T], List[Type[T]]]]` DEFAULT: `None`
`type_name`	An optional custom type name for the subtype. TYPE: `Optional[str]` DEFAULT: `None`

RETURNS	DESCRIPTION
`Callable`	A decorator function that registers the subtype. TYPE: `Callable[[Type[DynamicBase]], Type[DynamicBase]]`

Source code in swarmauri_base/DynamicBase.py

@classmethod
def register_type(
    cls,
    resource_type: Optional[Union[Type[T], List[Type[T]]]] = None,
    type_name: Optional[str] = None,
) -> Callable[[Type["DynamicBase"]], Type["DynamicBase"]]:
    """
    Build a decorator that files a class as a subtype of one or more base models.

    Parameters:
        resource_type (Optional[Union[Type[T], List[Type[T]]]]):
            Base model, or list of base models, to register under. When omitted,
            every direct base of the decorated class other than the class this
            method was called on is used.
        type_name (Optional[str]): Name to register under. Defaults to the
            decorated class's ``_type`` attribute if it has one, else its
            ``__name__``.

    Returns:
        Callable: The decorator; it returns the decorated class unchanged.

    Raises:
        TypeError: Raised by the decorator when the decorated class is not a
            subclass of one of the target base models.
    """

    def decorator(subclass: Type["DynamicBase"]):
        """File ``subclass`` under each target base model."""
        # Normalise the three accepted forms of ``resource_type`` into a list.
        if resource_type is None:
            parents = [base for base in subclass.__bases__ if base is not cls]
        elif isinstance(resource_type, list):
            parents = resource_type
        else:
            parents = [resource_type]

        for parent in parents:
            if not issubclass(subclass, parent):
                raise TypeError(
                    f"'{subclass.__name__}' must be a subclass of '{parent.__name__}'."
                )

            registered_name = type_name or getattr(
                subclass, "_type", subclass.__name__
            )
            parent_name = parent.__name__

            # Base models are created on demand the first time a subtype needs them.
            if parent_name not in cls._registry:
                cls._registry[parent_name] = {"model_cls": parent, "subtypes": {}}
                glogger.debug(
                    "Created new registry entry for base model '%s'.",
                    parent_name,
                )

            siblings = cls._registry[parent_name]["subtypes"]
            if registered_name not in siblings:
                siblings[registered_name] = subclass
                glogger.debug(
                    "Registered '%s' as '%s' under '%s'.",
                    subclass.__name__,
                    registered_name,
                    parent_name,
                )
            else:
                # First registration wins; later duplicates are ignored.
                glogger.warning(
                    "Type '%s' already exists under '%s'; skipping duplicate.",
                    registered_name,
                    parent_name,
                )

        DynamicBase._recreate_models()
        return subclass

    return decorator

model_validate_toml `classmethod`

model_validate_toml(toml_data)

Validate a model from a TOML string.

Source code in swarmauri_base/TomlMixin.py

@classmethod
def model_validate_toml(cls, toml_data: str):
    """Validate a model from a TOML string."""
    try:
        # Parse TOML into a Python dictionary
        toml_content = tomllib.loads(toml_data)

        # Convert the dictionary to JSON and validate using Pydantic
        return cls.model_validate_json(json.dumps(toml_content))
    except tomllib.TOMLDecodeError as e:
        raise ValueError(f"Invalid TOML data: {e}")
    except ValidationError as e:
        raise ValueError(f"Validation failed: {e}")

model_dump_toml

model_dump_toml(
    fields_to_exclude=None, api_key_placeholder=None
)

Return a TOML representation of the model.

Source code in swarmauri_base/TomlMixin.py

def model_dump_toml(self, fields_to_exclude=None, api_key_placeholder=None):
    """
    Return a TOML representation of the model.

    The model's JSON dump is walked recursively: keys listed in
    ``fields_to_exclude`` are dropped at every nesting level, and when
    ``api_key_placeholder`` is not ``None`` every ``"api_key"`` value is replaced
    by it.

    Parameters:
        fields_to_exclude: Collection of key names to omit; ``None`` omits nothing.
        api_key_placeholder: Replacement for ``"api_key"`` values, or ``None`` to
            leave them as they are.

    Returns:
        The filtered data serialized with ``toml.dumps``.
    """
    excluded = [] if fields_to_exclude is None else fields_to_exclude
    mask_api_key = api_key_placeholder is not None

    def scrub(node):
        """Return a filtered copy of ``node``, descending into dicts and lists."""
        if isinstance(node, list):
            return [scrub(child) for child in node]
        if not isinstance(node, dict):
            return node

        cleaned = {}
        for key, value in node.items():
            if key in excluded:
                continue
            if mask_api_key and key == "api_key":
                cleaned[key] = api_key_placeholder
            else:
                cleaned[key] = scrub(value)
        return cleaned

    # Go through JSON first so only JSON-compatible values reach the TOML writer.
    plain_data = json.loads(self.model_dump_json())
    return toml.dumps(scrub(plain_data))

model_validate_yaml `classmethod`

model_validate_yaml(yaml_data)

Validate a model from a YAML string.

Source code in swarmauri_base/YamlMixin.py

@classmethod
def model_validate_yaml(cls, yaml_data: str):
    """
    Validate a model from a YAML string.

    The YAML text is parsed with ``yaml.safe_load``, re-serialized as JSON and
    passed to ``model_validate_json``.

    Parameters:
        yaml_data (str): YAML document describing the model.

    Returns:
        Whatever ``cls.model_validate_json`` returns for the converted data.

    Raises:
        ValueError: If the YAML cannot be parsed or validation fails. The
            original exception is attached as ``__cause__``.
    """
    try:
        # Parse YAML into a Python dictionary
        yaml_content = yaml.safe_load(yaml_data)

        # Convert the dictionary to JSON and validate using Pydantic
        return cls.model_validate_json(json.dumps(yaml_content))
    except yaml.YAMLError as e:
        # Chain explicitly so the parser's error is kept as the direct cause.
        raise ValueError(f"Invalid YAML data: {e}") from e
    except ValidationError as e:
        raise ValueError(f"Validation failed: {e}") from e

model_dump_yaml

model_dump_yaml(
    fields_to_exclude=None, api_key_placeholder=None
)

Return a YAML representation of the model.

Source code in swarmauri_base/YamlMixin.py

def model_dump_yaml(self, fields_to_exclude=None, api_key_placeholder=None):
    """
    Return a YAML representation of the model.

    The model's JSON dump is walked recursively: keys listed in
    ``fields_to_exclude`` are dropped at every nesting level, and when
    ``api_key_placeholder`` is not ``None`` every ``"api_key"`` value is replaced
    by it.

    Parameters:
        fields_to_exclude: Collection of key names to omit; ``None`` omits nothing.
        api_key_placeholder: Replacement for ``"api_key"`` values, or ``None`` to
            leave them as they are.

    Returns:
        The filtered data serialized with ``yaml.safe_dump`` in block style.
    """
    excluded = [] if fields_to_exclude is None else fields_to_exclude
    mask_api_key = api_key_placeholder is not None

    def scrub(node):
        """Return a filtered copy of ``node``, descending into dicts and lists."""
        if isinstance(node, list):
            return [scrub(child) for child in node]
        if not isinstance(node, dict):
            return node

        cleaned = {}
        for key, value in node.items():
            if key in excluded:
                continue
            if mask_api_key and key == "api_key":
                cleaned[key] = api_key_placeholder
            else:
                cleaned[key] = scrub(value)
        return cleaned

    # Go through JSON first so only plain, safe-dumpable values remain.
    plain_data = json.loads(self.model_dump_json())
    return yaml.safe_dump(scrub(plain_data), default_flow_style=False)

model_post_init

model_post_init(logger=None)

Assign a logger instance after model initialization.

Source code in swarmauri_base/LoggerMixin.py

def model_post_init(self, logger: Optional[FullUnion[LoggerBase]] = None) -> None:
    """
    Settle which logger the instance uses once initialization has finished.

    The first truthy candidate wins, in this order: the logger already set on
    the instance, the ``logger`` argument, the class-level ``default_logger``.
    """
    chosen = self.logger or logger or self.default_logger
    self.logger = chosen

Class swarmauri_embedding_mlm.MlmEmbedding.MlmEmbedding

swarmauri_embedding_mlm.MlmEmbedding.MlmEmbedding

embedding_name class-attribute instance-attribute

batch_size class-attribute instance-attribute

learning_rate class-attribute instance-attribute

masking_ratio class-attribute instance-attribute

randomness_ratio class-attribute instance-attribute

epochs class-attribute instance-attribute

add_new_tokens class-attribute instance-attribute

type class-attribute instance-attribute

model_config class-attribute instance-attribute

id class-attribute instance-attribute

members class-attribute instance-attribute

owners class-attribute instance-attribute

host class-attribute instance-attribute

default_logger class-attribute

logger class-attribute instance-attribute

name class-attribute instance-attribute

resource class-attribute instance-attribute

version class-attribute instance-attribute

extract_features

fit

find_new_tokens

transform

fit_transform

infer_vector

save_model

load_model

register_model classmethod

register_type classmethod

model_validate_toml classmethod

model_dump_toml

model_validate_yaml classmethod

model_dump_yaml

model_post_init

Class `swarmauri_embedding_mlm.MlmEmbedding.MlmEmbedding`

embedding_name `class-attribute` `instance-attribute`

batch_size `class-attribute` `instance-attribute`

learning_rate `class-attribute` `instance-attribute`

masking_ratio `class-attribute` `instance-attribute`

randomness_ratio `class-attribute` `instance-attribute`

epochs `class-attribute` `instance-attribute`

add_new_tokens `class-attribute` `instance-attribute`

type `class-attribute` `instance-attribute`

model_config `class-attribute` `instance-attribute`

id `class-attribute` `instance-attribute`

members `class-attribute` `instance-attribute`

owners `class-attribute` `instance-attribute`

host `class-attribute` `instance-attribute`

default_logger `class-attribute`

logger `class-attribute` `instance-attribute`

name `class-attribute` `instance-attribute`

resource `class-attribute` `instance-attribute`

version `class-attribute` `instance-attribute`

register_model `classmethod`

register_type `classmethod`

model_validate_toml `classmethod`

model_validate_yaml `classmethod`