| """ |
| NeuralQuantum NQLM Tokenizer for Hugging Face Transformers |
| """ |
|
|
| import json |
| from typing import List, Optional, Union |
| from transformers import PreTrainedTokenizer |
|
|
|
|
class NeuralQuantumTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer for the NeuralQuantum NQLM model."""

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        quantum_token="<|quantum|>",
        classical_token="<|classical|>",
        add_prefix_space=False,
        **kwargs
    ):
        # Special tokens occupy the first three ids.
        vocab = {
            "<|endoftext|>": 0,
            "<|quantum|>": 1,
            "<|classical|>": 2,
        }

        # Single characters fill the rest of the vocabulary, starting at id 3.
        for i, char in enumerate(
            "abcdefghijklmnopqrstuvwxyz"
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            "0123456789 .,!?;:'\"-()[]{}"
        ):
            vocab[char] = i + 3

        # The vocabulary must exist before super().__init__() runs, because
        # the base class consults it while registering special tokens.
        self._vocab = vocab
        self._ids_to_tokens = {v: k for k, v in vocab.items()}

        # Register the mode-switch tokens as special tokens so they are
        # treated atomically rather than split into characters.
        kwargs.setdefault("additional_special_tokens", [quantum_token, classical_token])

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            **kwargs
        )

        self.quantum_token = quantum_token
        self.classical_token = classical_token
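        # Resulting id layout (derived from the construction above):
        #   "<|endoftext|>" -> 0, "<|quantum|>" -> 1, "<|classical|>" -> 2,
        #   "a" -> 3 ... "z" -> 28, "A" -> 29 ... "Z" -> 54,
        #   "0" -> 55 ... "9" -> 64, " " -> 65, "." -> 66 ... "}" -> 80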

    @property
    def vocab_size(self):
        """Size of the base vocabulary (excluding tokens added later)."""
        return len(self._vocab)

    def get_vocab(self):
        # Merge in any tokens added after construction, mirroring the
        # convention used by other slow tokenizers in transformers.
        return dict(self._vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Character-level tokenization: one token per character."""
        # The vocabulary contains only single characters (plus special
        # tokens, which the base class splits off before this is called),
        # so whitespace-delimited word tokens would almost all map to
        # unk_token. Emit one token per character; the space character is
        # itself a vocabulary entry.
        return list(text)

    def _convert_token_to_id(self, token):
        """Convert a token to its id, falling back to the unk id."""
        return self._vocab.get(token, self._vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Convert an id back to its token, falling back to unk_token."""
        return self._ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Concatenate character tokens back into a string."""
        # Tokens are single characters (spaces included), so plain
        # concatenation reproduces the original text.
        return "".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary to a JSON file and return its path."""
        vocab_file = f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json"
        # Use os.path.join rather than string concatenation so the path is
        # correct on every platform.
        vocab_path = os.path.join(save_directory, vocab_file)

        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, indent=2, ensure_ascii=False)

        return (vocab_path,)
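
    # For example, save_vocabulary("out", "nqlm") writes the vocabulary to
    # out/nqlm-vocab.json and returns that path as a one-element tuple.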

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Append eos to a single sequence or to a concatenated pair."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        return token_ids_0 + token_ids_1 + [self.eos_token_id]
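
    # Example with the default eos id of 0:
    #   build_inputs_with_special_tokens([36, 11]) -> [36, 11, 0]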

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """Return a mask with 1 at special-token positions and 0 elsewhere."""
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Mirror build_inputs_with_special_tokens, which appends a single
        # eos token and never inserts a bos token.
        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + [1]
        return ([0] * len(token_ids_0)) + [1]
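

# A minimal usage sketch (illustrative only; assumes a recent transformers
# release and the default special tokens defined above):
if __name__ == "__main__":
    tokenizer = NeuralQuantumTokenizer()

    # Character-level round trip: encode() appends the eos id (0).
    ids = tokenizer.encode("Hello!")
    print(ids)
    print(tokenizer.decode(ids, skip_special_tokens=True))  # -> Hello!

    # The mode-switch tokens stay atomic because they are registered as
    # additional special tokens in __init__.
    print(tokenizer.tokenize("<|quantum|>ab"))  # -> ['<|quantum|>', 'a', 'b']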