diff --git a/src/models/ocr_model/model/TexTeller.py b/src/models/ocr_model/model/TexTeller.py
index 08ca257..f7fab2a 100644
--- a/src/models/ocr_model/model/TexTeller.py
+++ b/src/models/ocr_model/model/TexTeller.py
@@ -10,18 +10,18 @@ from ...globals import (
 from transformers import (
     RobertaTokenizerFast,
     VisionEncoderDecoderModel,
-    VisionEncoderDecoderConfig
+    VisionEncoderDecoderConfig,
 )
 
 
 class TexTeller(VisionEncoderDecoderModel):
     REPO_NAME = 'OleehyO/TexTeller'
 
     def __init__(self):
-        config = VisionEncoderDecoderConfig.from_pretrained('/home/lhy/code/TexTeller/src/models/ocr_model/model/trocr-small')
-        config.encoder.image_size = FIXED_IMG_SIZE
-        config.encoder.num_channels = IMG_CHANNELS
-        config.decoder.vocab_size=VOCAB_SIZE
-        config.decoder.max_position_embeddings=MAX_TOKEN_SIZE
+        config = VisionEncoderDecoderConfig.from_pretrained(Path(__file__).resolve().parent / "config.json")
+        config.encoder.image_size = FIXED_IMG_SIZE
+        config.encoder.num_channels = IMG_CHANNELS
+        config.decoder.vocab_size = VOCAB_SIZE
+        config.decoder.max_position_embeddings = MAX_TOKEN_SIZE
 
         super().__init__(config=config)
diff --git a/src/models/ocr_model/model/config.json b/src/models/ocr_model/model/config.json
new file mode 100644
index 0000000..f8ab627
--- /dev/null
+++ b/src/models/ocr_model/model/config.json
@@ -0,0 +1,156 @@
+{
+  "architectures": [
+    "VisionEncoderDecoderModel"
+  ],
+  "decoder": {
+    "_name_or_path": "",
+    "activation_dropout": 0.0,
+    "activation_function": "gelu",
+    "add_cross_attention": true,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "classifier_dropout": 0.0,
+    "d_model": 1024,
+    "decoder_attention_heads": 16,
+    "decoder_ffn_dim": 4096,
+    "decoder_layerdrop": 0.0,
+    "decoder_layers": 12,
+    "decoder_start_token_id": 2,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.1,
+    "early_stopping": false,
+    "cross_attention_hidden_size": 768,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "init_std": 0.02,
+    "is_decoder": true,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 512,
+    "min_length": 0,
+    "model_type": "trocr",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "scale_embedding": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.12.0.dev0",
+    "use_bfloat16": false,
+    "use_cache": false,
+    "vocab_size": 50265
+  },
+  "encoder": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_probs_dropout_prob": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "cross_attention_hidden_size": null,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.0,
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 384,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-12,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "vit",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 16,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "qkv_bias": false,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.12.0.dev0",
+    "use_bfloat16": false
+  },
+  "is_encoder_decoder": true,
+  "model_type": "vision-encoder-decoder",
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": null
}