%%capture captured_output
!pip uninstall mindspore -y
!pip install -i https://pypi.mirrors.ustc.edu.cn/simple mindspore==2.2.14
!pip install mindnlp
!pip install jieba
%env HF_ENDPOINT=https://hf-mirror.com
import os
import mindspore
from mindspore.dataset import text, GeneratorDataset, transforms
from mindspore import nn
from mindnlp.dataset import load_dataset
from mindnlp._legacy.engine import Trainer, Evaluator
from mindnlp._legacy.engine.callbacks import CheckpointCallback, BestModelCallback
from mindnlp._legacy.metrics import Accuracy
# Load the IMDB review dataset (train/test splits) through the HF mirror configured above.
imdb_ds = load_dataset('imdb', split=['train', 'test'])
imdb_train = imdb_ds['train']
imdb_test = imdb_ds['test']
imdb_train.get_dataset_size()
import numpy as np
def process_dataset(dataset, tokenizer, max_seq_len=512, batch_size=4, shuffle=False):
    # On Ascend, dynamic shapes are costly, so every sample is padded to
    # max_seq_len up front; on GPU/CPU, padding is deferred to padded_batch.
    is_ascend = mindspore.get_context('device_target') == 'Ascend'

    def tokenize(text):
        if is_ascend:
            tokenized = tokenizer(text, padding='max_length', truncation=True, max_length=max_seq_len)
        else:
            tokenized = tokenizer(text, truncation=True, max_length=max_seq_len)
        return tokenized['input_ids'], tokenized['attention_mask']

    if shuffle:
        dataset = dataset.shuffle(batch_size)

    # Tokenize the raw text and cast the integer labels to int32 under the
    # column name the Trainer expects ("labels").
    dataset = dataset.map(operations=[tokenize], input_columns="text",
                          output_columns=['input_ids', 'attention_mask'])
    dataset = dataset.map(operations=transforms.TypeCast(mindspore.int32),
                          input_columns="label", output_columns="labels")

    if is_ascend:
        dataset = dataset.batch(batch_size)
    else:
        # Pad each batch to its longest sequence: input_ids with the tokenizer's
        # pad token, attention_mask with 0.
        dataset = dataset.padded_batch(batch_size, pad_info={'input_ids': (None, tokenizer.pad_token_id),
                                                             'attention_mask': (None, 0)})
    return dataset
from mindnlp.transformers import GPTTokenizer
# Load the openai-gpt tokenizer and register the special tokens GPT lacks.
gpt_tokenizer = GPTTokenizer.from_pretrained('openai-gpt')
special_tokens_dict = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
}
num_added_toks = gpt_tokenizer.add_special_tokens(special_tokens_dict)
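# Optional sanity check (added sketch, not in the original tutorial): the three
# special tokens should now be registered, and the pad token must resolve to a
# valid id because padded_batch and the model config below rely on it.
print(num_added_toks, gpt_tokenizer.pad_token_id)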
# Hold out 30% of the training split for validation, then build the data pipelines.
imdb_train, imdb_val = imdb_train.split([0.7, 0.3])
dataset_train = process_dataset(imdb_train, gpt_tokenizer, shuffle=True)
dataset_val = process_dataset(imdb_val, gpt_tokenizer)
dataset_test = process_dataset(imdb_test, gpt_tokenizer)
# Peek at one processed batch: (input_ids, attention_mask, labels).
next(dataset_train.create_tuple_iterator())
from mindnlp.transformers import GPTForSequenceClassification
# Binary sentiment classifier head on top of openai-gpt.
model = GPTForSequenceClassification.from_pretrained('openai-gpt', num_labels=2)
model.config.pad_token_id = gpt_tokenizer.pad_token_id
# Grow the embedding table to cover the three special tokens added above.
model.resize_token_embeddings(model.config.vocab_size + 3)
optimizer = nn.Adam(model.trainable_params(), learning_rate=2e-5)
metric = Accuracy()
# Keep up to two rolling checkpoints and automatically reload the
# best-performing weights when training ends.
ckpoint_cb = CheckpointCallback(save_path='checkpoint', ckpt_name='gpt_imdb_finetune',
                                epochs=1, keep_checkpoint_max=2)
best_model_cb = BestModelCallback(save_path='checkpoint', ckpt_name='gpt_imdb_finetune_best',
                                  auto_load=True)
trainer = Trainer(network=model, train_dataset=dataset_train,
                  eval_dataset=dataset_val, metrics=metric,
                  epochs=1, optimizer=optimizer, callbacks=[ckpoint_cb, best_model_cb],
                  jit=False)
trainer.run(tgt_columns="labels")
# Final evaluation on the held-out test split.
evaluator = Evaluator(network=model, eval_dataset=dataset_test, metrics=metric)
evaluator.run(tgt_columns="labels")
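# --- Quick inference sketch (added; not part of the original tutorial) ---
# Classify a single review with the fine-tuned model. This assumes the mindnlp
# tokenizer accepts return_tensors="ms" and that the model returns logits of
# shape (1, 2) either directly or as the first element of a tuple; adapt to
# your mindnlp version if the output format differs. Label 1 is assumed to
# mean "positive" for IMDB.
model.set_train(False)
sample = "This movie was surprisingly good, I enjoyed every minute of it."
encoded = gpt_tokenizer(sample, truncation=True, max_length=512, return_tensors="ms")
outputs = model(input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"])
logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits
print("predicted label:", logits.argmax(-1))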