GLiNER icon indicating copy to clipboard operation
GLiNER copied to clipboard

Can we have an option to fail without automatic truncation?

Open vrdn-23 opened this issue 1 year ago • 1 comments

Apologies if this is already possible, but is there a setting that makes the model fail (raise an error) when the input is too long, instead of truncating it automatically?

vrdn-23 avatar Feb 10 '25 22:02 vrdn-23

One option is to first chunk your input text using the associated tokenizer then score each text chunk and collect the results.

import time

from gliner import GLiNER
from transformers import AutoTokenizer

# Maximum number of tokens per chunk; the same limit is passed to both the
# tokenizer and the model so chunking matches the model's context window.
chunk_size = 384
# NOTE(review): the tokenizer checkpoint ("microsoft/deberta-v3-large") differs
# from the GLiNER checkpoint ("urchade/gliner_base") — confirm it matches the
# tokenizer GLiNER uses internally, otherwise token counts may disagree.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large", max_length=chunk_size)
model = GLiNER.from_pretrained("urchade/gliner_base", max_length=chunk_size)

# Chunk text using tokenizer
def chunk_text(text, chunk_size, tokenizer):
    # Tokenize the entire text and get input IDs
    tokens = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids[0]  
    # Convert token IDs into chunks of the specified size
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]    
    # Decode each chunk back to text (optional)
    text_chunks = [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]    
    return text_chunks

# Merge adjacent entities of the same type
def merge_entities(chunk, entities):
    if not entities:
        return []
    merged = []
    current = entities[0]
    for next_entity in entities[1:]:
        if next_entity['label'] == current['label'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
            current['text'] = chunk[current['start']: next_entity['end']].strip()
            current['end'] = next_entity['end']
        else:
            merged.append(current)
            current = next_entity
    # Append the last entity
    merged.append(current)
    return merged

# Iterate over chunks and score entities
def score_text_chunks(text_chunks, threshold):
    results = {}
    entities = []
    start = time.time()
    for i, chunk in enumerate(text_chunks, 1):
        # Apply medner_pipe to each chunk to get entities and scores
        entities = model.predict_entities(chunk, labels, flat_ner=True, threshold=threshold)
        merged_entities = merge_entities(chunk, entities)
        # Store the results for each chunk
        results[f"Chunk {i}"] = {
            "text": chunk,
            "entities": [
                {
                    "label": entity["label"],
                    "text": entity["text"],
                    "start": entity["start"],
                    "end": entity["end"],
                    "score": entity["score"]
                }
                for entity in merged_entities
            ]
        }                
        # Display predicted entities and their labels for this chunk
        #print(f"--- Chunk {i} ---")
        #for entity in entities:
        #    print(entity["label"], "=>", entity["text"], "Start:", entity['start'], "End:", entity['end'], "Score:", entity['score'])
        #print("\n")
    end = time.time()
    print(f"\nTotal Scoring Time: {end-start:.2f} seconds \n")
    return results

doctext = "This is the text to be processed"

# Tokenize doctext to assist with chunking
tokens = tokenizer.tokenize(doctext)
token_count = len(tokens)
print("\nToken Count:", token_count)

# Define entities to extract - must be lowercase
labels = ["label1", "label2", "label3"]
print("\nEntity labels:", labels)

# Chunk doctext based on max context length
text_chunks = chunk_text(doctext, chunk_size, tokenizer)

# Iterate over text chunks and score model
results = score_text_chunks(text_chunks, threshold=0.25)

# Print results by chunk. Numbering is 1-based so it matches the
# "Chunk <i>" keys produced above (the original loop started at 0).
for i, chunk in enumerate(results.values(), 1):
    print(f"--- Chunk {i} ---")
    for entity in chunk['entities']:
        print(entity["label"], "=>", entity["text"], "Start:", entity['start'], "End:", entity['end'], "Score:", entity['score'])
    print("\n")

Note that your offsets will still need to be updated to reflect their position in the pre-chunked text.

nadolsw avatar Feb 11 '25 15:02 nadolsw