Разработка платформы разметки данных для AI
Платформа разметки данных — это не просто Label Studio с кнопкой "задеплоить". Это система управления очередями задач, контроля качества через IAA (Inter-Annotator Agreement), автоматической предразметки через слабые модели и замкнутого цикла active learning, где модель сама запрашивает самые информативные примеры.
Архитектура платформы
[Raw Data Sources]
↓
[Ingestion & Preprocessing] ← конвертация форматов, дедупликация
↓
[Pre-annotation (weak models)] ← экономия 40-70% ручного труда
↓
[Task Queue Management] ← распределение между аннотаторами
↓
[Annotation Interface] ← Label Studio / custom UI
↓
[Quality Control] ← IAA, gold standard, review pipeline
↓
[Export & Model Training] ← JSONL, COCO, YOLO, HuggingFace datasets
↓
[Active Learning Loop] ← модель запрашивает сложные примеры
Управление задачами и аннотаторами
from anthropic import Anthropic
import pandas as pd
from enum import Enum
from dataclasses import dataclass, field
from datetime import datetime
import uuid
import numpy as np
class TaskStatus(Enum):
    """Closed set of states an annotation task can be in.

    Each member's value is the lowercase string form of its name.
    """

    PENDING = "pending"
    PRE_ANNOTATED = "pre_annotated"
    IN_REVIEW = "in_review"
    COMPLETED = "completed"
    DISPUTED = "disputed"
@dataclass
class AnnotationTask:
    """One unit of annotation work.

    Only ``task_id``, ``data`` and ``task_type`` are required; every other
    field falls back to the default declared below.
    """

    task_id: str
    data: dict  # raw data (text, image_url, etc.)
    task_type: str  # classification, ner, segmentation
    # Mutable defaults go through default_factory so instances never share a list.
    annotations: list = field(default_factory=list)
    pre_annotation: dict = field(default=None)  # None until a pre-annotation is attached
    status: TaskStatus = field(default=TaskStatus.PENDING)
    assigned_to: list = field(default_factory=list)
    # Evaluated per instance at construction time (naive datetime.now()).
    created_at: datetime = field(default_factory=datetime.now)
    difficulty_score: float = field(default=0.5)
class AnnotationPlatform:
    """Creates annotation tasks and orders them by LLM-estimated difficulty."""

    def __init__(self, db_connection):
        """Keep the DB handle, create the LLM client and set QC defaults.

        Args:
            db_connection: Stored as ``self.db``; not used by the methods
                visible in this block.
        """
        self.db = db_connection
        self.llm = Anthropic()
        self.quality_threshold = 0.8  # Minimum IAA
        self.annotators_per_task = 2

    def ingest_data(self, raw_data: list[dict], task_type: str) -> "list[AnnotationTask]":
        """Wrap raw items into tasks and return them sorted easiest-first.

        Args:
            raw_data: One dict per item; stored untouched as ``task.data``.
            task_type: Task type assigned to every created task.

        Returns:
            The new tasks, sorted by ascending ``difficulty_score``.
        """
        tasks = [
            AnnotationTask(
                task_id=str(uuid.uuid4()),
                data=item,
                task_type=task_type,
            )
            for item in raw_data
        ]
        # Preliminary difficulty estimate (mutates the tasks in place).
        tasks = self._estimate_difficulty(tasks)
        # Prioritisation: easy tasks first for a quick start.
        tasks.sort(key=lambda t: t.difficulty_score)
        return tasks

    def _estimate_difficulty(self, tasks: "list[AnnotationTask]") -> "list[AnnotationTask]":
        """Ask the LLM to rate the difficulty of the first 20 tasks.

        Only the first 20 tasks are sent (200 characters of ``data['text']``
        each). Tasks outside that sample, or all tasks if the reply cannot be
        parsed, keep their current ``difficulty_score``.

        Args:
            tasks: Tasks to score; they are updated in place.

        Returns:
            The same ``tasks`` list.
        """
        sample = tasks[:20]
        # `or ''` also covers an explicit `text: None`, which cannot be sliced.
        sample_texts = [(t.data.get('text') or '')[:200] for t in sample]
        if not any(sample_texts):
            # Nothing textual to rate (e.g. image-only items): skip the LLM call.
            return tasks

        text_list = "\n".join(
            f"{number}. {text}" for number, text in enumerate(sample_texts, start=1)
        )
        prompt = (
            "Rate the annotation difficulty of these texts (0-1, where 1 is hardest).\n"
            "Consider: ambiguity, domain specificity, length complexity.\n"
            "Texts:\n"
            f"{text_list}\n"
            "Return only comma-separated scores, e.g.: 0.3, 0.7, 0.5..."
        )
        response = self.llm.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=300,
            messages=[{"role": "user", "content": prompt}],
        )

        try:
            scores = [float(s.strip()) for s in response.content[0].text.split(',')]
        except (ValueError, IndexError, AttributeError):
            # Best effort: a malformed or empty reply leaves every score as is.
            return tasks

        # zip() stops at the shorter side, so surplus scores can never leak onto
        # tasks that were not part of the prompt.
        for task, score in zip(sample, scores):
            # Clamp: the model is asked for 0-1 but nothing enforces it.
            task.difficulty_score = min(1.0, max(0.0, score))
        return tasks
Контроль качества через IAA
def compute_iaa(self, annotations: list[dict], task_type: str) -> float:
    """Return the inter-annotator agreement for one task.

    Dispatch by ``task_type``:
    - ``'classification'`` -> ``self._cohen_kappa``
    - ``'ner'``            -> ``self._ner_agreement``
    - anything else        -> ``self._pairwise_agreement``

    Fewer than two annotations are reported as full agreement (1.0).
    """
    if len(annotations) < 2:
        return 1.0

    # Resolve the scorer by name so that only the method actually needed is
    # looked up on `self`.
    # NOTE(review): `_pairwise_agreement` is not defined in the visible part of
    # this file - confirm it exists before relying on the fallback branch.
    scorer_names = {
        'classification': '_cohen_kappa',
        'ner': '_ner_agreement',
    }
    scorer = getattr(self, scorer_names.get(task_type, '_pairwise_agreement'))
    return scorer(annotations)
def _cohen_kappa(self, annotations: list[dict]) -> float:
    """Chance-corrected agreement on classification labels.

    Two annotators are scored with Cohen's kappa (scikit-learn); three or
    more with Fleiss' kappa. Each annotation must look like
    ``{'items': [{'label': ...}, ...]}`` with items in the same order for
    every annotator.

    Returns:
        - 1.0 for fewer than two annotators, or when all annotators agree on
          every item (including the single-label case, where kappa's
          denominator is zero);
        - 0.0 when annotators labelled a different number of items, when
          there are no items at all, or when scikit-learn rejects the input;
        - the kappa value otherwise.
    """
    from collections import Counter

    label_rows = [[item['label'] for item in ann['items']] for ann in annotations]
    n_raters = len(label_rows)
    if n_raters < 2:
        return 1.0

    n_items = len(label_rows[0])
    if any(len(row) != n_items for row in label_rows):
        return 0.0  # items cannot be aligned between annotators
    if n_items == 0:
        return 0.0  # no evidence of agreement
    if all(row == label_rows[0] for row in label_rows[1:]):
        # Perfect agreement. Handled up front because kappa is 0/0 (NaN)
        # when every rating falls into a single category.
        return 1.0

    if n_raters == 2:
        from sklearn.metrics import cohen_kappa_score

        try:
            return float(cohen_kappa_score(label_rows[0], label_rows[1]))
        except (ValueError, TypeError):
            return 0.0

    # Fleiss' kappa for three or more annotators.
    category_totals = Counter()
    per_item_agreement = 0.0
    for item_labels in zip(*label_rows):
        counts = Counter(item_labels)
        category_totals.update(counts)
        # Share of annotator pairs that agree on this item.
        per_item_agreement += (
            sum(c * c for c in counts.values()) - n_raters
        ) / (n_raters * (n_raters - 1))
    observed = per_item_agreement / n_items
    total_ratings = n_items * n_raters
    expected = sum((c / total_ratings) ** 2 for c in category_totals.values())
    # expected == 1 needs a single category overall, i.e. full agreement,
    # which has already returned above; so the denominator is non-zero here.
    return (observed - expected) / (1 - expected)
def _ner_agreement(self, annotations: list[dict]) -> float:
    """Mean pairwise F1 agreement on named-entity spans.

    Two spans match only if ``start``, ``end`` and ``label`` are all equal.
    Every pair of annotators is scored with F1 and the scores are averaged;
    with exactly two annotators this is simply their F1.

    Args:
        annotations: One dict per annotator; entities are read from the
            optional ``'entities'`` key, each with ``start``/``end``/``label``.

    Returns:
        A value in [0, 1]. 1.0 for fewer than two annotators; a pair where
        neither annotator marked any entity counts as full agreement.
    """
    from itertools import combinations

    if len(annotations) < 2:
        return 1.0

    span_sets = [
        {(e['start'], e['end'], e['label']) for e in ann.get('entities', [])}
        for ann in annotations
    ]

    pair_scores = []
    for spans_a, spans_b in combinations(span_sets, 2):
        if not spans_a and not spans_b:
            pair_scores.append(1.0)
            continue
        shared = len(spans_a & spans_b)
        # F1 of precision = shared/|B| and recall = shared/|A| simplifies to
        # the Dice coefficient; it is 0.0 when nothing is shared.
        pair_scores.append(2 * shared / (len(spans_a) + len(spans_b)))
    return sum(pair_scores) / len(pair_scores)
def review_disputed_task(self, task: "AnnotationTask",
                         annotations: list[dict]) -> dict:
    """Ask the LLM to arbitrate a labeling dispute.

    Every annotation is shown to the model (labelled ``Annotator A``,
    ``Annotator B``, ...) together with the task type and the first 500
    characters of ``task.data['text']``.

    Args:
        task: The disputed task; ``task_type``, ``data`` and ``task_id``
            are read.
        annotations: The conflicting annotations, one per annotator.

    Returns:
        A dict with ``'resolution'`` (the raw model reply),
        ``'resolved_by'`` (always ``'llm_arbitration'``) and ``'task_id'``.

    Raises:
        ValueError: If fewer than two annotations are given.
    """
    if len(annotations) < 2:
        raise ValueError("review_disputed_task needs at least two annotations")

    # Letters for the first 26 annotators, then plain 1-based numbers.
    annotator_lines = [
        f"Annotator {chr(ord('A') + i) if i < 26 else i + 1}: {annotation}"
        for i, annotation in enumerate(annotations)
    ]
    prompt = "\n".join([
        "You are a senior annotation expert. Resolve this labeling dispute.",
        f"Task type: {task.task_type}",
        f"Text: {task.data.get('text', '')[:500]}",
        *annotator_lines,
        "Provide:",
        "1. Correct annotation",
        "2. Brief reasoning (1-2 sentences)",
        "3. Guideline clarification needed (if any)",
    ])

    response = self.llm.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=400,
        messages=[{"role": "user", "content": prompt}],
    )
    return {
        'resolution': response.content[0].text,
        'resolved_by': 'llm_arbitration',
        'task_id': task.task_id,
    }
Автоматическая предразметка
class PreAnnotationEngine:
    """Pre-annotates data with weak models to reduce annotator workload.

    Models are loaded lazily on first use and cached per task kind, so one
    engine can serve both classification and NER requests.
    """

    # Entity types requested from the NER model when the caller passes none.
    DEFAULT_ENTITY_TYPES = ("person", "organization", "location", "date", "product")

    def __init__(self, task_type: str, device: int = 0):
        """
        Args:
            task_type: Stored as ``self.task_type``.
            device: Passed as ``device`` to the transformers pipeline when
                the classifier is loaded. Defaults to 0, the value that
                used to be hard-coded.
        """
        self.task_type = task_type
        self.device = device
        # Most recently used model. A model assigned here before the first
        # call is adopted instead of loading one.
        self.weak_model = None
        self.confidence_threshold = 0.85  # only high-confidence results skip review
        self._models = {}  # task kind -> loaded model

    def _model_for(self, kind: str, loader):
        """Return the cached model for ``kind``, loading it on first use."""
        if kind not in self._models:
            if self.weak_model is not None and not self._models:
                # Model injected through the public attribute before any
                # lazy load happened: use it as-is.
                self._models[kind] = self.weak_model
            else:
                self._models[kind] = loader()
        self.weak_model = self._models[kind]
        return self.weak_model

    def _load_classifier(self):
        """Build the zero-shot classification pipeline."""
        from transformers import pipeline

        return pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
            device=self.device,
        )

    @staticmethod
    def _load_ner_model():
        """Load the pretrained GLiNER model."""
        from gliner import GLiNER

        return GLiNER.from_pretrained("urchade/gliner_multi-v2.1")

    def pre_annotate_classification(self, texts: list[str],
                                    labels: list[str]) -> list[dict]:
        """Zero-shot classification of ``texts`` against ``labels``.

        Returns:
            One dict per text with ``'label'`` (top-ranked label),
            ``'confidence'`` (its score) and ``'auto_accepted'`` (True when
            the score reaches ``self.confidence_threshold``).
        """
        classifier = self._model_for("classification", self._load_classifier)
        results = []
        batch_size = 32
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            preds = classifier(batch, candidate_labels=labels, batch_size=batch_size)
            if isinstance(preds, dict):
                # Tolerate a bare dict returned for a single-element batch.
                preds = [preds]
            for pred in preds:
                confidence = pred['scores'][0]
                results.append({
                    'label': pred['labels'][0],
                    'confidence': confidence,
                    'auto_accepted': confidence >= self.confidence_threshold,
                })
        return results

    def pre_annotate_ner(self, texts: list[str],
                         entity_types: "list[str] | None" = None) -> list[dict]:
        """Named-entity pre-annotation through GLiNER.

        Args:
            texts: Texts to annotate.
            entity_types: Entity labels to look for; defaults to
                ``DEFAULT_ENTITY_TYPES``.

        Returns:
            One dict per text with ``'entities'`` (start, end, label,
            confidence) and ``'auto_accepted'``. A text is auto-accepted only
            if at least one entity was found and every entity reaches
            ``self.confidence_threshold``; a text with no detected entities
            carries no confidence evidence and is left for review.
        """
        if entity_types is None:
            entity_types = self.DEFAULT_ENTITY_TYPES
        wanted = list(entity_types)

        ner_model = self._model_for("ner", self._load_ner_model)
        results = []
        for text in texts:
            entities = ner_model.predict_entities(text, wanted)
            results.append({
                'entities': [
                    {'start': e['start'], 'end': e['end'],
                     'label': e['label'], 'confidence': e['score']}
                    for e in entities
                ],
                'auto_accepted': bool(entities) and all(
                    e['score'] >= self.confidence_threshold for e in entities
                ),
            })
        return results
Active Learning цикл
class ActiveLearningLoop:
    """Chooses which unlabeled items should be annotated next."""

    # 'hybrid' first keeps this many times `budget` of the most uncertain
    # items, then picks a diverse subset among them.
    HYBRID_POOL_FACTOR = 3

    def select_informative_samples(self, unlabeled_pool: list[dict],
                                   current_model,
                                   strategy: str = 'uncertainty',
                                   budget: int = 100) -> list[int]:
        """Return indices into ``unlabeled_pool`` worth labeling next.

        Strategies:
        - ``'uncertainty'``: highest predictive entropy, taken from
          ``current_model.predict_proba(texts)``; the result is ordered by
          ascending entropy.
        - ``'diversity'``: greedy farthest-point (core-set style) selection
          over ``current_model.encode(texts)``, starting from a random item.
        - ``'hybrid'``: the ``HYBRID_POOL_FACTOR * budget`` most uncertain
          items are shortlisted, then diversity selection runs on them.
        - anything else: the first ``budget`` indices.

        Args:
            unlabeled_pool: Items; ``item.get('text', '')`` is fed to the model.
            current_model: Must provide ``predict_proba`` and/or ``encode``,
                depending on the strategy.
            strategy: One of the names above.
            budget: Maximum number of indices to return.

        Returns:
            At most ``budget`` distinct plain-int indices; empty when the pool
            is empty or ``budget`` is not positive.
        """
        pool_size = len(unlabeled_pool)
        k = min(budget, pool_size)
        if k <= 0:
            return []

        texts = [item.get('text', '') for item in unlabeled_pool]

        if strategy == 'uncertainty':
            return self._most_uncertain(current_model.predict_proba(texts), k)

        if strategy == 'diversity':
            return self._farthest_point_sample(current_model.encode(texts), k)

        if strategy == 'hybrid':
            shortlist_size = min(pool_size, k * self.HYBRID_POOL_FACTOR)
            shortlist = self._most_uncertain(
                current_model.predict_proba(texts), shortlist_size
            )
            embeddings = current_model.encode([texts[i] for i in shortlist])
            picks = self._farthest_point_sample(embeddings, k)
            return [shortlist[i] for i in picks]

        return list(range(k))

    @staticmethod
    def _most_uncertain(probs, k: int) -> list[int]:
        """Indices of the ``k`` rows of ``probs`` with the highest entropy."""
        probs = np.asarray(probs, dtype=float)
        # The epsilon keeps log() finite for zero probabilities.
        entropy = -np.sum(probs * np.log(probs + 1e-10), axis=1)
        order = np.argsort(entropy)
        # Explicit start index: a `[-k:]` slice would return everything for k == 0.
        return order[len(order) - k:].tolist()

    @staticmethod
    def _farthest_point_sample(embeddings, k: int) -> list[int]:
        """Greedily pick ``k`` distinct rows that are far apart from each other.

        Assumes ``embeddings`` is a 2-D array-like (items x features).
        """
        points = np.asarray(embeddings, dtype=float)
        first = int(np.random.randint(len(points)))
        selected = [first]
        # Distance from every point to its nearest selected point, updated
        # incrementally instead of being recomputed from scratch each round.
        nearest = np.linalg.norm(points - points[first], axis=1)
        nearest[first] = -1.0  # below any real distance: never picked again
        while len(selected) < k:
            candidate = int(np.argmax(nearest))
            selected.append(candidate)
            nearest = np.minimum(
                nearest, np.linalg.norm(points - points[candidate], axis=1)
            )
            nearest[candidate] = -1.0
        return selected
Метрики платформы
| Метрика | Без предразметки | С предразметкой | Active Learning |
|---|---|---|---|
| Задач на 1K документов | 1000 | 300-400 | 150-200 |
| IAA (classification) | 0.82 | 0.88 | 0.91 |
| Время разметки 1K | 8-12 часов | 3-4 часа | 1.5-2 часа |
| Точность финальной модели (относительно полностью ручной разметки) | 100% (базовый уровень) | 97-99% | 98-99% |
Полноценная платформа разворачивается за 2-3 недели. Label Studio self-hosted как базовый UI + кастомный бэкенд для оркестрации — типичная архитектура для команд до 20 аннотаторов.







