label data type error in evaluation when using "multi" eval_type
我在用自定义算法跑联邦下的bert,参考教程:https://fate.readthedocs.io/en/develop-1.11.1/tutorial/pipeline/nn_tutorial/Bert-example/#dataset-imdb-sentimental
其中有一些细节我已经做了修改,IMDB的示例数据已经跑通,所以我进一步想跑一下文本多分类,使用了一个医学数据集:https://www.kaggle.com/datasets/jpmiller/layoutlm, 已经处理成和IMDB一样的格式:
id, text, label
1, some text,0
...
用notebook提交任务:
# Notebook script: build and submit a FATE pipeline that trains a custom BERT
# classifier with HomoNN (fedavg_trainer) and then evaluates the predictions
# with eval_type='multi'.
import torch as t
from torch import nn
from pipeline import fate_torch_hook
from pipeline.component import HomoNN
from pipeline.backend.pipeline import PipeLine
from pipeline.component import Reader, Evaluation, DataTransform
from pipeline.interface import Data, Model
import torch

# The hook must run before any t.nn / t.optim object below is created:
# t.nn.CustModel and the parameter-less t.optim.Adam(...) call are only used
# after this point (presumably both are provided by the hook -- see FATE docs).
fate_torch_hook(t)
import os

fate_project_path = os.path.abspath('/data/projects/fate')

# Guest and host share the same party id, and the guest also acts as arbiter.
guest_0 = 10000
host_1 = 10000
pipeline = PipeLine().set_initiator(role='guest', party_id=guest_0).set_roles(
    guest=guest_0,
    host=host_1,
    arbiter=guest_0,
)

# Register the local CSV as a FATE table. One call is enough; the script
# previously issued the identical bind twice.
data_0 = {"name": "imdb", "namespace": "experiment"}
data_path = '/data/projects/fate/examples/data/train_for_fl.csv'
pipeline.bind_table(name=data_0['name'], namespace=data_0['namespace'], path=data_path)

# reader_0 feeds training and reader_1 feeds validation; both read the same
# table on both parties.
reader_0 = Reader(name="reader_0")
reader_0.get_party_instance(role='guest', party_id=guest_0).component_param(table=data_0)
reader_0.get_party_instance(role='host', party_id=host_1).component_param(table=data_0)
reader_1 = Reader(name="reader_1")
reader_1.get_party_instance(role='guest', party_id=guest_0).component_param(table=data_0)
reader_1.get_party_instance(role='host', party_id=host_1).component_param(table=data_0)

from pipeline.component.homo_nn import DatasetParam, TrainerParam

# The model is the user-defined BertClassifier from module 'bert_', wrapped in
# a Sequential container.
model = t.nn.Sequential(
    t.nn.CustModel(module_name='bert_', class_name='BertClassifier')
)

# NOTE(review): BCELoss is a binary-classification loss that takes float
# targets, while this job is evaluated as multi-class. Presumably
# CrossEntropyLoss with integer labels is what is intended, and float labels
# may be why the evaluation step sees a numpy.float64 label count -- confirm
# against BertClassifier's output shape and the dataset's label dtype.
nn_component = HomoNN(
    name='nn_0',
    model=model,
    loss=t.nn.BCELoss(),
    optimizer=t.optim.Adam(lr=0.001, weight_decay=0.001),
    # use the custom dataset
    dataset=DatasetParam(dataset_name='nlp_tokenizer', tokenizer_name_or_path="bert-base-uncased"),
    trainer=TrainerParam(trainer_name='fedavg_trainer', epochs=2, batch_size=16, data_loader_worker=8, cuda="cuda:0"),
    torch_seed=100,
)

# Wire the components: readers -> HomoNN -> multi-class Evaluation.
pipeline.add_component(reader_0)
pipeline.add_component(reader_1)
pipeline.add_component(nn_component, data=Data(train_data=reader_0.output.data, validate_data=reader_1.output.data))
pipeline.add_component(Evaluation(name='eval_0', eval_type='multi'), data=Data(data=nn_component.output.data))

# Compile the DAG and submit the job.
pipeline.compile()
pipeline.fit()
所以重点是把evaluation改成了“multi”,任务在evaluation报错了:
[ERROR] [2024-07-16 16:31:08,499] [202407161623325775800] [38874:139965857994560] - [task_executor.run] [line:266]: 'numpy.float64' object cannot be interpreted as an integer
Traceback (most recent call last):
  File "/data/projects/fate/fateflow/python/fate_flow/worker/task_executor.py", line 210, in run
    cpn_output = run_object.run(cpn_input)
  File "/data/projects/fate/fate/python/federatedml/model_base.py", line 239, in run
    self._run(cpn_input=cpn_input)
  File "/data/projects/fate/fate/python/federatedml/model_base.py", line 315, in _run
    this_data_output = func(*real_param)
  File "/data/projects/fate/fate/python/federatedml/evaluation/evaluation.py", line 400, in fit
    return self.callback_metric_data(
  File "/data/projects/fate/fate/python/federatedml/evaluation/evaluation.py", line 1001, in callback_metric_data
    self.__save_pr_curve(precision_recall, data_type)
  File "/data/projects/fate/fate/python/federatedml/evaluation/evaluation.py", line 697, in __save_pr_curve
    pos_recall_score, recall_cuts = self.__multi_class_label_padding(
  File "/data/projects/fate/fate/python/federatedml/evaluation/evaluation.py", line 478, in __multi_class_label_padding
    for i in range(label_num):
TypeError: 'numpy.float64' object cannot be interpreted as an integer
顺着报错信息我看到/data/projects/fate/fate/python/federatedml/evaluation/evaluation.py这个文件,核心是label_num是float不是int,因此我修改了evaluation.py, 将label_num强制转成int,任务没有再报错,这里是bug吗?
方便的话可以继续追踪下label_num为什么会是numpy.float64
@kaiwang0112006 想詢問一下:我在跑一開始的 IMDB 範例時,直到訓練完成都一直出現下面的 error,不知道你這邊有沒有遇到一樣的問題?想請教一下,謝謝。
saved nn_0 model HomoNNParam buffer
saved nn_0 model HomoNNMeta buffer
HTTPConnectionPool(host='localhost', port=9380): Read timed out. (read timeout=30.0)
Traceback (most recent call last):
File "/data/projects/python/venv/lib/python3.8/site-packages/urllib3/connectionpool.py", line 445, in _make_request
six.raise_from(e, None)
File "
This issue has been marked as stale because it has been open for 365 days with no activity. If this issue is still relevant or if there is new information, please feel free to update or reopen it.
This issue was closed because it has been inactive for 1 day since being marked as stale. If this issue is still relevant or if there is new information, please feel free to update or reopen it.