Skip to content

Commit fd1483f

Browse files
Moshe Shemesh authored and Saeed Mahameed committed
net/mlx5: Add support for FW reporter dump
Add support for the dump callback of the mlx5 FW reporter. Once we trigger a FW dump, the FW writes the core dump to its raw data buffer. The tracer translates the raw data into traces and saves them to a cyclic array. Once the dump is done, the saved trace data is filled into the dump buffer. If the syndrome is not zero, the health buffer content is printed as well.

FW dump example:
$ devlink health dump show pci/0000:82:00.0 reporter fw
dump fw traces:
 timestamp: 509006640427 lost: false event_id: 185 msg: dump general info GVMI=0x0000
 timestamp: 509006645474 lost: false event_id: 185 msg: GVMI management info, gvmi_management context:
 timestamp: 509006654463 lost: false event_id: 185 msg: [000]: 00000000 00000000 00000000 00000000
 timestamp: 509006656127 lost: false event_id: 185 msg: [010]: 00000000 00000000 00000000 00000000
 timestamp: 509006656255 lost: false event_id: 185 msg: [020]: 00000000 00000000 00000000 00000000
 timestamp: 509006656511 lost: false event_id: 185 msg: [030]: 00000000 00000000 00000000 00000000
 timestamp: 509006656639 lost: false event_id: 185 msg: [040]: 00000000 00000000 00000000 00000000
 timestamp: 509006656895 lost: false event_id: 185 msg: [050]: 00000000 00000000 00000000 00000000
 timestamp: 509006657023 lost: false event_id: 185 msg: [060]: 00000000 00000000 00000000 00000000
 timestamp: 509006657180 lost: false event_id: 185 msg: [070]: 00000000 00000000 00000000 00000000
 timestamp: 509006659839 lost: false event_id: 185 msg: CMDIF dbase from IRON: active_dbase_slots = 0x00000000
 timestamp: 509006667391 lost: false event_id: 185 msg: GVMI=0x0000 hw_toc context:
 timestamp: 509006667647 lost: false event_id: 185 msg: [000]: 00000000 00000000 00000000 fffff000
 timestamp: 509006667775 lost: false event_id: 185 msg: [010]: 00000000 00000000 00000000 80d00000
 ... ...

Signed-off-by: Moshe Shemesh <[email protected]>
Signed-off-by: Eran Ben Elisha <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
1 parent 1e34f3e commit fd1483f

File tree

3 files changed

+270
-0
lines changed

3 files changed

+270
-0
lines changed

drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,19 @@ static int mlx5_fw_tracer_allocate_strings_db(struct mlx5_fw_tracer *tracer)
243243
return -ENOMEM;
244244
}
245245

246+
static void
247+
mlx5_fw_tracer_init_saved_traces_array(struct mlx5_fw_tracer *tracer)
248+
{
249+
tracer->st_arr.saved_traces_index = 0;
250+
mutex_init(&tracer->st_arr.lock);
251+
}
252+
253+
static void
254+
mlx5_fw_tracer_clean_saved_traces_array(struct mlx5_fw_tracer *tracer)
255+
{
256+
mutex_destroy(&tracer->st_arr.lock);
257+
}
258+
246259
static void mlx5_tracer_read_strings_db(struct work_struct *work)
247260
{
248261
struct mlx5_fw_tracer *tracer = container_of(work, struct mlx5_fw_tracer,
@@ -522,6 +535,24 @@ static void mlx5_fw_tracer_clean_ready_list(struct mlx5_fw_tracer *tracer)
522535
list_del(&str_frmt->list);
523536
}
524537

538+
static void mlx5_fw_tracer_save_trace(struct mlx5_fw_tracer *tracer,
539+
u64 timestamp, bool lost,
540+
u8 event_id, char *msg)
541+
{
542+
struct mlx5_fw_trace_data *trace_data;
543+
544+
mutex_lock(&tracer->st_arr.lock);
545+
trace_data = &tracer->st_arr.straces[tracer->st_arr.saved_traces_index];
546+
trace_data->timestamp = timestamp;
547+
trace_data->lost = lost;
548+
trace_data->event_id = event_id;
549+
strncpy(trace_data->msg, msg, TRACE_STR_MSG);
550+
551+
tracer->st_arr.saved_traces_index =
552+
(tracer->st_arr.saved_traces_index + 1) & (SAVED_TRACES_NUM - 1);
553+
mutex_unlock(&tracer->st_arr.lock);
554+
}
555+
525556
static void mlx5_tracer_print_trace(struct tracer_string_format *str_frmt,
526557
struct mlx5_core_dev *dev,
527558
u64 trace_timestamp)
@@ -540,6 +571,9 @@ static void mlx5_tracer_print_trace(struct tracer_string_format *str_frmt,
540571
trace_mlx5_fw(dev->tracer, trace_timestamp, str_frmt->lost,
541572
str_frmt->event_id, tmp);
542573

574+
mlx5_fw_tracer_save_trace(dev->tracer, trace_timestamp,
575+
str_frmt->lost, str_frmt->event_id, tmp);
576+
543577
/* remove it from hash */
544578
mlx5_tracer_clean_message(str_frmt);
545579
}
@@ -786,6 +820,109 @@ static void mlx5_fw_tracer_ownership_change(struct work_struct *work)
786820
mlx5_fw_tracer_start(tracer);
787821
}
788822

823+
static int mlx5_fw_tracer_set_core_dump_reg(struct mlx5_core_dev *dev,
824+
u32 *in, int size_in)
825+
{
826+
u32 out[MLX5_ST_SZ_DW(core_dump_reg)] = {};
827+
828+
if (!MLX5_CAP_DEBUG(dev, core_dump_general) &&
829+
!MLX5_CAP_DEBUG(dev, core_dump_qp))
830+
return -EOPNOTSUPP;
831+
832+
return mlx5_core_access_reg(dev, in, size_in, out, sizeof(out),
833+
MLX5_REG_CORE_DUMP, 0, 1);
834+
}
835+
836+
int mlx5_fw_tracer_trigger_core_dump_general(struct mlx5_core_dev *dev)
837+
{
838+
struct mlx5_fw_tracer *tracer = dev->tracer;
839+
u32 in[MLX5_ST_SZ_DW(core_dump_reg)] = {};
840+
int err;
841+
842+
if (!MLX5_CAP_DEBUG(dev, core_dump_general) || !tracer)
843+
return -EOPNOTSUPP;
844+
if (!tracer->owner)
845+
return -EPERM;
846+
847+
MLX5_SET(core_dump_reg, in, core_dump_type, 0x0);
848+
849+
err = mlx5_fw_tracer_set_core_dump_reg(dev, in, sizeof(in));
850+
if (err)
851+
return err;
852+
queue_work(tracer->work_queue, &tracer->handle_traces_work);
853+
flush_workqueue(tracer->work_queue);
854+
return 0;
855+
}
856+
857+
static int
858+
mlx5_devlink_fmsg_fill_trace(struct devlink_fmsg *fmsg,
859+
struct mlx5_fw_trace_data *trace_data)
860+
{
861+
int err;
862+
863+
err = devlink_fmsg_obj_nest_start(fmsg);
864+
if (err)
865+
return err;
866+
867+
err = devlink_fmsg_u64_pair_put(fmsg, "timestamp", trace_data->timestamp);
868+
if (err)
869+
return err;
870+
871+
err = devlink_fmsg_bool_pair_put(fmsg, "lost", trace_data->lost);
872+
if (err)
873+
return err;
874+
875+
err = devlink_fmsg_u8_pair_put(fmsg, "event_id", trace_data->event_id);
876+
if (err)
877+
return err;
878+
879+
err = devlink_fmsg_string_pair_put(fmsg, "msg", trace_data->msg);
880+
if (err)
881+
return err;
882+
883+
err = devlink_fmsg_obj_nest_end(fmsg);
884+
if (err)
885+
return err;
886+
return 0;
887+
}
888+
889+
int mlx5_fw_tracer_get_saved_traces_objects(struct mlx5_fw_tracer *tracer,
890+
struct devlink_fmsg *fmsg)
891+
{
892+
struct mlx5_fw_trace_data *straces = tracer->st_arr.straces;
893+
u32 index, start_index, end_index;
894+
u32 saved_traces_index;
895+
int err;
896+
897+
if (!straces[0].timestamp)
898+
return -ENOMSG;
899+
900+
mutex_lock(&tracer->st_arr.lock);
901+
saved_traces_index = tracer->st_arr.saved_traces_index;
902+
if (straces[saved_traces_index].timestamp)
903+
start_index = saved_traces_index;
904+
else
905+
start_index = 0;
906+
end_index = (saved_traces_index - 1) & (SAVED_TRACES_NUM - 1);
907+
908+
err = devlink_fmsg_arr_pair_nest_start(fmsg, "dump fw traces");
909+
if (err)
910+
goto unlock;
911+
index = start_index;
912+
while (index != end_index) {
913+
err = mlx5_devlink_fmsg_fill_trace(fmsg, &straces[index]);
914+
if (err)
915+
goto unlock;
916+
917+
index = (index + 1) & (SAVED_TRACES_NUM - 1);
918+
}
919+
920+
err = devlink_fmsg_arr_pair_nest_end(fmsg);
921+
unlock:
922+
mutex_unlock(&tracer->st_arr.lock);
923+
return err;
924+
}
925+
789926
/* Create software resources (Buffers, etc ..) */
790927
struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev)
791928
{
@@ -833,6 +970,7 @@ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev)
833970
goto free_log_buf;
834971
}
835972

973+
mlx5_fw_tracer_init_saved_traces_array(tracer);
836974
mlx5_core_dbg(dev, "FWTracer: Tracer created\n");
837975

838976
return tracer;
@@ -917,6 +1055,7 @@ void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer)
9171055
cancel_work_sync(&tracer->read_fw_strings_work);
9181056
mlx5_fw_tracer_clean_ready_list(tracer);
9191057
mlx5_fw_tracer_clean_print_hash(tracer);
1058+
mlx5_fw_tracer_clean_saved_traces_array(tracer);
9201059
mlx5_fw_tracer_free_strings_db(tracer);
9211060
mlx5_fw_tracer_destroy_log_buf(tracer);
9221061
flush_workqueue(tracer->work_queue);

drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,23 @@
4646
#define TRACER_BLOCK_SIZE_BYTE 256
4747
#define TRACES_PER_BLOCK 32
4848

49+
#define TRACE_STR_MSG 256
50+
#define SAVED_TRACES_NUM 8192
51+
4952
#define TRACER_MAX_PARAMS 7
5053
#define MESSAGE_HASH_BITS 6
5154
#define MESSAGE_HASH_SIZE BIT(MESSAGE_HASH_BITS)
5255

5356
#define MASK_52_7 (0x1FFFFFFFFFFF80)
5457
#define MASK_6_0 (0x7F)
5558

59+
struct mlx5_fw_trace_data {
60+
u64 timestamp;
61+
bool lost;
62+
u8 event_id;
63+
char msg[TRACE_STR_MSG];
64+
};
65+
5666
struct mlx5_fw_tracer {
5767
struct mlx5_core_dev *dev;
5868
struct mlx5_nb nb;
@@ -83,6 +93,13 @@ struct mlx5_fw_tracer {
8393
u32 consumer_index;
8494
} buff;
8595

96+
/* Saved Traces Array */
97+
struct {
98+
struct mlx5_fw_trace_data straces[SAVED_TRACES_NUM];
99+
u32 saved_traces_index;
100+
struct mutex lock; /* Protect st_arr access */
101+
} st_arr;
102+
86103
u64 last_timestamp;
87104
struct work_struct handle_traces_work;
88105
struct hlist_head hash[MESSAGE_HASH_SIZE];
@@ -171,5 +188,8 @@ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev);
171188
int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer);
172189
void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer);
173190
void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer);
191+
int mlx5_fw_tracer_trigger_core_dump_general(struct mlx5_core_dev *dev);
192+
int mlx5_fw_tracer_get_saved_traces_objects(struct mlx5_fw_tracer *tracer,
193+
struct devlink_fmsg *fmsg);
174194

175195
#endif

drivers/net/ethernet/mellanox/mlx5/core/health.c

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include "lib/eq.h"
4242
#include "lib/mlx5.h"
4343
#include "lib/pci_vsc.h"
44+
#include "diag/fw_tracer.h"
4445

4546
enum {
4647
MLX5_HEALTH_POLL_INTERVAL = 2 * HZ,
@@ -405,9 +406,119 @@ mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
405406
return devlink_fmsg_string_pair_put(fmsg, "Description", hsynd_str(synd));
406407
}
407408

409+
struct mlx5_fw_reporter_ctx {
410+
u8 err_synd;
411+
int miss_counter;
412+
};
413+
414+
static int
415+
mlx5_fw_reporter_ctx_pairs_put(struct devlink_fmsg *fmsg,
416+
struct mlx5_fw_reporter_ctx *fw_reporter_ctx)
417+
{
418+
int err;
419+
420+
err = devlink_fmsg_u8_pair_put(fmsg, "syndrome",
421+
fw_reporter_ctx->err_synd);
422+
if (err)
423+
return err;
424+
err = devlink_fmsg_u32_pair_put(fmsg, "fw_miss_counter",
425+
fw_reporter_ctx->miss_counter);
426+
if (err)
427+
return err;
428+
return 0;
429+
}
430+
431+
static int
432+
mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev,
433+
struct devlink_fmsg *fmsg)
434+
{
435+
struct mlx5_core_health *health = &dev->priv.health;
436+
struct health_buffer __iomem *h = health->health;
437+
int err;
438+
int i;
439+
440+
if (!ioread8(&h->synd))
441+
return 0;
442+
443+
err = devlink_fmsg_pair_nest_start(fmsg, "health buffer");
444+
if (err)
445+
return err;
446+
err = devlink_fmsg_obj_nest_start(fmsg);
447+
if (err)
448+
return err;
449+
err = devlink_fmsg_arr_pair_nest_start(fmsg, "assert_var");
450+
if (err)
451+
return err;
452+
453+
for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) {
454+
err = devlink_fmsg_u32_put(fmsg, ioread32be(h->assert_var + i));
455+
if (err)
456+
return err;
457+
}
458+
err = devlink_fmsg_arr_pair_nest_end(fmsg);
459+
if (err)
460+
return err;
461+
err = devlink_fmsg_u32_pair_put(fmsg, "assert_exit_ptr",
462+
ioread32be(&h->assert_exit_ptr));
463+
if (err)
464+
return err;
465+
err = devlink_fmsg_u32_pair_put(fmsg, "assert_callra",
466+
ioread32be(&h->assert_callra));
467+
if (err)
468+
return err;
469+
err = devlink_fmsg_u32_pair_put(fmsg, "hw_id", ioread32be(&h->hw_id));
470+
if (err)
471+
return err;
472+
err = devlink_fmsg_u8_pair_put(fmsg, "irisc_index",
473+
ioread8(&h->irisc_index));
474+
if (err)
475+
return err;
476+
err = devlink_fmsg_u8_pair_put(fmsg, "synd", ioread8(&h->synd));
477+
if (err)
478+
return err;
479+
err = devlink_fmsg_u32_pair_put(fmsg, "ext_synd",
480+
ioread16be(&h->ext_synd));
481+
if (err)
482+
return err;
483+
err = devlink_fmsg_u32_pair_put(fmsg, "raw_fw_ver",
484+
ioread32be(&h->fw_ver));
485+
if (err)
486+
return err;
487+
err = devlink_fmsg_obj_nest_end(fmsg);
488+
if (err)
489+
return err;
490+
return devlink_fmsg_pair_nest_end(fmsg);
491+
}
492+
493+
static int
494+
mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter,
495+
struct devlink_fmsg *fmsg, void *priv_ctx)
496+
{
497+
struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
498+
int err;
499+
500+
err = mlx5_fw_tracer_trigger_core_dump_general(dev);
501+
if (err)
502+
return err;
503+
504+
if (priv_ctx) {
505+
struct mlx5_fw_reporter_ctx *fw_reporter_ctx = priv_ctx;
506+
507+
err = mlx5_fw_reporter_ctx_pairs_put(fmsg, fw_reporter_ctx);
508+
if (err)
509+
return err;
510+
}
511+
512+
err = mlx5_fw_reporter_heath_buffer_data_put(dev, fmsg);
513+
if (err)
514+
return err;
515+
return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg);
516+
}
517+
408518
static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
409519
.name = "fw",
410520
.diagnose = mlx5_fw_reporter_diagnose,
521+
.dump = mlx5_fw_reporter_dump,
411522
};
412523

413524
static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)

0 commit comments

Comments
 (0)