From 8b612c43d9999365018409b02d963eded13216d5 Mon Sep 17 00:00:00 2001 From: Keepers Date: Thu, 26 Oct 2023 17:05:35 -0600 Subject: [PATCH] make channel message html human readable (#4556) Adds two-step processing to the html previews for channel messages and replies. First, all inline attachments are replaced with the string `[attachment:name]`. Second, remaining html is stripped out, leaving only plaintext. This transformation is applied to both the exported content and the preview content in details. --- #### Does this PR need a docs update or release note? - [x] :no_entry: No #### Type of change - [x] :sunflower: Feature #### Issue(s) * #4546 #### Test Plan - [x] :zap: Unit test - [x] :green_heart: E2E --- src/go.mod | 3 + src/go.sum | 7 + src/internal/m365/collection/groups/export.go | 48 +++++-- src/pkg/backup/details/groups.go | 4 +- src/pkg/services/m365/api/channels.go | 70 ++++++++-- src/pkg/services/m365/api/channels_test.go | 120 ++++++++++++++++++ 6 files changed, 233 insertions(+), 19 deletions(-) diff --git a/src/go.mod b/src/go.mod index bad6ed174f..324dbf9b5d 100644 --- a/src/go.mod +++ b/src/go.mod @@ -13,6 +13,7 @@ require ( github.com/golang-jwt/jwt/v5 v5.0.0 github.com/google/uuid v1.3.1 github.com/h2non/gock v1.2.0 + github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 github.com/kopia/kopia v0.13.0 github.com/microsoft/kiota-abstractions-go v1.3.0 github.com/microsoft/kiota-authentication-azure-go v1.0.1 @@ -55,11 +56,13 @@ require ( github.com/magiconair/properties v1.8.7 // indirect github.com/microsoft/kiota-serialization-multipart-go v1.0.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/olekukonko/tablewriter v0.0.5 // indirect github.com/pelletier/go-toml/v2 v2.1.0 // indirect github.com/sagikazarmark/locafero v0.3.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spf13/afero v1.10.0 // indirect + github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect github.com/std-uritemplate/std-uritemplate/go v0.0.42 // indirect github.com/subosito/gotenv v1.6.0 // indirect github.com/valyala/bytebufferpool v1.0.0 // indirect diff --git a/src/go.sum b/src/go.sum index 24f7902020..1bd2296d16 100644 --- a/src/go.sum +++ b/src/go.sum @@ -235,6 +235,8 @@ github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1: github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 h1:iCHtR9CQyktQ5+f3dMVZfwD2KWJUgm7M0gdL9NGr8KA= +github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= @@ -280,6 +282,7 @@ github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Ky github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U= github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= @@ -326,6 +329,8 @@ github.com/natefinch/atomic v1.0.1 h1:ZPYKxkqQOx3KZ+RsbnP/YsgvxWQPGxjC0oBt2AhwV0 github.com/natefinch/atomic v1.0.1/go.mod h1:N/D/ELrljoqDyT3rZrsUmtsuzvHkeB/wWjHV22AZRbM= github.com/nbio/st v0.0.0-20140626010706-e9e8d9816f32 h1:W6apQkHrMkS0Muv8G/TipAy/FJl/rCYT0+EuS8+Z0z4= github.com/nbio/st v0.0.0-20140626010706-e9e8d9816f32/go.mod h1:9wM+0iRr9ahx58uYLpLIr5fm8diHn0JbqRycJi6w0Ms= +github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= +github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pelletier/go-toml/v2 v2.1.0 h1:FnwAJ4oYMvbT/34k9zzHuZNrhlz48GB3/s6at6/MHO4= @@ -399,6 +404,8 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.17.0 h1:I5txKw7MJasPL/BrfkbA0Jyo/oELqVmux4pR/UxOMfI= github.com/spf13/viper v1.17.0/go.mod h1:BmMMMLQXSbcHK6KAOiFLz0l5JHrU89OdIRHvsk0+yVI= +github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo= +github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM= github.com/std-uritemplate/std-uritemplate/go v0.0.42 h1:rG+XlE4drkVWs2NLfGS15N+vg+CUcjXElQKvJ0fctlI= github.com/std-uritemplate/std-uritemplate/go v0.0.42/go.mod h1:Qov4Ay4U83j37XjgxMYevGJFLbnZ2o9cEOhGufBKgKY= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= diff --git a/src/internal/m365/collection/groups/export.go b/src/internal/m365/collection/groups/export.go index 4418fc9d55..f2b0b6d760 100644 --- a/src/internal/m365/collection/groups/export.go +++ b/src/internal/m365/collection/groups/export.go @@ -90,20 +90,23 @@ func streamItems( type ( minimumChannelMessage struct { - // TODO(keepers): remove attachmentNames when better formatting - // of attachments within the content body is implemented. - AttachmentNames []string `json:"attachmentNames"` - Content string `json:"content"` - CreatedDateTime time.Time `json:"createdDateTime"` - From string `json:"from"` - LastModifiedDateTime time.Time `json:"lastModifiedDateTime"` - Subject string `json:"subject"` + Attachments []minimumAttachment `json:"attachments"` + Content string `json:"content"` + CreatedDateTime time.Time `json:"createdDateTime"` + From string `json:"from"` + LastModifiedDateTime time.Time `json:"lastModifiedDateTime"` + Subject string `json:"subject"` } minimumChannelMessageAndReplies struct { minimumChannelMessage Replies []minimumChannelMessage `json:"replies,omitempty"` } + + minimumAttachment struct { + ID string `json:"id"` + Name string `json:"name"` + } ) func formatChannelMessage( @@ -143,7 +146,7 @@ func formatChannelMessage( mcmar.Replies = append(mcmar.Replies, makeMinimumChannelMesasge(r)) } - bs, err = json.Marshal(mcmar) + bs, err = marshalJSONContainingHTML(mcmar) if err != nil { return nil, clues.Wrap(err, "serializing minimized channel message") } @@ -151,6 +154,21 @@ func formatChannelMessage( return io.NopCloser(bytes.NewReader(bs)), nil } +// json.Marshal will replace many markup tags (ex: "<" and ">") with their unicode +// equivalent. In order to maintain parity with original content that contains html, +// we have to use this alternative encoding behavior. +// https://stackoverflow.com/questions/28595664/how-to-stop-json-marshal-from-escaping-and +func marshalJSONContainingHTML(a any) ([]byte, error) { + buffer := &bytes.Buffer{} + + encoder := json.NewEncoder(buffer) + encoder.SetEscapeHTML(false) + + err := encoder.Encode(a) + + return buffer.Bytes(), clues.Stack(err).OrNil() +} + func makeMinimumChannelMesasge(item models.ChatMessageable) minimumChannelMessage { var content string @@ -158,8 +176,18 @@ func makeMinimumChannelMesasge(item models.ChatMessageable) minimumChannelMessag content = ptr.Val(item.GetBody().GetContent()) } + attachments := item.GetAttachments() + minAttachments := make([]minimumAttachment, 0, len(attachments)) + + for _, a := range attachments { + minAttachments = append(minAttachments, minimumAttachment{ + ID: ptr.Val(a.GetId()), + Name: ptr.Val(a.GetName()), + }) + } + return minimumChannelMessage{ - AttachmentNames: api.GetChatMessageAttachmentNames(item), + Attachments: minAttachments, Content: content, CreatedDateTime: ptr.Val(item.GetCreatedDateTime()), From: api.GetChatMessageFrom(item), diff --git a/src/pkg/backup/details/groups.go b/src/pkg/backup/details/groups.go index 8d7867e9ad..b6e5011fea 100644 --- a/src/pkg/backup/details/groups.go +++ b/src/pkg/backup/details/groups.go @@ -2,6 +2,7 @@ package details import ( "strconv" + "strings" "time" "github.com/alcionai/clues" @@ -100,7 +101,8 @@ func (i GroupsInfo) Values() []string { } return []string{ - i.Message.Preview, + // html parsing may produce newlijnes, which we'll want to avoid + strings.ReplaceAll(i.Message.Preview, "\n", "\\n"), i.ParentPath, i.Message.Subject, strconv.Itoa(i.Message.ReplyCount), diff --git a/src/pkg/services/m365/api/channels.go b/src/pkg/services/m365/api/channels.go index c2ab8f8bb3..d7190dfddc 100644 --- a/src/pkg/services/m365/api/channels.go +++ b/src/pkg/services/m365/api/channels.go @@ -3,9 +3,11 @@ package api import ( "context" "fmt" + "regexp" "time" "github.com/alcionai/clues" + "github.com/jaytaylor/html2text" "github.com/microsoftgraph/msgraph-sdk-go/models" "github.com/microsoftgraph/msgraph-sdk-go/teams" @@ -93,7 +95,7 @@ func (c Channels) GetChannelByName( // Sanity check ID and name cal := gv[0] - if err := CheckIDAndName(cal); err != nil { + if err := checkIDAndName(cal); err != nil { return nil, clues.Stack(err).WithClues(ctx) } @@ -163,7 +165,10 @@ func channelMessageInfo( modTime = lastReplyAt } - preview, contentLen := GetChatMessageContentPreview(msg) + preview, contentLen, err := getChatMessageContentPreview(msg) + if err != nil { + preview = "malformed or unparseable html" + preview + } message := details.ChannelMessageInfo{ AttachmentNames: GetChatMessageAttachmentNames(msg), @@ -178,7 +183,11 @@ func channelMessageInfo( var lr details.ChannelMessageInfo if lastReply != nil { - preview, contentLen = GetChatMessageContentPreview(lastReply) + preview, contentLen, err = getChatMessageContentPreview(lastReply) + if err != nil { + preview = "malformed or unparseable html: " + preview + } + lr = details.ChannelMessageInfo{ AttachmentNames: GetChatMessageAttachmentNames(lastReply), CreatedAt: ptr.Val(lastReply.GetCreatedDateTime()), @@ -196,9 +205,9 @@ func channelMessageInfo( } } -// CheckIDAndName is a validator that ensures the ID +// checkIDAndName is a validator that ensures the ID // and name are populated and not zero valued. -func CheckIDAndName(c models.Channelable) error { +func checkIDAndName(c models.Channelable) error { if c == nil { return clues.New("nil container") } @@ -233,14 +242,59 @@ func GetChatMessageFrom(msg models.ChatMessageable) string { return "" } -func GetChatMessageContentPreview(msg models.ChatMessageable) (string, int64) { - var content string +func getChatMessageContentPreview(msg models.ChatMessageable) (string, int64, error) { + content, origSize, err := stripChatMessageHTML(msg) + return str.Preview(content, 128), origSize, clues.Stack(err).OrNil() +} + +func stripChatMessageHTML(msg models.ChatMessageable) (string, int64, error) { + var ( + content string + origSize int64 + ) if msg.GetBody() != nil { content = ptr.Val(msg.GetBody().GetContent()) } - return str.Preview(content, 128), int64(len(content)) + origSize = int64(len(content)) + + content = replaceAttachmentMarkup(content, msg.GetAttachments()) + content, err := html2text.FromString(content) + + return content, origSize, clues.Stack(err).OrNil() +} + +var attachmentMarkupRE = regexp.MustCompile(``) + +// replaces any instance of `` with `[attachment:{{name-of-attachment}}]` +// assumes that the attachment ID exists in the attachments slice, otherwise defaults to `[attachment]`. +func replaceAttachmentMarkup( + content string, + attachments []models.ChatMessageAttachmentable, +) string { + attMap := map[string]string{} + + for _, att := range attachments { + attMap[ptr.Val(att.GetId())] = ptr.Val(att.GetName()) + } + + replacer := func(sub string) string { + sm := attachmentMarkupRE.FindStringSubmatch(sub) + + if len(sm) > 1 { + name, ok := attMap[sm[1]] + if !ok { + return "[attachment]" + } + + return fmt.Sprintf("[attachment:%s]", name) + } + + return "[attachment]" + } + + return attachmentMarkupRE.ReplaceAllStringFunc(content, replacer) } func GetChatMessageAttachmentNames(msg models.ChatMessageable) []string { diff --git a/src/pkg/services/m365/api/channels_test.go b/src/pkg/services/m365/api/channels_test.go index 9a46c052d0..7ac0f13c8e 100644 --- a/src/pkg/services/m365/api/channels_test.go +++ b/src/pkg/services/m365/api/channels_test.go @@ -1,9 +1,11 @@ package api import ( + "fmt" "testing" "time" + "github.com/alcionai/clues" "github.com/microsoftgraph/msgraph-sdk-go/models" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/suite" @@ -565,3 +567,121 @@ func (suite *ChannelsAPIUnitSuite) TestChannelMessageInfo() { }) } } + +func (suite *ChannelsAPIUnitSuite) TestStripChatMessageContent() { + attach1 := models.NewChatMessageAttachment() + attach1.SetId(ptr.To("id1")) + attach1.SetName(ptr.To("a1")) + + attach2 := models.NewChatMessageAttachment() + attach2.SetId(ptr.To("id2")) + attach2.SetName(ptr.To("a2")) + + attachments := []models.ChatMessageAttachmentable{attach1, attach2} + + attachML := func(id string) string { + return fmt.Sprintf(``, id) + } + + tests := []struct { + name string + content string + attachments []models.ChatMessageAttachmentable + expect string + expectErr assert.ErrorAssertionFunc + }{ + { + name: "empty content", + content: "", + attachments: attachments, + expect: "", + expectErr: assert.NoError, + }, + { + name: "only attachment", + content: attachML("id1"), + attachments: attachments, + expect: "[attachment:a1]", + expectErr: assert.NoError, + }, + { + name: "unknown attachment", + content: attachML("idX"), + attachments: attachments, + expect: "[attachment]", + expectErr: assert.NoError, + }, + { + name: "text and attachment", + content: "some text" + attachML("id1") + "other text", + attachments: attachments, + expect: "some text[attachment:a1]other text", + expectErr: assert.NoError, + }, + { + name: "multiple attachments", + content: attachML("id1") + attachML("id2"), + attachments: attachments, + expect: "[attachment:a1][attachment:a2]", + expectErr: assert.NoError, + }, + { + name: "multiple attachments with unidentified", + content: attachML("id1") + attachML("id2") + attachML("idX"), + attachments: attachments, + expect: "[attachment:a1][attachment:a2][attachment]", + expectErr: assert.NoError, + }, + { + name: "with empty html", + content: "
", + attachments: attachments, + expect: "", + expectErr: assert.NoError, + }, + { + name: "with malformed div", + content: "body
end", + attachments: attachments, + expect: "body\nend", + expectErr: assert.NoError, + }, + { + name: "with malformed html 2", + content: "body

inner

end", + attachments: attachments, + expect: "body\n\ninnerend", + expectErr: assert.NoError, + }, + { + name: "with html", + content: "body
in the div
end", + attachments: attachments, + expect: "body\nin the div\nend", + expectErr: assert.NoError, + }, + { + name: "with html and attachments", + content: "body
" + attachML("id1") + attachML("id2") + attachML("idX") + "
end", + attachments: attachments, + expect: "body\n[attachment:a1][attachment:a2][attachment]\nend", + expectErr: assert.NoError, + }, + } + for _, test := range tests { + suite.Run(test.name, func() { + t := suite.T() + + msg := models.NewChatMessage() + body := models.NewItemBody() + body.SetContent(ptr.To(test.content)) + msg.SetBody(body) + msg.SetAttachments(test.attachments) + + // not testing len; it's effectively covered by the content assertion + result, _, err := stripChatMessageHTML(msg) + assert.Equal(t, test.expect, result) + test.expectErr(t, err, clues.ToCore(err)) + }) + } +}