Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Permalink
make channel message html human readable (#4556)
Browse files Browse the repository at this point in the history
Adds two-step processing to the html previews for channel messages and replies.  First, all inline attachments are replaced with the string `[attachment:name]`.
Second, remaining html is stripped out, leaving only plaintext.

This transformation is applied to both the exported content and the preview content in details.

---

#### Does this PR need a docs update or release note?

- [x] ⛔ No

#### Type of change

- [x] 🌻 Feature

#### Issue(s)

* #4546

#### Test Plan

- [x] ⚡ Unit test
- [x] 💚 E2E
  • Loading branch information
ryanfkeepers authored Oct 26, 2023
1 parent ba595a6 commit 8b612c4
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 19 deletions.
3 changes: 3 additions & 0 deletions src/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ require (
github.com/golang-jwt/jwt/v5 v5.0.0
github.com/google/uuid v1.3.1
github.com/h2non/gock v1.2.0
github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056
github.com/kopia/kopia v0.13.0
github.com/microsoft/kiota-abstractions-go v1.3.0
github.com/microsoft/kiota-authentication-azure-go v1.0.1
Expand Down Expand Up @@ -55,11 +56,13 @@ require (
github.com/magiconair/properties v1.8.7 // indirect
github.com/microsoft/kiota-serialization-multipart-go v1.0.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/olekukonko/tablewriter v0.0.5 // indirect
github.com/pelletier/go-toml/v2 v2.1.0 // indirect
github.com/sagikazarmark/locafero v0.3.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
github.com/spf13/afero v1.10.0 // indirect
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect
github.com/std-uritemplate/std-uritemplate/go v0.0.42 // indirect
github.com/subosito/gotenv v1.6.0 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
Expand Down
7 changes: 7 additions & 0 deletions src/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@ github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:
github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 h1:iCHtR9CQyktQ5+f3dMVZfwD2KWJUgm7M0gdL9NGr8KA=
github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk=
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
Expand Down Expand Up @@ -280,6 +282,7 @@ github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Ky
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
Expand Down Expand Up @@ -326,6 +329,8 @@ github.com/natefinch/atomic v1.0.1 h1:ZPYKxkqQOx3KZ+RsbnP/YsgvxWQPGxjC0oBt2AhwV0
github.com/natefinch/atomic v1.0.1/go.mod h1:N/D/ELrljoqDyT3rZrsUmtsuzvHkeB/wWjHV22AZRbM=
github.com/nbio/st v0.0.0-20140626010706-e9e8d9816f32 h1:W6apQkHrMkS0Muv8G/TipAy/FJl/rCYT0+EuS8+Z0z4=
github.com/nbio/st v0.0.0-20140626010706-e9e8d9816f32/go.mod h1:9wM+0iRr9ahx58uYLpLIr5fm8diHn0JbqRycJi6w0Ms=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY=
github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
github.com/pelletier/go-toml/v2 v2.1.0 h1:FnwAJ4oYMvbT/34k9zzHuZNrhlz48GB3/s6at6/MHO4=
Expand Down Expand Up @@ -399,6 +404,8 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/viper v1.17.0 h1:I5txKw7MJasPL/BrfkbA0Jyo/oELqVmux4pR/UxOMfI=
github.com/spf13/viper v1.17.0/go.mod h1:BmMMMLQXSbcHK6KAOiFLz0l5JHrU89OdIRHvsk0+yVI=
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo=
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM=
github.com/std-uritemplate/std-uritemplate/go v0.0.42 h1:rG+XlE4drkVWs2NLfGS15N+vg+CUcjXElQKvJ0fctlI=
github.com/std-uritemplate/std-uritemplate/go v0.0.42/go.mod h1:Qov4Ay4U83j37XjgxMYevGJFLbnZ2o9cEOhGufBKgKY=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
Expand Down
48 changes: 38 additions & 10 deletions src/internal/m365/collection/groups/export.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,20 +90,23 @@ func streamItems(

type (
// minimumChannelMessage is the reduced view of a channel message (or reply)
// that gets serialized to json; only the fields declared here are emitted.
minimumChannelMessage struct {
// TODO(keepers): remove attachmentNames when better formatting
// of attachments within the content body is implemented.
AttachmentNames []string `json:"attachmentNames"`
Content string `json:"content"`
CreatedDateTime time.Time `json:"createdDateTime"`
From string `json:"from"`
LastModifiedDateTime time.Time `json:"lastModifiedDateTime"`
Subject string `json:"subject"`
// Attachments lists the id and name of each attachment on the message.
Attachments []minimumAttachment `json:"attachments"`
Content string `json:"content"`
CreatedDateTime time.Time `json:"createdDateTime"`
From string `json:"from"`
LastModifiedDateTime time.Time `json:"lastModifiedDateTime"`
Subject string `json:"subject"`
}

// minimumChannelMessageAndReplies embeds a top-level message and carries
// its replies. The "replies" key is omitted from the json when empty.
minimumChannelMessageAndReplies struct {
minimumChannelMessage
Replies []minimumChannelMessage `json:"replies,omitempty"`
}

// minimumAttachment identifies a single attachment by its id and name.
minimumAttachment struct {
ID string `json:"id"`
Name string `json:"name"`
}
)

func formatChannelMessage(
Expand Down Expand Up @@ -143,23 +146,48 @@ func formatChannelMessage(
mcmar.Replies = append(mcmar.Replies, makeMinimumChannelMesasge(r))
}

bs, err = json.Marshal(mcmar)
bs, err = marshalJSONContainingHTML(mcmar)
if err != nil {
return nil, clues.Wrap(err, "serializing minimized channel message")
}

return io.NopCloser(bytes.NewReader(bs)), nil
}

// marshalJSONContainingHTML serializes a to json without html-escaping.
//
// json.Marshal rewrites html-significant characters (ex: "<", ">", "&") as
// \uXXXX escape sequences. To keep serialized content that contains html
// identical to the original, the value is run through a json.Encoder with
// html escaping switched off instead.
// https://stackoverflow.com/questions/28595664/how-to-stop-json-marshal-from-escaping-and
//
// Note that, unlike json.Marshal, Encoder.Encode terminates its output with
// a newline character, so the returned bytes end in "\n".
func marshalJSONContainingHTML(a any) ([]byte, error) {
	var out bytes.Buffer

	enc := json.NewEncoder(&out)
	enc.SetEscapeHTML(false)

	encodeErr := enc.Encode(a)

	return out.Bytes(), clues.Stack(encodeErr).OrNil()
}

func makeMinimumChannelMesasge(item models.ChatMessageable) minimumChannelMessage {
var content string

if item.GetBody() != nil {
content = ptr.Val(item.GetBody().GetContent())
}

attachments := item.GetAttachments()
minAttachments := make([]minimumAttachment, 0, len(attachments))

for _, a := range attachments {
minAttachments = append(minAttachments, minimumAttachment{
ID: ptr.Val(a.GetId()),
Name: ptr.Val(a.GetName()),
})
}

return minimumChannelMessage{
AttachmentNames: api.GetChatMessageAttachmentNames(item),
Attachments: minAttachments,
Content: content,
CreatedDateTime: ptr.Val(item.GetCreatedDateTime()),
From: api.GetChatMessageFrom(item),
Expand Down
4 changes: 3 additions & 1 deletion src/pkg/backup/details/groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package details

import (
"strconv"
"strings"
"time"

"github.com/alcionai/clues"
Expand Down Expand Up @@ -100,7 +101,8 @@ func (i GroupsInfo) Values() []string {
}

return []string{
i.Message.Preview,
// html parsing may produce newlines, which we escape to keep the preview on a single line
strings.ReplaceAll(i.Message.Preview, "\n", "\\n"),
i.ParentPath,
i.Message.Subject,
strconv.Itoa(i.Message.ReplyCount),
Expand Down
70 changes: 62 additions & 8 deletions src/pkg/services/m365/api/channels.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ package api
import (
"context"
"fmt"
"regexp"
"time"

"github.com/alcionai/clues"
"github.com/jaytaylor/html2text"
"github.com/microsoftgraph/msgraph-sdk-go/models"
"github.com/microsoftgraph/msgraph-sdk-go/teams"

Expand Down Expand Up @@ -93,7 +95,7 @@ func (c Channels) GetChannelByName(
// Sanity check ID and name
cal := gv[0]

if err := CheckIDAndName(cal); err != nil {
if err := checkIDAndName(cal); err != nil {
return nil, clues.Stack(err).WithClues(ctx)
}

Expand Down Expand Up @@ -163,7 +165,10 @@ func channelMessageInfo(
modTime = lastReplyAt
}

preview, contentLen := GetChatMessageContentPreview(msg)
preview, contentLen, err := getChatMessageContentPreview(msg)
if err != nil {
preview = "malformed or unparseable html" + preview
}

message := details.ChannelMessageInfo{
AttachmentNames: GetChatMessageAttachmentNames(msg),
Expand All @@ -178,7 +183,11 @@ func channelMessageInfo(
var lr details.ChannelMessageInfo

if lastReply != nil {
preview, contentLen = GetChatMessageContentPreview(lastReply)
preview, contentLen, err = getChatMessageContentPreview(lastReply)
if err != nil {
preview = "malformed or unparseable html: " + preview
}

lr = details.ChannelMessageInfo{
AttachmentNames: GetChatMessageAttachmentNames(lastReply),
CreatedAt: ptr.Val(lastReply.GetCreatedDateTime()),
Expand All @@ -196,9 +205,9 @@ func channelMessageInfo(
}
}

// CheckIDAndName is a validator that ensures the ID
// checkIDAndName is a validator that ensures the ID
// and name are populated and not zero valued.
func CheckIDAndName(c models.Channelable) error {
func checkIDAndName(c models.Channelable) error {
if c == nil {
return clues.New("nil container")
}
Expand Down Expand Up @@ -233,14 +242,59 @@ func GetChatMessageFrom(msg models.ChatMessageable) string {
return ""
}

func GetChatMessageContentPreview(msg models.ChatMessageable) (string, int64) {
var content string
// getChatMessageContentPreview builds a shortened preview of the message body.
// The body is first run through stripChatMessageHTML, then handed to
// str.Preview with a limit argument of 128. The returned int64 is the size
// reported by stripChatMessageHTML for the original content. The preview is
// returned even when the html stripping produced an error, so callers can
// decide how to present it.
func getChatMessageContentPreview(msg models.ChatMessageable) (string, int64, error) {
	stripped, size, stripErr := stripChatMessageHTML(msg)
	preview := str.Preview(stripped, 128)

	return preview, size, clues.Stack(stripErr).OrNil()
}

// stripChatMessageHTML converts the message body into plain text.
// Inline attachment markup is first swapped for a readable placeholder by
// replaceAttachmentMarkup, then the result is passed to html2text.FromString.
// Returns the transformed content, the byte length of the original body
// (measured before any transformation), and any error from the html
// conversion. A message with a nil body yields empty content and a size of 0.
func stripChatMessageHTML(msg models.ChatMessageable) (string, int64, error) {
var (
content string
origSize int64
)

if msg.GetBody() != nil {
content = ptr.Val(msg.GetBody().GetContent())
}

return str.Preview(content, 128), int64(len(content))
// record the size before the content gets rewritten below.
origSize = int64(len(content))

// attachments must be replaced before html stripping, while their tags still exist.
content = replaceAttachmentMarkup(content, msg.GetAttachments())
content, err := html2text.FromString(content)

return content, origSize, clues.Stack(err).OrNil()
}

// attachmentMarkupRE matches an inline attachment tag and captures its id.
// The quotes around the id may optionally be preceded by a backslash.
var attachmentMarkupRE = regexp.MustCompile(`<attachment id=[\\]?"([\d\w-]+)[\\]?"></attachment>`)

// replaceAttachmentMarkup swaps every inline attachment tag in content, such as
// `<attachment id=\"1693946862569\"></attachment>`, for a readable placeholder
// of the form `[attachment:{{name-of-attachment}}]`. The name is looked up by
// the tag's id within the attachments slice; when the id is not present there,
// the placeholder falls back to a bare `[attachment]`.
func replaceAttachmentMarkup(
	content string,
	attachments []models.ChatMessageAttachmentable,
) string {
	const fallback = "[attachment]"

	// index attachment names by id so each tag resolves with a single lookup.
	namesByID := make(map[string]string, len(attachments))

	for _, attachment := range attachments {
		namesByID[ptr.Val(attachment.GetId())] = ptr.Val(attachment.GetName())
	}

	return attachmentMarkupRE.ReplaceAllStringFunc(content, func(match string) string {
		// ReplaceAllStringFunc only supplies the full match, so the
		// expression is re-applied here to extract the captured id.
		groups := attachmentMarkupRE.FindStringSubmatch(match)
		if len(groups) < 2 {
			return fallback
		}

		if name, found := namesByID[groups[1]]; found {
			return fmt.Sprintf("[attachment:%s]", name)
		}

		return fallback
	})
}

func GetChatMessageAttachmentNames(msg models.ChatMessageable) []string {
Expand Down
Loading

0 comments on commit 8b612c4

Please sign in to comment.