diff --git a/seamless_communication/.git_/FETCH_HEAD b/seamless_communication/.git_/FETCH_HEAD
new file mode 100644
index 0000000..330edc7
--- /dev/null
+++ b/seamless_communication/.git_/FETCH_HEAD
@@ -0,0 +1,17 @@
+c634c446d1cf134b4c063ec8a589f8b257a93ac9 branch 'main' of https://github.com/facebookresearch/seamless_communication
+8814dce9f6085bea8daa440ad2cdde4040510b36 not-for-merge branch 'api_update' of https://github.com/facebookresearch/seamless_communication
+d31095342cea67b83c0649ed4b3a2e3a0d9dd54e not-for-merge branch 'cndn-patch-1' of https://github.com/facebookresearch/seamless_communication
+17c9f811f26587294f6b2121f5df40c7c406029d not-for-merge branch 'cndn-patch-2' of https://github.com/facebookresearch/seamless_communication
+4a231725036fe9d22e294eb6c9be9c683837bc66 not-for-merge branch 'cndn-patch-3' of https://github.com/facebookresearch/seamless_communication
+ceada0caffa8527a87067fcac2a2d503ae788da7 not-for-merge branch 'expressivity_translator' of https://github.com/facebookresearch/seamless_communication
+cee7518bfaa65200df2b0734658044a39d769c2c not-for-merge branch 'fix_no_redef_error' of https://github.com/facebookresearch/seamless_communication
+ac6b874c43b5795cdf7421618a044259b177bb65 not-for-merge branch 'ggml_fp16_model' of https://github.com/facebookresearch/seamless_communication
+1070b609329296bc8e6c25875ad9f87908696bb3 not-for-merge branch 'header_fix' of https://github.com/facebookresearch/seamless_communication
+37858bc085d67aa8bd6cf8bcfcbcb8b7bed3636d not-for-merge branch 'jmp84-patch-1' of https://github.com/facebookresearch/seamless_communication
+cd100954048ed82492281da9bd54e000564ddee2 not-for-merge branch 'mem' of https://github.com/facebookresearch/seamless_communication
+c8f9a8c0ba265120c74ea0c6cff9d027c2223244 not-for-merge branch 'ngram' of https://github.com/facebookresearch/seamless_communication
+7608af8895a747a4caaf36384bc3d0b19c2f775d not-for-merge branch 'no_alloc' of https://github.com/facebookresearch/seamless_communication
+e779fcf8caf16f7afbe5ae35c4edcdfa4edbf157 not-for-merge branch 'path_updates' of https://github.com/facebookresearch/seamless_communication
+79546ff4e6b89ff89e452f4edcb993a85370b56d not-for-merge branch 'ruslan_final_passes_over_finetuning_scripts' of https://github.com/facebookresearch/seamless_communication
+98c13c835dc98ca9cd153a9610052b82e72f8099 not-for-merge branch 'unit_extraction' of https://github.com/facebookresearch/seamless_communication
+0605da6bec492b9f71b6acffd6fc05b2c63219af not-for-merge branch 'vocoder-output-lang' of https://github.com/facebookresearch/seamless_communication
diff --git a/seamless_communication/.git_/HEAD b/seamless_communication/.git_/HEAD
new file mode 100644
index 0000000..b870d82
--- /dev/null
+++ b/seamless_communication/.git_/HEAD
@@ -0,0 +1 @@
+ref: refs/heads/main
diff --git a/seamless_communication/.git_/config b/seamless_communication/.git_/config
new file mode 100644
index 0000000..d7b8581
--- /dev/null
+++ b/seamless_communication/.git_/config
@@ -0,0 +1,11 @@
+[core]
+	repositoryformatversion = 0
+	filemode = true
+	bare = false
+	logallrefupdates = true
+[remote "origin"]
+	url = https://github.com/facebookresearch/seamless_communication.git
+	fetch = +refs/heads/*:refs/remotes/origin/*
+[branch "main"]
+	remote = origin
+	merge = refs/heads/main
diff --git a/seamless_communication/.git_/description b/seamless_communication/.git_/description
new file mode 100644
index 0000000..498b267
--- /dev/null
+++ b/seamless_communication/.git_/description
@@ -0,0 +1 @@
+Unnamed
repository; edit this file 'description' to name the repository. diff --git a/seamless_communication/.git_/hooks/applypatch-msg.sample b/seamless_communication/.git_/hooks/applypatch-msg.sample new file mode 100755 index 0000000..a5d7b84 --- /dev/null +++ b/seamless_communication/.git_/hooks/applypatch-msg.sample @@ -0,0 +1,15 @@ +#!/bin/sh +# +# An example hook script to check the commit log message taken by +# applypatch from an e-mail message. +# +# The hook should exit with non-zero status after issuing an +# appropriate message if it wants to stop the commit. The hook is +# allowed to edit the commit message file. +# +# To enable this hook, rename this file to "applypatch-msg". + +. git-sh-setup +commitmsg="$(git rev-parse --git-path hooks/commit-msg)" +test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"} +: diff --git a/seamless_communication/.git_/hooks/commit-msg.sample b/seamless_communication/.git_/hooks/commit-msg.sample new file mode 100755 index 0000000..b58d118 --- /dev/null +++ b/seamless_communication/.git_/hooks/commit-msg.sample @@ -0,0 +1,24 @@ +#!/bin/sh +# +# An example hook script to check the commit log message. +# Called by "git commit" with one argument, the name of the file +# that has the commit message. The hook should exit with non-zero +# status after issuing an appropriate message if it wants to stop the +# commit. The hook is allowed to edit the commit message file. +# +# To enable this hook, rename this file to "commit-msg". + +# Uncomment the below to add a Signed-off-by line to the message. +# Doing this in a hook is a bad idea in general, but the prepare-commit-msg +# hook is more suited to it. +# +# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') +# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" + +# This example catches duplicate Signed-off-by lines. + +test "" = "$(grep '^Signed-off-by: ' "$1" | + sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || { + echo >&2 Duplicate Signed-off-by lines. + exit 1 +} diff --git a/seamless_communication/.git_/hooks/fsmonitor-watchman.sample b/seamless_communication/.git_/hooks/fsmonitor-watchman.sample new file mode 100755 index 0000000..14ed0aa --- /dev/null +++ b/seamless_communication/.git_/hooks/fsmonitor-watchman.sample @@ -0,0 +1,173 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use IPC::Open2; + +# An example hook script to integrate Watchman +# (https://facebook.github.io/watchman/) with git to speed up detecting +# new and modified files. +# +# The hook is passed a version (currently 2) and last update token +# formatted as a string and outputs to stdout a new update token and +# all files that have been modified since the update token. Paths must +# be relative to the root of the working tree and separated by a single NUL. +# +# To enable this hook, rename this file to "query-watchman" and set +# 'git config core.fsmonitor .git/hooks/query-watchman' +# +my ($version, $last_update_token) = @ARGV; + +# Uncomment for debugging +# print STDERR "$0 $version $last_update_token\n"; + +# Check the hook interface version +if ($version ne 2) { + die "Unsupported query-fsmonitor hook version '$version'.\n" . 
+ "Falling back to scanning...\n"; +} + +my $git_work_tree = get_working_dir(); + +my $retry = 1; + +my $json_pkg; +eval { + require JSON::XS; + $json_pkg = "JSON::XS"; + 1; +} or do { + require JSON::PP; + $json_pkg = "JSON::PP"; +}; + +launch_watchman(); + +sub launch_watchman { + my $o = watchman_query(); + if (is_work_tree_watched($o)) { + output_result($o->{clock}, @{$o->{files}}); + } +} + +sub output_result { + my ($clockid, @files) = @_; + + # Uncomment for debugging watchman output + # open (my $fh, ">", ".git/watchman-output.out"); + # binmode $fh, ":utf8"; + # print $fh "$clockid\n@files\n"; + # close $fh; + + binmode STDOUT, ":utf8"; + print $clockid; + print "\0"; + local $, = "\0"; + print @files; +} + +sub watchman_clock { + my $response = qx/watchman clock "$git_work_tree"/; + die "Failed to get clock id on '$git_work_tree'.\n" . + "Falling back to scanning...\n" if $? != 0; + + return $json_pkg->new->utf8->decode($response); +} + +sub watchman_query { + my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty') + or die "open2() failed: $!\n" . + "Falling back to scanning...\n"; + + # In the query expression below we're asking for names of files that + # changed since $last_update_token but not from the .git folder. + # + # To accomplish this, we're using the "since" generator to use the + # recency index to select candidate nodes and "fields" to limit the + # output to file names only. Then we're using the "expression" term to + # further constrain the results. + if (substr($last_update_token, 0, 1) eq "c") { + $last_update_token = "\"$last_update_token\""; + } + my $query = <<" END"; + ["query", "$git_work_tree", { + "since": $last_update_token, + "fields": ["name"], + "expression": ["not", ["dirname", ".git"]] + }] + END + + # Uncomment for debugging the watchman query + # open (my $fh, ">", ".git/watchman-query.json"); + # print $fh $query; + # close $fh; + + print CHLD_IN $query; + close CHLD_IN; + my $response = do {local $/; }; + + # Uncomment for debugging the watch response + # open ($fh, ">", ".git/watchman-response.json"); + # print $fh $response; + # close $fh; + + die "Watchman: command returned no output.\n" . + "Falling back to scanning...\n" if $response eq ""; + die "Watchman: command returned invalid output: $response\n" . + "Falling back to scanning...\n" unless $response =~ /^\{/; + + return $json_pkg->new->utf8->decode($response); +} + +sub is_work_tree_watched { + my ($output) = @_; + my $error = $output->{error}; + if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) { + $retry--; + my $response = qx/watchman watch "$git_work_tree"/; + die "Failed to make watchman watch '$git_work_tree'.\n" . + "Falling back to scanning...\n" if $? != 0; + $output = $json_pkg->new->utf8->decode($response); + $error = $output->{error}; + die "Watchman: $error.\n" . + "Falling back to scanning...\n" if $error; + + # Uncomment for debugging watchman output + # open (my $fh, ">", ".git/watchman-output.out"); + # close $fh; + + # Watchman will always return all files on the first query so + # return the fast "everything is dirty" flag to git and do the + # Watchman query just to get it over with now so we won't pay + # the cost in git to look up each individual file. + my $o = watchman_clock(); + $error = $output->{error}; + + die "Watchman: $error.\n" . 
+ "Falling back to scanning...\n" if $error; + + output_result($o->{clock}, ("/")); + $last_update_token = $o->{clock}; + + eval { launch_watchman() }; + return 0; + } + + die "Watchman: $error.\n" . + "Falling back to scanning...\n" if $error; + + return 1; +} + +sub get_working_dir { + my $working_dir; + if ($^O =~ 'msys' || $^O =~ 'cygwin') { + $working_dir = Win32::GetCwd(); + $working_dir =~ tr/\\/\//; + } else { + require Cwd; + $working_dir = Cwd::cwd(); + } + + return $working_dir; +} diff --git a/seamless_communication/.git_/hooks/post-update.sample b/seamless_communication/.git_/hooks/post-update.sample new file mode 100755 index 0000000..ec17ec1 --- /dev/null +++ b/seamless_communication/.git_/hooks/post-update.sample @@ -0,0 +1,8 @@ +#!/bin/sh +# +# An example hook script to prepare a packed repository for use over +# dumb transports. +# +# To enable this hook, rename this file to "post-update". + +exec git update-server-info diff --git a/seamless_communication/.git_/hooks/pre-applypatch.sample b/seamless_communication/.git_/hooks/pre-applypatch.sample new file mode 100755 index 0000000..4142082 --- /dev/null +++ b/seamless_communication/.git_/hooks/pre-applypatch.sample @@ -0,0 +1,14 @@ +#!/bin/sh +# +# An example hook script to verify what is about to be committed +# by applypatch from an e-mail message. +# +# The hook should exit with non-zero status after issuing an +# appropriate message if it wants to stop the commit. +# +# To enable this hook, rename this file to "pre-applypatch". + +. git-sh-setup +precommit="$(git rev-parse --git-path hooks/pre-commit)" +test -x "$precommit" && exec "$precommit" ${1+"$@"} +: diff --git a/seamless_communication/.git_/hooks/pre-commit.sample b/seamless_communication/.git_/hooks/pre-commit.sample new file mode 100755 index 0000000..e144712 --- /dev/null +++ b/seamless_communication/.git_/hooks/pre-commit.sample @@ -0,0 +1,49 @@ +#!/bin/sh +# +# An example hook script to verify what is about to be committed. +# Called by "git commit" with no arguments. The hook should +# exit with non-zero status after issuing an appropriate message if +# it wants to stop the commit. +# +# To enable this hook, rename this file to "pre-commit". + +if git rev-parse --verify HEAD >/dev/null 2>&1 +then + against=HEAD +else + # Initial commit: diff against an empty tree object + against=$(git hash-object -t tree /dev/null) +fi + +# If you want to allow non-ASCII filenames set this variable to true. +allownonascii=$(git config --type=bool hooks.allownonascii) + +# Redirect output to stderr. +exec 1>&2 + +# Cross platform projects tend to avoid non-ASCII filenames; prevent +# them from being added to the repository. We exploit the fact that the +# printable range starts at the space character and ends with tilde. +if [ "$allownonascii" != "true" ] && + # Note that the use of brackets around a tr range is ok here, (it's + # even required, for portability to Solaris 10's /usr/bin/tr), since + # the square bracket bytes happen to fall in the designated range. + test $(git diff --cached --name-only --diff-filter=A -z $against | + LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0 +then + cat <<\EOF +Error: Attempt to add a non-ASCII file name. + +This can cause problems if you want to work with people on other platforms. + +To be portable it is advisable to rename the file. + +If you know what you are doing you can disable this check using: + + git config hooks.allownonascii true +EOF + exit 1 +fi + +# If there are whitespace errors, print the offending file names and fail. 
+exec git diff-index --check --cached $against --
diff --git a/seamless_communication/.git_/hooks/pre-merge-commit.sample b/seamless_communication/.git_/hooks/pre-merge-commit.sample
new file mode 100755
index 0000000..399eab1
--- /dev/null
+++ b/seamless_communication/.git_/hooks/pre-merge-commit.sample
@@ -0,0 +1,13 @@
+#!/bin/sh
+#
+# An example hook script to verify what is about to be committed.
+# Called by "git merge" with no arguments. The hook should
+# exit with non-zero status after issuing an appropriate message to
+# stderr if it wants to stop the merge commit.
+#
+# To enable this hook, rename this file to "pre-merge-commit".
+
+. git-sh-setup
+test -x "$GIT_DIR/hooks/pre-commit" &&
+	exec "$GIT_DIR/hooks/pre-commit"
+:
diff --git a/seamless_communication/.git_/hooks/pre-push.sample b/seamless_communication/.git_/hooks/pre-push.sample
new file mode 100755
index 0000000..4ce688d
--- /dev/null
+++ b/seamless_communication/.git_/hooks/pre-push.sample
@@ -0,0 +1,53 @@
+#!/bin/sh
+
+# An example hook script to verify what is about to be pushed. Called by "git
+# push" after it has checked the remote status, but before anything has been
+# pushed. If this script exits with a non-zero status nothing will be pushed.
+#
+# This hook is called with the following parameters:
+#
+# $1 -- Name of the remote to which the push is being done
+# $2 -- URL to which the push is being done
+#
+# If pushing without using a named remote those arguments will be equal.
+#
+# Information about the commits which are being pushed is supplied as lines to
+# the standard input in the form:
+#
+#   <local ref> <local oid> <remote ref> <remote oid>
+#
+# This sample shows how to prevent push of commits where the log message starts
+# with "WIP" (work in progress).
+
+remote="$1"
+url="$2"
+
+zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
+
+while read local_ref local_oid remote_ref remote_oid
+do
+	if test "$local_oid" = "$zero"
+	then
+		# Handle delete
+		:
+	else
+		if test "$remote_oid" = "$zero"
+		then
+			# New branch, examine all commits
+			range="$local_oid"
+		else
+			# Update to existing branch, examine new commits
+			range="$remote_oid..$local_oid"
+		fi
+
+		# Check for WIP commit
+		commit=$(git rev-list -n 1 --grep '^WIP' "$range")
+		if test -n "$commit"
+		then
+			echo >&2 "Found WIP commit in $local_ref, not pushing"
+			exit 1
+		fi
+	fi
+done
+
+exit 0
diff --git a/seamless_communication/.git_/hooks/pre-rebase.sample b/seamless_communication/.git_/hooks/pre-rebase.sample
new file mode 100755
index 0000000..6cbef5c
--- /dev/null
+++ b/seamless_communication/.git_/hooks/pre-rebase.sample
@@ -0,0 +1,169 @@
+#!/bin/sh
+#
+# Copyright (c) 2006, 2008 Junio C Hamano
+#
+# The "pre-rebase" hook is run just before "git rebase" starts doing
+# its job, and can prevent the command from running by exiting with
+# non-zero status.
+#
+# The hook is called with the following parameters:
+#
+# $1 -- the upstream the series was forked from.
+# $2 -- the branch being rebased (or empty when rebasing the current branch).
+#
+# This sample shows how to prevent topic branches that are already
+# merged to 'next' branch from getting rebased, because allowing it
+# would result in rebasing already published history.
+
+publish=next
+basebranch="$1"
+if test "$#" = 2
+then
+	topic="refs/heads/$2"
+else
+	topic=`git symbolic-ref HEAD` ||
+	exit 0 ;# we do not interrupt rebasing detached HEAD
+fi
+
+case "$topic" in
+refs/heads/??/*)
+	;;
+*)
+	exit 0 ;# we do not interrupt others.
+	;;
+esac
+
+# Now we are dealing with a topic branch being rebased
+# on top of master. Is it OK to rebase it?
+
+# Does the topic really exist?
+git show-ref -q "$topic" || {
+	echo >&2 "No such branch $topic"
+	exit 1
+}
+
+# Is topic fully merged to master?
+not_in_master=`git rev-list --pretty=oneline ^master "$topic"`
+if test -z "$not_in_master"
+then
+	echo >&2 "$topic is fully merged to master; better remove it."
+	exit 1 ;# we could allow it, but there is no point.
+fi
+
+# Is topic ever merged to next? If so you should not be rebasing it.
+only_next_1=`git rev-list ^master "^$topic" ${publish} | sort` +only_next_2=`git rev-list ^master ${publish} | sort` +if test "$only_next_1" = "$only_next_2" +then + not_in_topic=`git rev-list "^$topic" master` + if test -z "$not_in_topic" + then + echo >&2 "$topic is already up to date with master" + exit 1 ;# we could allow it, but there is no point. + else + exit 0 + fi +else + not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"` + /usr/bin/perl -e ' + my $topic = $ARGV[0]; + my $msg = "* $topic has commits already merged to public branch:\n"; + my (%not_in_next) = map { + /^([0-9a-f]+) /; + ($1 => 1); + } split(/\n/, $ARGV[1]); + for my $elem (map { + /^([0-9a-f]+) (.*)$/; + [$1 => $2]; + } split(/\n/, $ARGV[2])) { + if (!exists $not_in_next{$elem->[0]}) { + if ($msg) { + print STDERR $msg; + undef $msg; + } + print STDERR " $elem->[1]\n"; + } + } + ' "$topic" "$not_in_next" "$not_in_master" + exit 1 +fi + +<<\DOC_END + +This sample hook safeguards topic branches that have been +published from being rewound. + +The workflow assumed here is: + + * Once a topic branch forks from "master", "master" is never + merged into it again (either directly or indirectly). + + * Once a topic branch is fully cooked and merged into "master", + it is deleted. If you need to build on top of it to correct + earlier mistakes, a new topic branch is created by forking at + the tip of the "master". This is not strictly necessary, but + it makes it easier to keep your history simple. + + * Whenever you need to test or publish your changes to topic + branches, merge them into "next" branch. + +The script, being an example, hardcodes the publish branch name +to be "next", but it is trivial to make it configurable via +$GIT_DIR/config mechanism. + +With this workflow, you would want to know: + +(1) ... if a topic branch has ever been merged to "next". Young + topic branches can have stupid mistakes you would rather + clean up before publishing, and things that have not been + merged into other branches can be easily rebased without + affecting other people. But once it is published, you would + not want to rewind it. + +(2) ... if a topic branch has been fully merged to "master". + Then you can delete it. More importantly, you should not + build on top of it -- other people may already want to + change things related to the topic as patches against your + "master", so if you need further changes, it is better to + fork the topic (perhaps with the same name) afresh from the + tip of "master". + +Let's look at this example: + + o---o---o---o---o---o---o---o---o---o "next" + / / / / + / a---a---b A / / + / / / / + / / c---c---c---c B / + / / / \ / + / / / b---b C \ / + / / / / \ / + ---o---o---o---o---o---o---o---o---o---o---o "master" + + +A, B and C are topic branches. + + * A has one fix since it was merged up to "next". + + * B has finished. It has been fully merged up to "master" and "next", + and is ready to be deleted. + + * C has not merged to "next" at all. + +We would want to allow C to be rebased, refuse A, and encourage +B to be deleted. + +To compute (1): + + git rev-list ^master ^topic next + git rev-list ^master next + + if these match, topic has not merged in next at all. + +To compute (2): + + git rev-list master..topic + + if this is empty, it is fully merged to "master". 
+ +DOC_END diff --git a/seamless_communication/.git_/hooks/pre-receive.sample b/seamless_communication/.git_/hooks/pre-receive.sample new file mode 100755 index 0000000..a1fd29e --- /dev/null +++ b/seamless_communication/.git_/hooks/pre-receive.sample @@ -0,0 +1,24 @@ +#!/bin/sh +# +# An example hook script to make use of push options. +# The example simply echoes all push options that start with 'echoback=' +# and rejects all pushes when the "reject" push option is used. +# +# To enable this hook, rename this file to "pre-receive". + +if test -n "$GIT_PUSH_OPTION_COUNT" +then + i=0 + while test "$i" -lt "$GIT_PUSH_OPTION_COUNT" + do + eval "value=\$GIT_PUSH_OPTION_$i" + case "$value" in + echoback=*) + echo "echo from the pre-receive-hook: ${value#*=}" >&2 + ;; + reject) + exit 1 + esac + i=$((i + 1)) + done +fi diff --git a/seamless_communication/.git_/hooks/prepare-commit-msg.sample b/seamless_communication/.git_/hooks/prepare-commit-msg.sample new file mode 100755 index 0000000..10fa14c --- /dev/null +++ b/seamless_communication/.git_/hooks/prepare-commit-msg.sample @@ -0,0 +1,42 @@ +#!/bin/sh +# +# An example hook script to prepare the commit log message. +# Called by "git commit" with the name of the file that has the +# commit message, followed by the description of the commit +# message's source. The hook's purpose is to edit the commit +# message file. If the hook fails with a non-zero status, +# the commit is aborted. +# +# To enable this hook, rename this file to "prepare-commit-msg". + +# This hook includes three examples. The first one removes the +# "# Please enter the commit message..." help message. +# +# The second includes the output of "git diff --name-status -r" +# into the message, just before the "git status" output. It is +# commented because it doesn't cope with --amend or with squashed +# commits. +# +# The third example adds a Signed-off-by line to the message, that can +# still be edited. This is rarely a good idea. + +COMMIT_MSG_FILE=$1 +COMMIT_SOURCE=$2 +SHA1=$3 + +/usr/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' "$COMMIT_MSG_FILE" + +# case "$COMMIT_SOURCE,$SHA1" in +# ,|template,) +# /usr/bin/perl -i.bak -pe ' +# print "\n" . `git diff --cached --name-status -r` +# if /^#/ && $first++ == 0' "$COMMIT_MSG_FILE" ;; +# *) ;; +# esac + +# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') +# git interpret-trailers --in-place --trailer "$SOB" "$COMMIT_MSG_FILE" +# if test -z "$COMMIT_SOURCE" +# then +# /usr/bin/perl -i.bak -pe 'print "\n" if !$first_line++' "$COMMIT_MSG_FILE" +# fi diff --git a/seamless_communication/.git_/hooks/push-to-checkout.sample b/seamless_communication/.git_/hooks/push-to-checkout.sample new file mode 100755 index 0000000..af5a0c0 --- /dev/null +++ b/seamless_communication/.git_/hooks/push-to-checkout.sample @@ -0,0 +1,78 @@ +#!/bin/sh + +# An example hook script to update a checked-out tree on a git push. +# +# This hook is invoked by git-receive-pack(1) when it reacts to git +# push and updates reference(s) in its repository, and when the push +# tries to update the branch that is currently checked out and the +# receive.denyCurrentBranch configuration variable is set to +# updateInstead. 
+# +# By default, such a push is refused if the working tree and the index +# of the remote repository has any difference from the currently +# checked out commit; when both the working tree and the index match +# the current commit, they are updated to match the newly pushed tip +# of the branch. This hook is to be used to override the default +# behaviour; however the code below reimplements the default behaviour +# as a starting point for convenient modification. +# +# The hook receives the commit with which the tip of the current +# branch is going to be updated: +commit=$1 + +# It can exit with a non-zero status to refuse the push (when it does +# so, it must not modify the index or the working tree). +die () { + echo >&2 "$*" + exit 1 +} + +# Or it can make any necessary changes to the working tree and to the +# index to bring them to the desired state when the tip of the current +# branch is updated to the new commit, and exit with a zero status. +# +# For example, the hook can simply run git read-tree -u -m HEAD "$1" +# in order to emulate git fetch that is run in the reverse direction +# with git push, as the two-tree form of git read-tree -u -m is +# essentially the same as git switch or git checkout that switches +# branches while keeping the local changes in the working tree that do +# not interfere with the difference between the branches. + +# The below is a more-or-less exact translation to shell of the C code +# for the default behaviour for git's push-to-checkout hook defined in +# the push_to_deploy() function in builtin/receive-pack.c. +# +# Note that the hook will be executed from the repository directory, +# not from the working tree, so if you want to perform operations on +# the working tree, you will have to adapt your code accordingly, e.g. +# by adding "cd .." or using relative paths. + +if ! git update-index -q --ignore-submodules --refresh +then + die "Up-to-date check failed" +fi + +if ! git diff-files --quiet --ignore-submodules -- +then + die "Working directory has unstaged changes" +fi + +# This is a rough translation of: +# +# head_has_history() ? "HEAD" : EMPTY_TREE_SHA1_HEX +if git cat-file -e HEAD 2>/dev/null +then + head=HEAD +else + head=$(git hash-object -t tree --stdin &2 + echo " (if you want, you could supply GIT_DIR then run" >&2 + echo " $0 )" >&2 + exit 1 +fi + +if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then + echo "usage: $0 " >&2 + exit 1 +fi + +# --- Config +allowunannotated=$(git config --type=bool hooks.allowunannotated) +allowdeletebranch=$(git config --type=bool hooks.allowdeletebranch) +denycreatebranch=$(git config --type=bool hooks.denycreatebranch) +allowdeletetag=$(git config --type=bool hooks.allowdeletetag) +allowmodifytag=$(git config --type=bool hooks.allowmodifytag) + +# check for no description +projectdesc=$(sed -e '1q' "$GIT_DIR/description") +case "$projectdesc" in +"Unnamed repository"* | "") + echo "*** Project description file hasn't been set" >&2 + exit 1 + ;; +esac + +# --- Check types +# if $newrev is 0000...0000, it's a commit to delete a ref. +zero=$(git hash-object --stdin &2 + echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2 + exit 1 + fi + ;; + refs/tags/*,delete) + # delete tag + if [ "$allowdeletetag" != "true" ]; then + echo "*** Deleting a tag is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/tags/*,tag) + # annotated tag + if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1 + then + echo "*** Tag '$refname' already exists." 
>&2 + echo "*** Modifying a tag is not allowed in this repository." >&2 + exit 1 + fi + ;; + refs/heads/*,commit) + # branch + if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then + echo "*** Creating a branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/heads/*,delete) + # delete branch + if [ "$allowdeletebranch" != "true" ]; then + echo "*** Deleting a branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/remotes/*,commit) + # tracking branch + ;; + refs/remotes/*,delete) + # delete tracking branch + if [ "$allowdeletebranch" != "true" ]; then + echo "*** Deleting a tracking branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + *) + # Anything else (is there anything else?) + echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2 + exit 1 + ;; +esac + +# --- Finished +exit 0 diff --git a/seamless_communication/.git_/index b/seamless_communication/.git_/index new file mode 100644 index 0000000..63e2f74 Binary files /dev/null and b/seamless_communication/.git_/index differ diff --git a/seamless_communication/.git_/info/exclude b/seamless_communication/.git_/info/exclude new file mode 100644 index 0000000..a5196d1 --- /dev/null +++ b/seamless_communication/.git_/info/exclude @@ -0,0 +1,6 @@ +# git ls-files --others --exclude-from=.git/info/exclude +# Lines that start with '#' are comments. +# For a project mostly in C, the following would be a good set of +# exclude patterns (uncomment them if you want to use them): +# *.[oa] +# *~ diff --git a/seamless_communication/.git_/logs/HEAD b/seamless_communication/.git_/logs/HEAD new file mode 100644 index 0000000..28fdc4c --- /dev/null +++ b/seamless_communication/.git_/logs/HEAD @@ -0,0 +1 @@ +0000000000000000000000000000000000000000 c634c446d1cf134b4c063ec8a589f8b257a93ac9 Long Nguyen-Vu 1703438995 +0900 clone: from https://github.com/facebookresearch/seamless_communication.git diff --git a/seamless_communication/.git_/logs/refs/heads/main b/seamless_communication/.git_/logs/refs/heads/main new file mode 100644 index 0000000..28fdc4c --- /dev/null +++ b/seamless_communication/.git_/logs/refs/heads/main @@ -0,0 +1 @@ +0000000000000000000000000000000000000000 c634c446d1cf134b4c063ec8a589f8b257a93ac9 Long Nguyen-Vu 1703438995 +0900 clone: from https://github.com/facebookresearch/seamless_communication.git diff --git a/seamless_communication/.git_/logs/refs/remotes/origin/HEAD b/seamless_communication/.git_/logs/refs/remotes/origin/HEAD new file mode 100644 index 0000000..28fdc4c --- /dev/null +++ b/seamless_communication/.git_/logs/refs/remotes/origin/HEAD @@ -0,0 +1 @@ +0000000000000000000000000000000000000000 c634c446d1cf134b4c063ec8a589f8b257a93ac9 Long Nguyen-Vu 1703438995 +0900 clone: from https://github.com/facebookresearch/seamless_communication.git diff --git a/seamless_communication/.git_/objects/pack/pack-00a1cc40318f21b8a2c613858b4b987498199697.idx b/seamless_communication/.git_/objects/pack/pack-00a1cc40318f21b8a2c613858b4b987498199697.idx new file mode 100644 index 0000000..a6f16b7 Binary files /dev/null and b/seamless_communication/.git_/objects/pack/pack-00a1cc40318f21b8a2c613858b4b987498199697.idx differ diff --git a/seamless_communication/.git_/objects/pack/pack-00a1cc40318f21b8a2c613858b4b987498199697.pack b/seamless_communication/.git_/objects/pack/pack-00a1cc40318f21b8a2c613858b4b987498199697.pack new file mode 100644 index 0000000..6a302da Binary files /dev/null and 
b/seamless_communication/.git_/objects/pack/pack-00a1cc40318f21b8a2c613858b4b987498199697.pack differ diff --git a/seamless_communication/.git_/packed-refs b/seamless_communication/.git_/packed-refs new file mode 100644 index 0000000..5046806 --- /dev/null +++ b/seamless_communication/.git_/packed-refs @@ -0,0 +1,18 @@ +# pack-refs with: peeled fully-peeled sorted +8814dce9f6085bea8daa440ad2cdde4040510b36 refs/remotes/origin/api_update +d31095342cea67b83c0649ed4b3a2e3a0d9dd54e refs/remotes/origin/cndn-patch-1 +17c9f811f26587294f6b2121f5df40c7c406029d refs/remotes/origin/cndn-patch-2 +4a231725036fe9d22e294eb6c9be9c683837bc66 refs/remotes/origin/cndn-patch-3 +ceada0caffa8527a87067fcac2a2d503ae788da7 refs/remotes/origin/expressivity_translator +cee7518bfaa65200df2b0734658044a39d769c2c refs/remotes/origin/fix_no_redef_error +ac6b874c43b5795cdf7421618a044259b177bb65 refs/remotes/origin/ggml_fp16_model +1070b609329296bc8e6c25875ad9f87908696bb3 refs/remotes/origin/header_fix +37858bc085d67aa8bd6cf8bcfcbcb8b7bed3636d refs/remotes/origin/jmp84-patch-1 +c634c446d1cf134b4c063ec8a589f8b257a93ac9 refs/remotes/origin/main +cd100954048ed82492281da9bd54e000564ddee2 refs/remotes/origin/mem +c8f9a8c0ba265120c74ea0c6cff9d027c2223244 refs/remotes/origin/ngram +7608af8895a747a4caaf36384bc3d0b19c2f775d refs/remotes/origin/no_alloc +e779fcf8caf16f7afbe5ae35c4edcdfa4edbf157 refs/remotes/origin/path_updates +79546ff4e6b89ff89e452f4edcb993a85370b56d refs/remotes/origin/ruslan_final_passes_over_finetuning_scripts +98c13c835dc98ca9cd153a9610052b82e72f8099 refs/remotes/origin/unit_extraction +0605da6bec492b9f71b6acffd6fc05b2c63219af refs/remotes/origin/vocoder-output-lang diff --git a/seamless_communication/.git_/refs/heads/main b/seamless_communication/.git_/refs/heads/main new file mode 100644 index 0000000..eae29be --- /dev/null +++ b/seamless_communication/.git_/refs/heads/main @@ -0,0 +1 @@ +c634c446d1cf134b4c063ec8a589f8b257a93ac9 diff --git a/seamless_communication/.git_/refs/remotes/origin/HEAD b/seamless_communication/.git_/refs/remotes/origin/HEAD new file mode 100644 index 0000000..4b0a875 --- /dev/null +++ b/seamless_communication/.git_/refs/remotes/origin/HEAD @@ -0,0 +1 @@ +ref: refs/remotes/origin/main diff --git a/seamless_communication/.gitignore b/seamless_communication/.gitignore new file mode 100644 index 0000000..e273832 --- /dev/null +++ b/seamless_communication/.gitignore @@ -0,0 +1,149 @@ +# JetBrains PyCharm IDE +.idea/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# macOS dir files +.DS_Store + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Checkpoints +checkpoints + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# Generated files +/fairseq/temporal_convolution_tbc +/fairseq/modules/*_layer/*_forward.cu +/fairseq/modules/*_layer/*_backward.cu +/fairseq/version.py + +# data +data-bin/ + +# reranking +/examples/reranking/rerank_data + +# Cython-generated C++ source files +/fairseq/data/data_utils_fast.cpp +/fairseq/data/token_block_utils_fast.cpp + +# VSCODE +.vscode/ftp-sync.json +.vscode/settings.json + +# Experimental Folder +experimental/* + +# Weights and Biases logs +wandb/ + +# Hydra artifacts +nohup.out +multirun +outputs + + +# symlinks +seamless_communication +# ignore src/seamless_communication +!*/seamless_communication +m4t_scripts +/ggml/test_data/ diff --git a/seamless_communication/.gitmodules b/seamless_communication/.gitmodules new file mode 100644 index 0000000..fb8b097 --- /dev/null +++ b/seamless_communication/.gitmodules @@ -0,0 +1,3 @@ +[submodule "ggml/tracy"] + path = ggml/tracy + url = git@github.com:wolfpld/tracy.git diff --git a/seamless_communication/.pre-commit-config.yaml b/seamless_communication/.pre-commit-config.yaml new file mode 100644 index 0000000..142ed10 --- /dev/null +++ b/seamless_communication/.pre-commit-config.yaml @@ -0,0 +1,15 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.1.0 + hooks: + - id: trailing-whitespace + - id: check-ast + - id: check-merge-conflict + - id: check-added-large-files + args: ["--maxkb=2000"] + - id: end-of-file-fixer + + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black diff --git a/seamless_communication/23-11_SEAMLESS_BlogHero_11.17.jpg b/seamless_communication/23-11_SEAMLESS_BlogHero_11.17.jpg new file mode 100644 index 0000000..7da427e Binary files /dev/null and b/seamless_communication/23-11_SEAMLESS_BlogHero_11.17.jpg differ diff --git a/seamless_communication/ACCEPTABLE_USE_POLICY b/seamless_communication/ACCEPTABLE_USE_POLICY new file mode 100644 index 0000000..e314209 --- /dev/null +++ b/seamless_communication/ACCEPTABLE_USE_POLICY @@ -0,0 +1,49 @@ +Seamless Acceptable Use Policy + +Meta is committed to promoting safe and fair use of its tools and features, including Seamless. If you access or use Seamless, Seamless Materials or the Seamless Demo, you agree to this Acceptable Use Policy (“Policy”). The most recent copy of this policy can be found at [ai.meta.com/seamless/use-policy]. + +Prohibited Uses + +We want everyone to use Seamless safely and responsibly. You agree you will not use, or allow others to use, Seamless to: + +1. Violate the law or others’ rights, including to: + a. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as: + i. Violence or terrorism + ii. 
Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material + iii. Human trafficking, exploitation, and sexual violence + iv. The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials + v. Sexual solicitation + vi. Any other criminal activity + b. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals + c. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services + d. Collect, process, disclose, generate, or infer health, demographic, biometric, or other sensitive personal or private information about individuals without rights and consents required by applicable laws + e. Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any products or services using Seamless + f. Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system +2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of Seamless related to the following: + a. Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State + b. Guns and illegal weapons (including weapon development) + c. Illegal drugs and regulated/controlled substances + d. Operation of critical infrastructure, transportation technologies, or heavy machinery + e. Self-harm or harm to others, including suicide, cutting, and eating disorders + f. Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual +3. Intentionally deceive or mislead others, including use of Seamless related to the following: + a. Generating, promoting, or furthering fraud or the creation or promotion of disinformation + b. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content + c. Generating, promoting, or further distributing spam + d. Impersonating another individual by depiction of their voice or likeness without consent, authorization, or legal right, including non-consensual sexual imagery + e. Representing that the use of Seamless or outputs are human-generated + f. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement +4. Fail to appropriately disclose to end users any known dangers of your AI system +5. Engage in automated government decision-making in high risk contexts, including, for example, law enforcement, criminal justice, immigration, or asylum, without a qualified person reviewing the outputs +6. Engage in any decision-making related to health, financial, safety, or legal matters +7. 
Create, develop, access, or disseminate adult content, including in relation to: + a. Erotic, sexual, or romantic chats + b. Sexual solicitation + c. Pornography + d. Content that describes or promotes sexual or adult services + + +Please report any violation of this Policy, software “bug,” or other problems that could lead to a violation of this Policy through one of the following means: +- Reporting issues with the model: github.com/facebookresearch/seamless_communication +- Reporting bugs and security concerns: facebook.com/whitehat/info +- Reporting violations of the Acceptable Use Policy or unlicensed uses of Seamless: SeamlessUseReport@meta.com diff --git a/seamless_communication/CODE_OF_CONDUCT.md b/seamless_communication/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..83f431e --- /dev/null +++ b/seamless_communication/CODE_OF_CONDUCT.md @@ -0,0 +1,80 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic +address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +This Code of Conduct also applies outside the project spaces when there is a +reasonable belief that an individual's behavior may have a negative impact on +the project or its community. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/seamless_communication/CONTRIBUTING.md b/seamless_communication/CONTRIBUTING.md new file mode 100644 index 0000000..b317b08 --- /dev/null +++ b/seamless_communication/CONTRIBUTING.md @@ -0,0 +1,37 @@ +# Contributing to `seamless_communication` +We want to make contributing to this project as easy and transparent as +possible. + +## Our Development Process + +`seamless_communication` is built for Meta AI Seamless Communication team public release. +We engage in multiple projects internally and will update this repository with our progress upon reaching specific milestones. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `main`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Meta's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + + +## License +By contributing to `seamless_communication`, you agree that your contributions will be licensed +under the MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/LICENSE b/seamless_communication/LICENSE new file mode 100644 index 0000000..d1bbe80 --- /dev/null +++ b/seamless_communication/LICENSE @@ -0,0 +1,400 @@ + +Attribution-NonCommercial 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. 
Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-NonCommercial 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. 
For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. NonCommercial means not primarily intended for or directed towards + commercial advantage or monetary compensation. For purposes of + this Public License, the exchange of the Licensed Material for + other material subject to Copyright and Similar Rights by digital + file-sharing or similar means is NonCommercial provided there is + no payment of monetary compensation in connection with the + exchange. + + j. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + k. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + l. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part, for NonCommercial purposes only; and + + b. 
produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. 
a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database for NonCommercial purposes + only; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. 
The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. 
Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. diff --git a/seamless_communication/MIT_LICENSE b/seamless_communication/MIT_LICENSE new file mode 100644 index 0000000..b93be90 --- /dev/null +++ b/seamless_communication/MIT_LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Meta Platforms, Inc. and affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/seamless_communication/README.md b/seamless_communication/README.md new file mode 100644 index 0000000..67bfc5f --- /dev/null +++ b/seamless_communication/README.md @@ -0,0 +1,313 @@ +![](23-11_SEAMLESS_BlogHero_11.17.jpg) +# Seamless Intro + +Seamless is a family of AI models that enable more natural and authentic communication across languages. SeamlessM4T is a massive multilingual multimodal machine translation model supporting around 100 languages. SeamlessM4T serves as foundation for SeamlessExpressive, a model that preserves elements of prosody and voice style across languages and SeamlessStreaming, a model supporting simultaneous translation and streaming ASR for around 100 languages. SeamlessExpressive and SeamlessStreaming are combined into Seamless, a unified model featuring multilinguality, real-time and expressive translations. 
+ +## Links + +### Demos + +| | SeamlessM4T v2 | SeamlessExpressive | SeamlessStreaming | +| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------- | +| Demo | [SeamlessM4T v2 Demo](https://seamless.metademolab.com/m4t?utm_source=github&utm_medium=web&utm_campaign=seamless&utm_content=readme) | [SeamlessExpressive Demo](https://seamless.metademolab.com/expressive?utm_source=github&utm_medium=web&utm_campaign=seamless&utm_content=readme) | | +| HuggingFace Space Demo | [🤗 SeamlessM4T v2 Space](https://huggingface.co/spaces/facebook/seamless-m4t-v2-large) | [🤗 SeamlessExpressive Space](https://huggingface.co/spaces/facebook/seamless-expressive) | [🤗 SeamlessStreaming Space](https://huggingface.co/spaces/facebook/seamless-streaming) | + +### Papers +[Seamless](https://ai.facebook.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) + +[EMMA](https://ai.meta.com/research/publications/efficient-monotonic-multihead-attention/) + +[SONAR](https://ai.meta.com/research/publications/sonar-expressive-zero-shot-expressive-speech-to-speech-translation/) + +### Blog +[AI at Meta Blog](https://ai.meta.com/research/seamless-communication/) + +## Tutorial +An exhaustive [tutorial](Seamless_Tutorial.ipynb) given at the NeurIPS 2023 - Seamless EXPO, which is a one-stop shop to learn how to use the entire suite of Seamless models. Please feel free to play with the notebook. + +## SeamlessM4T +SeamlessM4T is our foundational all-in-one **M**assively **M**ultilingual and **M**ultimodal **M**achine **T**ranslation model delivering high-quality translation for speech and text in nearly 100 languages. + +SeamlessM4T models support the tasks of: +- Speech-to-speech translation (S2ST) +- Speech-to-text translation (S2TT) +- Text-to-speech translation (T2ST) +- Text-to-text translation (T2TT) +- Automatic speech recognition (ASR) + +:star2: We are releasing SeamlessM4T v2, an updated version with our novel *UnitY2* architecture. This new model improves over SeamlessM4T v1 in quality as well as inference latency in speech generation tasks. + +To learn more about the collection of SeamlessM4T models, the approach used in each, their language coverage and their performance, visit the [SeamlessM4T README](docs/m4t/README.md) or [🤗 Model Card](https://huggingface.co/facebook/seamless-m4t-v2-large). + +> [!NOTE] +> Seamless M4T is also available in the 🤗 Transformers library. Visit [this section](docs/m4t/README.md#transformers-usage) for more details. + +## SeamlessExpressive + +SeamlessExpressive is a speech-to-speech translation model that captures certain underexplored aspects of prosody such as speech rate and pauses, while preserving the style of one's voice and high content translation quality. + +To learn more about SeamlessExpressive models, visit the [SeamlessExpressive README](docs/expressive/README.md) or [🤗 Model Card](https://huggingface.co/facebook/seamless-expressive) + + +## SeamlessStreaming + +SeamlessStreaming is a streaming translation model. The model supports speech as input modality and speech/text as output modalities. 
+ +The SeamlessStreaming model supports the following tasks: +- Speech-to-speech translation (S2ST) +- Speech-to-text translation (S2TT) +- Automatic speech recognition (ASR) + +To learn more about SeamlessStreaming models, visit the [SeamlessStreaming README](docs/streaming/README.md) or [🤗 Model Card](https://huggingface.co/facebook/seamless-streaming) + +## Seamless + +The Seamless model is the unified model for expressive streaming speech-to-speech translations. + +## What's new +- [12/18/2023] We are open-sourcing our Conformer-based [W2v-BERT 2.0 speech encoder](#w2v-bert-20-speech-encoder) as described in Section 3.2.1 of the [paper](https://arxiv.org/pdf/2312.05187.pdf), which is at the core of our Seamless models. +- [12/14/2023] We are releasing the Seamless [tutorial](#tutorial) given at NeurIPS 2023. + +# Quick Start +## Installation +> [!NOTE] +> One of the prerequisites is [fairseq2](https://github.com/facebookresearch/fairseq2) which has pre-built packages available only +> for Linux x86-64 and Apple-silicon Mac computers. In addition, it has a dependency on [libsndfile](https://github.com/libsndfile/libsndfile) which +> might not be installed on your machine. If you experience any installation issues, please refer to its +> [README](https://github.com/facebookresearch/fairseq2) for further instructions. + +``` +pip install . +``` + +> [!NOTE] +> Transcribing inference audio for computing metrics uses [Whisper](https://github.com/openai/whisper#setup), which is automatically installed. Whisper in turn requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers. + + +## Running inference + +### SeamlessM4T Inference +Here’s an example of using the CLI from the root directory to run inference. + +S2ST task: +```bash +m4t_predict <path_to_input_audio> --task s2st --tgt_lang <tgt_lang> --output_path <path_to_save_audio> +``` +T2TT task: +```bash +m4t_predict <input_text> --task t2tt --tgt_lang <tgt_lang> --src_lang <src_lang> +``` +Please refer to the [inference README](src/seamless_communication/cli/m4t/predict) for detailed instructions on how to run inference and the list of supported languages on the source and target sides for the speech and text modalities. + +For running S2TT/ASR natively (without Python) using GGML, please refer to [the unity.cpp section](#unitycpp). + +### SeamlessExpressive Inference +> [!NOTE] +> Please check the [section](#seamlessexpressive-models) on how to download the model. + +Here’s an example of using the CLI from the root directory to run inference. + +```bash +expressivity_predict <path_to_input_audio> --tgt_lang <tgt_lang> --model_name seamless_expressivity --vocoder_name vocoder_pretssel --output_path <path_to_save_audio> +``` + +### SeamlessStreaming and Seamless Inference + +[Streaming Evaluation README](src/seamless_communication/cli/streaming) has detailed instructions for running evaluations for the SeamlessStreaming and Seamless models. The CLI has an `--no-scoring` option that can be used to skip the scoring part and just run inference. + + +## Running SeamlessStreaming Demo +You can duplicate the [SeamlessStreaming HF space](https://huggingface.co/spaces/facebook/seamless-streaming?duplicate=true) to run the streaming demo. + + +You can also run the demo locally by cloning the space from [here](https://huggingface.co/spaces/facebook/seamless-streaming/tree/main). See the [README](https://huggingface.co/spaces/facebook/seamless-streaming/blob/main/README.md) of the SeamlessStreaming HF repo for more details on installation.
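In addition to the `m4t_predict` and `expressivity_predict` CLIs shown above, the same predictions can be made programmatically with the `Translator` class from `seamless_communication.inference` (the API used in the tutorial notebook included in this repository). The snippet below is a minimal sketch: the model and vocoder names come from the model tables in this README, while the file paths and target language are placeholders to replace with your own.

```python
import torch
import torchaudio

from seamless_communication.inference import Translator

# Load the multitask model and its vocoder on the GPU in fp16.
translator = Translator(
    "seamlessM4T_v2_large",
    "vocoder_v2",
    device=torch.device("cuda:0"),
    dtype=torch.float16,
)

# S2ST: translate an input wav into Spanish speech and text.
text_output, speech_output = translator.predict(
    input="/path/to/input.wav",  # placeholder path to a 16 kHz wav
    task_str="s2st",
    tgt_lang="spa",
)

print(f"Translated text: {text_output[0]}")

# Save the generated waveform alongside the translated text.
torchaudio.save(
    "/path/to/output_spa.wav",  # placeholder output path
    speech_output.audio_wavs[0][0].to(torch.float32).cpu(),
    speech_output.sample_rate,
)
```

For S2TT, ASR, T2ST, or T2TT, change `task_str` accordingly (and pass `src_lang` for text input), as demonstrated in the tutorial notebook.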
+ +## Running SeamlessM4T & SeamlessExpressive [Gradio](https://github.com/gradio-app/gradio) demos locally + +To launch the same demo Space we host on Hugging Face locally: + +```bash +cd demo +pip install -r requirements.txt +python app.py +``` + +# Resources and usage +## Model +### SeamlessM4T models +| Model Name | #params | checkpoint | metrics | +| ----------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | +| SeamlessM4T-Large v2 | 2.3B | [🤗 Model card](https://huggingface.co/facebook/seamless-m4t-v2-large) - [checkpoint](https://huggingface.co/facebook/seamless-m4t-v2-large/resolve/main/seamlessM4T_v2_large.pt ) | [metrics](https://dl.fbaipublicfiles.com/seamless/metrics/seamlessM4T_large_v2.zip) | +| SeamlessM4T-Large (v1) | 2.3B | [🤗 Model card](https://huggingface.co/facebook/seamless-m4t-large) - [checkpoint](https://huggingface.co/facebook/seamless-m4t-large/resolve/main/multitask_unity_large.pt) | [metrics](https://dl.fbaipublicfiles.com/seamless/metrics/seamlessM4T_large.zip) | +| SeamlessM4T-Medium (v1) | 1.2B | [🤗 Model card](https://huggingface.co/facebook/seamless-m4t-medium) - [checkpoint](https://huggingface.co/facebook/seamless-m4t-medium/resolve/main/multitask_unity_medium.pt) | [metrics](https://dl.fbaipublicfiles.com/seamless/metrics/seamlessM4T_medium.zip) | + +### SeamlessExpressive models + +[🤗 Model card](https://huggingface.co/facebook/seamless-expressive) + +To access and download SeamlessExpressive, please request the model artifacts through [this request form](https://ai.meta.com/resources/models-and-libraries/seamless-downloads/). Upon approval, you will then receive an email with download links to each model artifact. + +Please note that SeamlessExpressive is made available under its own [License](SEAMLESS_LICENSE) and [Acceptable Use Policy](ACCEPTABLE_USE_POLICY). + +### SeamlessStreaming models +| Model Name | #params | checkpoint | metrics | +| ----------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | +| SeamlessStreaming | 2.5B | [🤗 Model card](https://huggingface.co/facebook/seamless-streaming) - [monotonic decoder checkpoint](https://huggingface.co/facebook/seamless-streaming/resolve/main/seamless_streaming_monotonic_decoder.pt) - [streaming UnitY2 checkpoint](https://huggingface.co/facebook/seamless-streaming/resolve/main/seamless_streaming_unity.pt) | [metrics](https://dl.fbaipublicfiles.com/seamless/metrics/streaming/seamless_streaming.zip) | + +### Seamless models +Seamless model is simply the SeamlessStreaming model with the non-expressive `vocoder_v2` swapped out with the expressive `vocoder_pretssel`. +Please check out above [section](#seamlessexpressive-models) on how to acquire `vocoder_pretssel` checkpoint. 
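The checkpoints linked in the tables above are hosted on the Hugging Face Hub and are downloaded automatically the first time the inference code requests a model card. If you want to fetch a checkpoint file manually (for example, for offline use), the sketch below uses `huggingface_hub`, which is not listed as a dependency here; the repo id and filename are taken from the SeamlessM4T table above.

```python
from huggingface_hub import hf_hub_download

# Download the SeamlessM4T v2 large checkpoint referenced in the table above.
# Gated models such as SeamlessExpressive must be requested via the request
# form instead; this call only works for publicly hosted checkpoints.
checkpoint_path = hf_hub_download(
    repo_id="facebook/seamless-m4t-v2-large",
    filename="seamlessM4T_v2_large.pt",
)

print(checkpoint_path)  # local path inside the Hugging Face cache
```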
+ +### W2v-BERT 2.0 speech encoder +| Model Name | #params | checkpoint | +| ----------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| W2v-BERT 2.0 | 600M | [🤗 Model card](https://huggingface.co/facebook/conformer-shaw) - [checkpoint](https://huggingface.co/facebook/conformer-shaw/resolve/main/conformer_shaw.pt) | + +Here's how to do a forward pass through the speech encoder: + +```python +import torch + +from fairseq2.data import Collater +from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter +from fairseq2.memory import MemoryBlock +from fairseq2.nn.padding import get_seqs_and_padding_mask +from pathlib import Path +from seamless_communication.models.conformer_shaw import load_conformer_shaw_model + + +audio_wav_path, device, dtype = ... +audio_decoder = AudioDecoder(dtype=torch.float32, device=device) +fbank_converter = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=True, + device=device, + dtype=dtype, +) +collater = Collater(pad_value=1) + +model = load_conformer_shaw_model("conformer_shaw", device=device, dtype=dtype) +model.eval() + +with Path(audio_wav_path).open("rb") as fb: + block = MemoryBlock(fb.read()) + +decoded_audio = audio_decoder(block) +src = collater(fbank_converter(decoded_audio))["fbank"] +seqs, padding_mask = get_seqs_and_padding_mask(src) + +with torch.inference_mode(): + seqs, padding_mask = model.encoder_frontend(seqs, padding_mask) + seqs, padding_mask = model.encoder(seqs, padding_mask) +``` + +## Evaluation + +### SeamlessM4T Evaluation +To reproduce our results, or to evaluate using the same metrics over your own test sets, please check out the [README here](src/seamless_communication/cli/m4t/evaluate). +### SeamlessExpressive Evaluation + +Below is the script for efficient batched evaluation. + +```bash +export MODEL_DIR="/path/to/SeamlessExpressive/model" +export TEST_SET_TSV="input.tsv" # Your dataset in a TSV file, with headers "id", "audio" +export TGT_LANG="spa" # Target language to translate into, options including "fra", "deu", "eng" ("cmn" and "ita" are experimental) +export OUTPUT_DIR="tmp/" # Output directory for generated text/unit/waveform +export TGT_TEXT_COL="tgt_text" # The column in your ${TEST_SET_TSV} for reference target text to calculate the BLEU score. You can skip this argument. +export DFACTOR="1.0" # Duration factor for model inference to tune predicted duration (preddur=DFACTOR*preddur) at each position, which affects output speech rate. A greater value means a slower speech rate (defaults to 1.0). See the expressive evaluation README for details on the duration factor we used. +expressivity_evaluate ${TEST_SET_TSV} \ + --gated-model-dir ${MODEL_DIR} --task s2st --tgt_lang ${TGT_LANG} \ + --audio_root_dir "" --output_path ${OUTPUT_DIR} --ref_field ${TGT_TEXT_COL} \ + --model_name seamless_expressivity --vocoder_name vocoder_pretssel \ + --text_unk_blocking True --duration_factor ${DFACTOR} +``` + +Please check out this [README section](docs/expressive/README.md#automatic-evaluation) for more details on automatic evaluation. + +### SeamlessStreaming and Seamless Evaluation + +[Streaming Evaluation README](src/seamless_communication/cli/streaming) has detailed instructions for running evaluations on the SeamlessStreaming and Seamless models.
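The `expressivity_evaluate` call above expects `${TEST_SET_TSV}` to be a tab-separated file with at least `id` and `audio` columns, plus an optional reference-text column (here `tgt_text`). Below is a minimal sketch for producing such a file from a directory of wav files; the directory path is a placeholder, and any columns beyond those named in the script above are assumptions to be checked against the expressive evaluation README.

```python
import csv
from pathlib import Path

audio_dir = Path("/path/to/eval_audio")  # placeholder: directory of input wav files

rows = []
for wav in sorted(audio_dir.glob("*.wav")):
    rows.append(
        {
            "id": wav.stem,      # unique utterance id
            "audio": str(wav),   # path (or path relative to --audio_root_dir)
            "tgt_text": "",      # optional reference translation used for BLEU
        }
    )

with open("input.tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["id", "audio", "tgt_text"], delimiter="\t")
    writer.writeheader()
    writer.writerows(rows)
```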
+ +## Unity.cpp +To enable Seamless Communication Everywhere, we implemented unity.cpp so users can run SeamlessM4T models in GGML - a C tensor library that allows easier integration on a wider range of platforms. + +To transcribe/translate a given audio file: + +``` +./ggml/bin/unity --model seamlessM4T_medium.ggml input.wav +``` + +For build details and more usage, please check out [unity.cpp](ggml). + +## Expressive Datasets + +We created two expressive speech-to-speech translation datasets, mExpresso and mDRAL, between English and five other languages -- French, German, Italian, Mandarin and Spanish. We currently open-source the speech-to-text part of mExpresso for out-of-English directions, and we will open-source the remaining parts of the datasets soon. For details, please check out the [README](docs/expressive/README.md#benchmark-datasets). + +### SeamlessAlignExpressive +We’re introducing the first expressive speech alignment procedure. Starting with raw data, the expressive alignment procedure automatically discovers pairs of audio segments sharing not only the same meaning, but the same overall expressivity. To showcase this procedure, we are making metadata available to create a benchmarking dataset called SeamlessAlignExpressive, which can be used to validate the quality of our alignment method. SeamlessAlignExpressive is the first large-scale (11k+ hours) collection of multilingual audio alignments for expressive translation. More details can be found in the [SeamlessAlignExpressive README](docs/expressive/seamless_align_expressive_README.md). + + +## Converting raw audio to units +Please check out the [README here](src/seamless_communication/cli/m4t/audio_to_units). Note that the SeamlessM4T v1 model uses reduced units, while the other models use non-reduced units. + +# Libraries + +Seamless Communication depends on 4 libraries developed by Meta. + +## [fairseq2](https://github.com/facebookresearch/fairseq2) +fairseq2 is our next-generation open-source library of sequence modeling components that provides researchers and developers with building blocks for machine translation, language modeling, and other sequence generation tasks. All SeamlessM4T models in this repository are powered by fairseq2. + +## [SONAR and BLASER 2.0](https://github.com/facebookresearch/SONAR) +SONAR (Sentence-level multimOdal and laNguage-Agnostic Representations) is a new multilingual and multimodal sentence embedding space which outperforms existing sentence embeddings such as LASER3 and LaBSE on the xsim and xsim++ multilingual similarity search tasks. SONAR provides text and speech encoders for many languages. SeamlessAlign was mined based on SONAR embeddings. + +BLASER 2.0 is our latest model-based evaluation metric for multimodal translation. It is an extension of BLASER, supporting both speech and text. It operates directly on the source signal, and as such, does not require any intermediate ASR system like ASR-BLEU. As in the first version, BLASER 2.0 leverages the similarity between input and output sentence embeddings. SONAR is the underlying embedding space for BLASER 2.0. Scripts to run evaluation with BLASER 2.0 can be found in the [SONAR repo](https://github.com/facebookresearch/SONAR). + +## [stopes](https://github.com/facebookresearch/stopes) +As part of the seamless communication project, we've extended the stopes library. Version 1 provided a text-to-text mining tool to build training datasets for translation models.
Version 2 has been extended, thanks to SONAR, to support tasks around training large speech translation models. In particular, we provide tools to read/write the fairseq audiozip datasets and a new mining pipeline that can do speech-to-speech, text-to-speech, speech-to-text and text-to-text mining, all based on the new SONAR embedding space. + +## [SimulEval](https://github.com/facebookresearch/SimulEval) +SimulEval is a library used for evaluating simultaneous translation models. SimulEval also provides a backend for generation using partial/incremental inputs with flexible/extensible states, which is used to implement streaming inference. Users define agents that implement SimulEval's interface and that can be connected together in a pipeline. You can find agents implemented for SeamlessStreaming [here](src/seamless_communication/streaming/agents). + +## [Legacy] SeamlessM4T v1 instructions +#### Finetuning SeamlessM4T v1 models +Please check out the [README here](src/seamless_communication/cli/m4t/finetune). + +#### On-device models +Apart from the SeamlessM4T large (2.3B) and medium (1.2B) models, we are also releasing a small model (281M) targeted for on-device inference. To learn more about the usage and model details, check out the [README here](docs/m4t/on_device_README.md). + +#### SeamlessAlign mined dataset +We open-source the metadata to SeamlessAlign, the largest open dataset for multimodal translation, totaling 270k+ hours of aligned Speech and Text data. The dataset can be rebuilt by the community based on the [SeamlessAlign readme](docs/m4t/seamless_align_README.md). + + +# Citation +If you use Seamless in your work or any models/datasets/artifacts published in Seamless, please cite: + +```bibtex +@inproceedings{seamless2023, + title="Seamless: Multilingual Expressive and Streaming Speech Translation", + author="{Seamless Communication}, Lo{\"i}c Barrault, Yu-An Chung, Mariano Coria Meglioli, David Dale, Ning Dong, Mark Duppenthaler, Paul-Ambroise Duquenne, Brian Ellis, Hady Elsahar, Justin Haaheim, John Hoffman, Min-Jae Hwang, Hirofumi Inaguma, Christopher Klaiber, Ilia Kulikov, Pengwei Li, Daniel Licht, Jean Maillard, Ruslan Mavlyutov, Alice Rakotoarison, Kaushik Ram Sadagopan, Abinesh Ramakrishnan, Tuan Tran, Guillaume Wenzek, Yilin Yang, Ethan Ye, Ivan Evtimov, Pierre Fernandez, Cynthia Gao, Prangthip Hansanti, Elahe Kalbassi, Amanda Kallet, Artyom Kozhevnikov, Gabriel Mejia, Robin San Roman, Christophe Touret, Corinne Wong, Carleigh Wood, Bokai Yu, Pierre Andrews, Can Balioglu, Peng-Jen Chen, Marta R. Costa-juss{\`a}, Maha Elbayad, Hongyu Gong, Francisco Guzm{\'a}n, Kevin Heffernan, Somya Jain, Justine Kao, Ann Lee, Xutai Ma, Alex Mourachko, Benjamin Peloquin, Juan Pino, Sravya Popuri, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Anna Sun, Paden Tomasello, Changhan Wang, Jeff Wang, Skyler Wang, Mary Williamson", + journal={ArXiv}, + year={2023} +} +``` + +# License + +We have three license categories. + +The following non-generative components are MIT licensed as found in [MIT_LICENSE](MIT_LICENSE): +- [W2v-BERT 2.0 speech encoder](#w2v-bert-20-speech-encoder) +- Code +- Text-only part of the mExpresso dataset found in the [SeamlessExpressive README](docs/expressive/README.md). +- UnitY2 forced alignment extractor found in the [UnitY2 Aligner README](docs/m4t/unity2_aligner_README.md). +- Speech toxicity tool with the etox dataset found in the [Toxicity README](src/seamless_communication/cli/toxicity).
+ +The following models are CC-BY-NC 4.0 licensed as found in the [LICENSE](LICENSE): +- SeamlessM4T models (v1 and v2). +- SeamlessStreaming models. + +The following models are Seamless licensed as found in [SEAMLESS_LICENSE](SEAMLESS_LICENSE): +- Seamless models. +- SeamlessExpressive models. diff --git a/seamless_communication/SEAMLESS_LICENSE b/seamless_communication/SEAMLESS_LICENSE new file mode 100644 index 0000000..0fd24ab --- /dev/null +++ b/seamless_communication/SEAMLESS_LICENSE @@ -0,0 +1,44 @@ +Seamless Licensing Agreement + +“Agreement” means this “Seamless Licensing Agreement”, including, the terms and conditions for use, reproduction, distribution and modification of the Seamless Materials set forth herein. + +“Documentation” means the specifications, manuals and documentation accompanying Seamless distributed by Meta at [https://ai.meta.com/resources/models-and-libraries/seamless-downloads](https://ai.meta.com/resources/models-and-libraries/seamless-downloads). + +“Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf. + +“Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland). + +“Noncommercial Research Uses” means noncommercial research use cases related to research, development, education, processing, or analysis in each case with no direct or indirect commercial gain to you or others. + +“Seamless” means the foundational translation and transcription models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, demonstration materials and other elements of the foregoing distributed by Meta at [https://ai.meta.com/resources/models-and-libraries/seamless-downloads](https://ai.meta.com/resources/models-and-libraries/seamless-downloads). + +“Seamless Materials” means, collectively, Meta’s proprietary Seamless and Documentation (and any portion thereof) made available under this Agreement. + +“Trade Control Laws” means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations. + +By clicking “I Accept” below or by using or distributing any portion or element of the Seamless Materials, you agree to be bound by this Agreement. + +1. License Rights and Redistribution. + a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Seamless Materials to use, reproduce, distribute, copy, create derivative works of, translate speech and text, and make modifications to the Seamless Materials solely for Noncommercial Research Uses. + b. Redistribution and Use. + i. Distribution of Seamless Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the Seamless Materials, or any derivative works thereof, available to a third party, you may only do so under this Agreement. You shall also provide a copy of this Agreement to such third party. + ii. 
If you submit for publication the results of research you perform on, using, or otherwise in connection with Seamless Materials, you must acknowledge the use of Seamless Materials in your publication as follows (or an equivalent acknowledgement of your choosing): “This material is based on work supported by the Seamless Licensing Agreement, Copyright © Meta Platforms, Inc. All Rights Reserved.” + iii. If you receive Seamless Materials, or any derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will not apply to you. + iv. You must retain in all copies of the Seamless Materials that you distribute the following attribution notice within a “Notice” text file distributed as a part of such copies: “Seamless is licensed under the Seamless Licensing Agreement, Copyright © Meta Platforms, Inc. All Rights Reserved.” + v. Your use of the Seamless Materials must comply with applicable laws and regulations (including Trade Control Laws)) and adhere to the Acceptable Use Policy for the Seamless Materials [https://ai.meta.com/resources/models-and-libraries/seamless-use-policy](https://ai.meta.com/resources/models-and-libraries/seamless-use-policy), which is hereby incorporated by reference into this Agreement. +2. Restrictions. You will not, and will not permit, assist or cause any third party to: + a. use the Seamless Materials or any outputs or results of the Seamless Materials in connection with any commercial uses or for any uses other than Noncommercial Research Uses; + b. utilize any equipment, device, software, or other means to circumvent or remove any security or protection used by Meta in connection with the Seamless Materials, or to circumvent or remove any usage restrictions, or to enable functionality disabled by Meta; + c. disguise your or their location through IP proxying or other methods; + d. use or download Seamless if you or they are: (a) located in a comprehensively sanctioned jurisdiction, (b) currently listed on any U.S. or non-U.S. restricted parties list, or (c) for any purpose prohibited by Trade Control Laws; or + e. directly or indirectly export, re-export, provide, or otherwise transfer Seamless Materials: (a) to any individual, entity, or country prohibited by Trade Control Laws; (b) to anyone on U.S. or non-U.S. government restricted parties lists; or (c) for any purpose prohibited by Trade Control Laws, including nuclear, chemical or biological weapons, or missile technology applications. +3. User Support. Your Noncommercial Research Use of the Seamless Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the Seamless Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind. +4. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE SEAMLESS MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE SEAMLESS MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE SEAMLESS MATERIALS AND ANY OUTPUT AND RESULTS. +5. Limitation of Liability. 
IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING. +6. Intellectual Property. + a. No trademark licenses are granted under this Agreement, and in connection with the Seamless Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Seamless Materials. + b. Subject to Meta’s ownership of Seamless Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Seamless Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications. + c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Seamless Materials or Seamless outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses and rights granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Seamless Materials. +7. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Seamless Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Seamless Materials. Sections 3, 4, 5, 6(c), 7, 8 and 9 shall survive the termination of this Agreement. +8. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement. +9. Modifications and Amendments. Meta may modify this Agreement from time to time by posting a revised version at [https://ai.meta.com/resources/models-and-libraries/seamless-license/](https://ai.meta.com/resources/models-and-libraries/seamless-license/); provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the Seamless Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta. 
diff --git a/seamless_communication/Seamless_Tutorial.ipynb b/seamless_communication/Seamless_Tutorial.ipynb new file mode 100644 index 0000000..e1dfb14 --- /dev/null +++ b/seamless_communication/Seamless_Tutorial.ipynb @@ -0,0 +1,2430 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "# Seamless Tutorial\n" + ], + "metadata": { + "id": "SbI8G4-0V1OG" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1p2d9R1LHJL2" + }, + "source": [ + "## Quick Links" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nLlZgJvBpWxT" + }, + "source": [ + "1. seamless_communication GitHub repository: https://github.com/facebookresearch/seamless_communication\n", + "2. fairseq2 Github repository: https://github.com/facebookresearch/fairseq2\n", + "3. HuggingFace: https://huggingface.co/collections/facebook/seamless-communication-6568d486ef451c6ba62c7724\n", + "4. Seamless demos: https://seamless.metademolab.com/\n", + "5. Fleurs datasets for evaluation: https://huggingface.co/datasets/google/fleurs/tree/main/data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YICcqOErh-om" + }, + "source": [ + "### Set up seamless_communication, fairseq2 and some utilities." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Ei8HSHamsBG" + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install fairseq2\n", + "!pip install pydub sentencepiece\n", + "!pip install git+https://github.com/facebookresearch/seamless_communication.git" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TWlkq20jms6V" + }, + "outputs": [], + "source": [ + "import io\n", + "import json\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "import mmap\n", + "import numpy\n", + "import soundfile\n", + "import torchaudio\n", + "import torch\n", + "\n", + "from collections import defaultdict\n", + "from IPython.display import Audio, display\n", + "from pathlib import Path\n", + "from pydub import AudioSegment\n", + "\n", + "from seamless_communication.inference import Translator\n", + "from seamless_communication.streaming.dataloaders.s2tt import SileroVADSilenceRemover" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j25uCSvKHRKu" + }, + "source": [ + "# SeamlessM4T Inference:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "06JLP7rIEzfP" + }, + "source": [ + "## Initialize the models:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fA4iPYnoMLkK", + "outputId": "c19ae7c7-c0c9-4b85-be2c-561f57d279f1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Downloading the checkpoint of seamlessM4T_v2_large...\n", + "100%|██████████| 8.45G/8.45G [01:14<00:00, 122MB/s]\n", + "Downloading the tokenizer of seamlessM4T_v2_large...\n", + "100%|██████████| 360k/360k [00:00<00:00, 10.7MB/s]\n", + "Downloading the tokenizer of seamlessM4T_v2_large...\n", + "100%|██████████| 4.93M/4.93M [00:00<00:00, 63.2MB/s]\n", + "Using the cached tokenizer of seamlessM4T_v2_large. 
Set `force` to `True` to download again.\n", + "Downloading the checkpoint of vocoder_v2...\n", + "100%|██████████| 160M/160M [00:00<00:00, 168MB/s]\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n" + ] + } + ], + "source": [ + "# Initialize a Translator object with a multitask model, vocoder on the GPU.\n", + "\n", + "model_name = \"seamlessM4T_v2_large\"\n", + "vocoder_name = \"vocoder_v2\" if model_name == \"seamlessM4T_v2_large\" else \"vocoder_36langs\"\n", + "\n", + "translator = Translator(\n", + " model_name,\n", + " vocoder_name,\n", + " device=torch.device(\"cuda:0\"),\n", + " dtype=torch.float16,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BU4xoNRpqEey" + }, + "outputs": [], + "source": [ + "# Download an english audio sample from the LJ speech dataset for testing purposes.\n", + "%%capture\n", + "!wget https://dl.fbaipublicfiles.com/seamlessM4T/LJ037-0171_sr16k.wav -O /content/LJ_eng.wav" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PoWClYZ6FP1a" + }, + "source": [ + "## S2ST inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "J_qeX25RnTr_", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 880 + }, + "outputId": "0828c902-5ae3-49be-ffef-4e73751aaccb" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "English audio:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Translated text in spa: El examen y testimonio de los expertos permitieron a la comisión concluir que cinco disparos pueden haber sido disparados.\n", + "\n", + "Translated audio in spa:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Translated text in fra: L'examen et le témoignage des experts ont permis à la commission de conclure que cinq coups de feu ont pu être tirés.\n", + "\n", + "Translated audio in fra:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Translated text in deu: Die Prüfung und das Zeugnis der Experten ermöglichten es der Kommission, zu dem Schluss zu kommen, dass fünf Schüsse abgefeuert wurden.\n", + "\n", + "Translated audio in deu:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Translated text in ita: L'esame e la testimonianza degli esperti hanno permesso alla commissione di concludere che cinque colpi possono essere stati sparati.\n", + "\n", + "Translated audio in ita:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + 
"output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Translated text in hin: विशेषज्ञों की जांच और गवाही ने आयोग को यह निष्कर्ष निकालने में सक्षम बनाया कि पांच गोलीबारी की गई हो सकती है।\n", + "\n", + "Translated audio in hin:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Translated text in cmn: 专家的检查和证词使委员会得出结论,可能有五次枪击.\n", + "\n", + "Translated audio in cmn:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n" + ] + } + ], + "source": [ + "# README: https://github.com/facebookresearch/seamless_communication/tree/main/src/seamless_communication/cli/m4t/predict\n", + "# Please use audios with duration under 20 seconds for optimal performance.\n", + "\n", + "# Resample the audio in 16khz if sample rate is not 16khz already.\n", + "# torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=16_000)\n", + "\n", + "print(\"English audio:\")\n", + "in_file = \"/content/LJ_eng.wav\"\n", + "display(Audio(in_file, rate=16000, autoplay=False, normalize=True))\n", + "\n", + "tgt_langs = (\"spa\", \"fra\", \"deu\", \"ita\", \"hin\", \"cmn\")\n", + "\n", + "for tgt_lang in tgt_langs:\n", + " text_output, speech_output = translator.predict(\n", + " input=in_file,\n", + " task_str=\"s2st\",\n", + " tgt_lang=tgt_lang,\n", + " )\n", + "\n", + " print(f\"Translated text in {tgt_lang}: {text_output[0]}\")\n", + " print()\n", + "\n", + " out_file = f\"/content/translated_LJ_{tgt_lang}.wav\"\n", + "\n", + " torchaudio.save(out_file, speech_output.audio_wavs[0][0].to(torch.float32).cpu(), speech_output.sample_rate)\n", + "\n", + " print(f\"Translated audio in {tgt_lang}:\")\n", + " audio_play = Audio(out_file, rate=speech_output.sample_rate, autoplay=False, normalize=True)\n", + " display(audio_play)\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VTD8l_vXFX5x" + }, + "source": [ + "## S2TT inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xpvz3VdcFbYA", + "outputId": "b91d60ca-fa55-414d-c56a-67da1d5eb691" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Translated text in arb: فحص وشهادة الخبراء مكنت اللجنة من الاستنتاج بأن خمس طلقات قد تم إطلاقها\n", + "\n", + "Translated text in rus: Исследование и свидетельские показания экспертов позволили комиссии заключить, что пять выстрелов, возможно, были сделаны\n", + "\n", + "Translated text in tgl: Ang pagsusuri at patotoo ng mga eksperto ay nagpahintulot sa komisyon na magtapos na limang pagbaril ang maaaring binaril.\n", + "\n", + "Translated text in ind: pemeriksaan dan kesaksian para ahli memungkinkan komisi untuk menyimpulkan bahwa lima tembakan mungkin telah ditembakkan\n", + "\n", + "Translated text in tam: நிபுணர்களின் பரிசோதனை மற்றும் சாட்சியம் ஐந்து துப்பாக்கிச் சூடுகள் நடத்தப்பட்டிருக்கலாம் என்று முடிவு செய்ய ஆணையத்திற்கு உதவியது.\n", + "\n", + "Translated text in kor: 전문가들의 조사와 증언은 위원회가 5발의 총격이 발사됐을 수 있다는 결론을 내릴 수 있게 해 ⁇ 습니다.\n", + "\n" + ] + } + ], + "source": [ + "tgt_langs = (\"arb\", \"rus\", \"tgl\", \"ind\", \"tam\", \"kor\")\n", + "in_file = \"/content/LJ_eng.wav\"\n", + "\n", + "for 
tgt_lang in tgt_langs:\n", + "\n", + " text_output, _ = translator.predict(\n", + " input=in_file,\n", + " task_str=\"s2tt\",\n", + " tgt_lang=tgt_lang,\n", + " )\n", + "\n", + " print(f\"Translated text in {tgt_lang}: {text_output[0]}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IBfkgdQlFcRV" + }, + "source": [ + "## ASR inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E-GkJ-GsFjwM", + "outputId": "191e3d49-ff61-4d3b-c235-1978ff522521" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Transcribed text in spa: El examen y testimonio de los expertos permitieron a la comisión concluir que cinco disparos pueden haber sido disparados.\n", + "\n", + "Transcribed text in fra: L'examen et le témoignage des experts ont permis à la commission de conclure que cinq coups de feu ont pu être tirés.\n", + "\n", + "Transcribed text in deu: Die Prüfung und das Zeugnis der Experten ermöglichten es der Kommission, zu dem Schluss zu kommen, dass fünf Schüsse abgefeuert wurden.\n", + "\n", + "Transcribed text in ita: L'esame e la testimonianza degli esperti hanno permesso alla commissione di concludere che cinque colpi possono essere stati sparati.\n", + "\n", + "Transcribed text in hin: विशेषज्ञों की जांच और गवाही ने आयोग को यह निष्कर्ष निकालने में सक्षम बनाया कि पांच गोलीबारी की जा सकती है।\n", + "\n", + "Transcribed text in cmn: 专家的检查和证词史委员会的出结论可能有五次枪击\n", + "\n" + ] + } + ], + "source": [ + "tgt_langs = (\"spa\", \"fra\", \"deu\", \"ita\", \"hin\", \"cmn\")\n", + "\n", + "for tgt_lang in tgt_langs:\n", + " in_file = f\"/content/translated_LJ_{tgt_lang}.wav\"\n", + "\n", + " text_output, _ = translator.predict(\n", + " input=in_file,\n", + " task_str=\"asr\",\n", + " tgt_lang=tgt_lang,\n", + " )\n", + "\n", + " print(f\"Transcribed text in {tgt_lang}: {text_output[0]}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1g3oeNp_Fj_m" + }, + "source": [ + "## T2ST inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 784 + }, + "id": "KivvCtS9FnH8", + "outputId": "34392c09-40d0-4da6-95b1-9639a9abe960" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Translated text in spa: Hola a todos, espero que todos estéis bien, gracias por asistir a nuestro taller.\n", + "\n", + "Translated audio in spa:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Translated text in fra: Bonjour à tous ! J'espère que vous allez bien. Merci d'avoir assisté à notre atelier.\n", + "\n", + "Translated audio in fra:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Translated text in deu: Hallo alle! Ich hoffe, dass es euch allen gut geht. 
Danke, dass ihr an unserem Workshop teilgenommen habt.\n", + "\n", + "Translated audio in deu:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Translated text in ita: Salve a tutti! Spero che stiate tutti bene. Grazie per aver partecipato al nostro workshop.\n", + "\n", + "Translated audio in ita:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Translated text in hin: हैलो सभी! मुझे आशा है कि आप सभी अच्छा कर रहे हैं। हमारी कार्यशाला में भाग लेने के लिए धन्यवाद।\n", + "\n", + "Translated audio in hin:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Translated text in cmn: 大家好!我希望你们都很好.谢谢你们参加我们的研讨会.\n", + "\n", + "Translated audio in cmn:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n" + ] + } + ], + "source": [ + "tgt_langs = (\"spa\", \"fra\", \"deu\", \"ita\", \"hin\", \"cmn\")\n", + "\n", + "for tgt_lang in tgt_langs:\n", + "\n", + " text_output, speech_output = translator.predict(\n", + " input=\"Hey everyone! I hope you're all doing well. Thank you for attending our workshop.\",\n", + " task_str=\"t2st\",\n", + " tgt_lang=tgt_lang,\n", + " src_lang=\"eng\",\n", + " )\n", + "\n", + " print(f\"Translated text in {tgt_lang}: {text_output[0]}\")\n", + " print()\n", + "\n", + " out_file = f\"/content/{tgt_lang}.wav\"\n", + "\n", + " torchaudio.save(out_file, speech_output.audio_wavs[0][0].to(torch.float32).cpu(), speech_output.sample_rate)\n", + "\n", + " print(f\"Translated audio in {tgt_lang}:\")\n", + " audio_play = Audio(out_file, rate=speech_output.sample_rate, autoplay=False, normalize=True)\n", + " display(audio_play)\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F_hA66F4Fnjk" + }, + "source": [ + "## T2TT (MT) inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wSrZZCIXFtFp", + "outputId": "fe1accd0-d038-455f-9781-bdc6893df7b0" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Translated text in arb: مرحباً للجميع! آمل أن تكونوا جميعاً بخير. شكراً لحضور ورشتنا.\n", + "\n", + "Translated text in rus: Привет всем! Надеюсь, вы все в порядке. Спасибо, что присутствовали на нашем семинаре.\n", + "\n", + "Translated text in ind: Hai semua orang! Saya harap kalian semua baik-baik saja. Terima kasih telah menghadiri lokakarya kami.\n", + "\n", + "Translated text in tam: எல்லோருக்கும் வணக்கம், நீங்கள் அனைவரும் நன்றாக இருக்கிறீர்கள் என்று நம்புகிறேன், எங்கள் பட்டறையில் கலந்து கொண்டதற்கு நன்றி.\n", + "\n", + "Translated text in kor: 안 ⁇ 하세요! 모두들 잘 지내셨으면 좋겠습니다. 
워크 ⁇ 에 참석해 주셔서 감사합니다.\n", + "\n" + ] + } + ], + "source": [ + "tgt_langs = (\"arb\", \"rus\", \"ind\", \"tam\", \"kor\")\n", + "\n", + "for tgt_lang in tgt_langs:\n", + "\n", + " text_output, speech_output = translator.predict(\n", + " input=\"Hey everyone! I hope you're all doing well. Thank you for attending our workshop.\",\n", + " task_str=\"t2tt\",\n", + " tgt_lang=tgt_lang,\n", + " src_lang=\"eng\",\n", + " )\n", + "\n", + " print(f\"Translated text in {tgt_lang}: {text_output[0]}\")\n", + " print()\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## UnitY2 aligner usage" + ], + "metadata": { + "id": "N_q-Ek9M9M36" + } + }, + { + "cell_type": "code", + "source": [ + "from seamless_communication.models.aligner.alignment_extractor import AlignmentExtractor\n", + "from fairseq2.typing import Device\n", + "import torch" + ], + "metadata": { + "id": "ttDrZ9nh9LhH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "alignment_extractor = AlignmentExtractor(\n", + " aligner_model_name_or_card=\"nar_t2u_aligner\",\n", + " unit_extractor_model_name_or_card=\"xlsr2_1b_v2\",\n", + " unit_extractor_output_layer=35,\n", + " unit_extractor_kmeans_model_uri=\"https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/kmeans_10k.npy\",\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "D-yY4y129WFD", + "outputId": "a3678919-f1e4-45e2-bd62-272d99df5424" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Using the cached checkpoint of nar_t2u_aligner. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of nar_t2u_aligner. Set `force` to `True` to download again.\n", + "Using the cached checkpoint of xlsr2_1b_v2. Set `force` to `True` to download again.\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n", + "WARNING:fairseq2.models:One or more operators in xlsr2_1b_v2 constructor do not support meta device. Skipping lazy initialization.\n", + "Using the cached checkpoint of https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/kmeans_10k.npy. Set `force` to `True` to download again.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# downloading en audio\n", + "! wget https://dl.fbaipublicfiles.com/seamlessM4T/LJ037-0171_sr16k.wav" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "N8FYd-Fg-YaE", + "outputId": "5ad68739-cabf-4561-acde-64fc5ff01be5" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-12-14 01:40:01-- https://dl.fbaipublicfiles.com/seamlessM4T/LJ037-0171_sr16k.wav\n", + "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.162.163.19, 3.162.163.34, 3.162.163.51, ...\n", + "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.162.163.19|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 485430 (474K) [audio/x-wav]\n", + "Saving to: ‘LJ037-0171_sr16k.wav’\n", + "\n", + "\rLJ037-0171_sr16k.wa 0%[ ] 0 --.-KB/s \rLJ037-0171_sr16k.wa 100%[===================>] 474.05K --.-KB/s in 0.04s \n", + "\n", + "2023-12-14 01:40:02 (10.8 MB/s) - ‘LJ037-0171_sr16k.wav’ saved [485430/485430]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# listen to the audio\n", + "en_transcription = \"the examination and testimony of the experts enabled the commision to conclude that five shots may have been fired.\"\n", + "audio_play = Audio(\"LJ037-0171_sr16k.wav\", rate=16_000, autoplay=False, normalize=True)\n", + "display(audio_play)\n", + "print(en_transcription)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 93 + }, + "id": "v5k0B1An_DOd", + "outputId": "706daf4a-a07b-4977-9a68-a09cd395a1c6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "the examination and testimony of the experts enabled the commision to conclude that five shots may have been fired.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "alignment_durations, _, tokenized_text_tokens = alignment_extractor.extract_alignment(\"LJ037-0171_sr16k.wav\", en_transcription, plot=True, add_trailing_silence=False)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "id": "GpmN4Ofs-ySY", + "outputId": "681d840e-a0c1-4724-aac0-704dfd6ec17d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAACI0AAAFUCAYAAACj216zAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOy9eZxc1Xnn/bvdQvsCWpAAIQEChADbscHG2NiO4y1xnMTG9pDFyyzJOIFMMpmZN/NO/L7jl0z2mYwTx2ISg0BIgBCSEBKL0AJa0L4v3Wot3ZJ671ZVr9Vrbff9o3xu37p17q176p5T97lV5/v56KPqU6ee57lnfe5ZDdM0TWg0Go1Go9FoNBqNRqPRaDQajUaj0Wg0Go1Go9FoqoqasA3QaDQajUaj0Wg0Go1Go9FoNBqNRqPRaDQajUaj0ZQfvWhEo9FoNBqNRqPRaDQajUaj0Wg0Go1Go9FoNBqNpgrRi0Y0Go1Go9FoNBqNRqPRaDQajUaj0Wg0Go1Go9FoqhC9aESj0Wg0Go1Go9FoNBqNRqPRaDQajUaj0Wg0Go2mCtGLRjQajUaj0Wg0Go1Go9FoNBqNRqPRaDQajUaj0WiqEL1oRKPRaDQajUaj0Wg0Go1Go9FoNBqNRqPRaDQajaYK0YtGNBqNRqPRaDQajUaj0Wg0Go1Go9FoNBqNRqPRaKoQvWhEo9FoNBqNRqPRaDQajUaj0Wg0Go1Go9FoNBqNpgqZVMqPstksOjo6MGvWLBiGIdsmjUaj0Wg0Go1Go9FoNBqNRqPRaDQajUaj0Wg0Gk0JmKaJRCKBW2+9FTU13meJlLRopKOjA7fffntJxmk0Go1Go9FoNBqNRqPRaDQajUaj0Wg0Go1Go9Fo1NLa2orFixd7xilp0cisWbMsBbNnz877bus/HcWv/4dPFPzGLZyHiAwRubIot84wntENlbbIkC2jnKnSF1XZonaoqutu4ZTykZItqmRTkaFSdiXZJ6P+hlFPqedBVO1QpbMa6h11nVTKdbVAJb2p1z0qOimlk6o+UkZfLaJPlDD8ABnPLiqbsoyo+mJhvIuKUG59YcgOQ2cU+4kwoNIWaYIRVju3Z88e3HLLLVi+fHlo9smgktoilXao9BVl2UJFbhTLMHV/nYpsSnMcKu2jPi4rQhTbIir6VFNJeSMDSjbzbBkcHMTtt99ure3woqRFI+xKmtmzZxcsGrlyJI7ZP5xd8Bu3cB4iMkTkyqLcOsN4RjdU2iJDtoxypkpfVGWL2qGqrruFU8pHSraokk1FhkrZlWSfjPobRj2lngdRtUOVzmqod9R1UinX1QKV9KZe96jopJROqvpIGX21iD5RwvADZDy7qGzKMqLqi4XxLipCufWFITsMnVHsJ8KASlukCUYY7dzhw4cxY8YMTJ8+HWNjY7j55ptDsU8GldQWqbRDpa8oyxYqcqNYhqn761RkU5rjUGkf9XFZEaLYFlHRp5pKyhsZULLZyxa2tsML78trSmDazMlC4UFliMiVRbl1hvGMbqi0RYZsGeVMlb6oyha1Q1VddwunlI+UbFElm4oMlbIryT4Z9TeMeko9D2RQSf5LNdQ76jqplOtqgUp6U697VHRSSidVfaSMvlpEnyhh+AEynl1UNmUZUfXFwngXpawvDNlh6IxiPxEGVNoiTTDCaOdOnDgBABgZGcG7774rTV8p8YNSSW2RG2H4RVFMV0pju+XWR91fpyKb0hxH0LiiOqm35W5EsS2iok81lZQ3MqBkc1BbDNM0TdEfDQ4OYs6cORgYGCg4aUSj0Wg0Go1Go9FoNBqNRqPRaDQaDS1WrlyJFStWYNasWWhoaMD3vve9sE3SaDQajUaj0ShCZE2H9JNG/vrbG4XCg8oQkSuLcusM4xndUGmLDNkyypkqfVGVLWqHqrruFk4pHynZoko2FRkqZVeSfTLqbxj1lHoeyKCS/JdqqHfUdVIp19UClfSmXveo6KSUTqr6SBl9tYg+UcLwA2Q8u6hsyjKi6ouF8S5KWV8YssPQGcV+IgyotEWaYITdzhXbS0qp/yi3Pir1Iwy/KIrpSmlst9z6qPvrVGRTmuMIGldUJ/W23I0otkVU9KmmkvJGBpRsDmqL9EUjqfG0UHhQGSJyZVFunWE8oxsqbZEhW0Y5U6UvqrJF7VBV193CKeUjJVtUyaYiQ6XsSrJPRv0No55SzwMZVJL/Ug31jrpOKuW6WqCS3tTrHhWdlNJJVR8po68W0SdKGH6AjGcXlU1ZRlR9sTDeRSnrC0N2GDqj2E+EAZW2SBOMMNs5P/faU+o/yq2PSv0Iwy+KYrpSGtsttz7q/joV2ZTmOILGFdVJvS13I4ptERV9qqmkvJEBJZuD2iJ90chHv3SXUHhQGSJyZVFunWE8oxsqbZEhW0Y5U6UvqrJF7VBV193CKeUjJVtUyaYiQ6XsSrJPRv0No55SzwMZVJL/Ug31jrpOKuW6WqCS3tTrHhWdlNJJVR8po68W0SdKGH6AjGcXlU1ZRlR9sTDeRSnrC0N2GDqj2E+EAZW2SBOMMNs5PzfWU+o/yq2PSv0Iwy+KYrpSGtsttz7q/joV2ZTmOILGFdVJvS13I4ptERV9qqmkvJEBJZuD2iJ90ciKT98uFB5UhohcWZRbZxjP6IZKW2TIllHOVOmLqmxRO1TVdbdwSvlIyRZVsqnIUCm7kuyTUX/DqKfU80AGleS/VEO9o66TSrmuFqikN/W6R0UnpXRS1UfK6KtF9IkShh8g49lFZVOWEVVfLIx3Ucr6wpAdhs4o9hNhQKUt0gSjkto5WTqp6KNSP8Lwi6KYrpTGdsutj3pdpyKb0hxH0LiiOqm35W5EsS2iok81lZQ3MqBkc1BbpC8aeeX/2ysUHlSGiFxZlFtnGM/ohkpbZMiWUc5U6YuqbFE7VNV1t3BK+UjJFlWyqchQKbuS7JNRf8Oop9TzQAaV5L9UQ72jrpNKua4WqKQ39bpHRSeldFLVR8roq0X0iRKGHyDj2UVlU5YRVV8sjHdRyvrCkB2Gzij2E2FApS3SBCPMdo5dT3P27Fkp+kqJH5RKaovcCMMvimK6UhrbLbc+6v46FdmU5jiCxhXVSb0tdyOKbREVfaqppLyRASWbg9oifdGIRqPRaDQajUaj0Wg0Go1Go9FoNBqasOtpPvjgg5At0Wg0Go1Go9GQwCyBgYEBE4A5MDBQ8N3Fo+3c37iFi8TlhYvIlUW5dYbxjG6otEWGbBnlTJW+qMoWtUNVXXcLp5SPlGxRJZuKDJWyK8k+GfU3jHpKPQ9kUEn+SzXUO+o6qZTraoFKelOve1R0UkonVX2kjL5aRJ8oYfgBMp5dVDZlGVH1xcJ4F6WsLwzZYeiMYj8RBlTaIk0wyt3OZbNZ86c//an5/vvvm0ePHjVXr15t/vSnPw3FPhlUUlvkRhh+URTTldLYbrn1UffXqcimNMcRNK6oTuptuRtRbIuo6FNNJeWNDCjZzLPFa02HE+knjVw62i4UHlSGiFxZlFtnGM/ohkpbZMiWUc5U6YuqbFE7VNV1
t3BK+UjJFlWyqchQKbuS7JNRf8Oop9TzQAaV5L9UQ72jrpNKua4WqKQ39bpHRSeldFLVR8roq0X0iRKGHyDj2UVlU5YRVV8sjHdRyvrCkB2Gzij2E2FApS3SBKPc7dzatWvz/jZ/ftpIc3NzYH2lxA9KJbVFboThF0UxXSmN7ZZbH3V/nYpsSnMcQeOK6qTelrsRxbaIij7VVFLeyICSzUFtkb5o5OT2JqHwoDJE5Mqi3DrDeEY3VNoiQ7aMcqZKX1Rli9qhqq67hVPKR0q2qJJNRYZK2ZVkn4z6G0Y9pZ4HMqgk/6Ua6h11nVTKdbVAJb2p1z0qOimlk6o+UkZfLaJPlDD8ABnPLiqbsoyo+mJhvItS1heG7DB0RrGfCAMqbZEmGOVu5xKJhPXZMAzr89tvvx1YXynxg1JJbZEbYfhFUUxXSmO75dZH3V+nIpvSHEfQuKI6qbflbkSxLaKiTzWVlDcyoGRzUFukLxqpncQX6RYeVIaIXFmUW2cYz+iGSltkyJZRzlTpi6psUTtU1XW3cEr5SMkWVbKpyFApu5Lsk1F/w6in1PNABpXkv1RDvaOuk0q5rhaopDf1ukdFJ6V0UtVHyuirRfSJEoYfIOPZRWVTlhFVXyyMd1HK+sKQHYbOKPYTYUClLdIEo5LaOVk6qeijUj/C8IuimK6UxnbLrY96Xacim9IcR9C4ojqpt+VuRLEtoqJPNZWUNzKgZHNQWwyTnUMnwODgIObMmYOBgQHMnj07kAEajUaj0Wg0Go1Go9FoNBqNRqPRaNSxcuVKAMD999+PWbNmoa6uDsPDwzAMA08++WTI1mk0Go1Go9FoZCOypkP68pf/9Z3NQuFBZYjIlUW5dYbxjG6otEWGbBnlTJW+qMoWtUNVXXcLp5SPlGxRJZuKDJWyK8k+GfU3jHpKPQ9kUEn+SzXUO+o6qZTraoFKelOve1R0UkonVX2kjL5aRJ8oYfgBMp5dVDZlGVH1xcJ4F6WsLwzZYeiMYj8RBlTaIk0wKqmdk6WTij4q9SMMvyiK6UppbLfc+qjXdSqyKc1xBI0rqpN6W+5GFNsiKvpUU0l5IwNKNge1RfqikZHBcaHwoDJE5Mqi3Dr96jNNE9lsloQtYcmWUc5U6YuqbFE7VNV1t3BK+UjJFlWyqchQKbuS7JNRf8Oop9TzQAaV5L9UQ72jrpNKua4WqKQ39bpHRSeldFLVR8roq0X0iRKGHyDj2UVlU5YRVV8sjHdRyvrCkB2Gzij2E2FApS3SBKOS2jlZOqnoo1I/wvCLopiulMZ2y62Pel2nIpvSHEfQuKI6qbflbkSxLaKiTzWVlDcyoGRzUFukLxp54DNLhMKDyhCRK4ty6/Srr66uDocOHSJhS1iyZZQzVfqiKlvUDlV13S2cUj5SskWVbCoyVMquJPtk1N8w6in1PJBBJfkv1VDvqOukUq6rBSrpTb3uUdFJKZ1U9ZEy+moRfaKE4QfIeHZR2ZRlRNUXC+NdlLK+MGSHoTOK/UQYUGmLNMEIu50rdms9pf6j3Pqo1I8w/KIopiulsd1y66Pur1ORTWmOI2hcUZ3U23I3otgWUdGnmkrKGxlQsjmoLZMk2WHx8V+9Ryg8qAwRubIot04/+gYGBtDf30/CljBlyyhnqvRFVbaoHarquls4pXykZIsq2VRkqJRdSfbJqL9h1FPqeSCDSvJfqqHeUddJpVxXC1TSm3rdo6KTUjqp6iNl9NUi+kQJww+Q8eyisinLiKovFsa7KGV9YcgOQ2cU+4kwoNIWaYJRznZuy5YtQrpE9ZUSPyiV1Ba5EYZfFMV0pTS2W2591P11KrIpzXEEjSuqk3pb7kYU2yIq+lRTSXkjA0o2B7VF+kkjq//v94TCg8oQkSuLcut009fV1YUzZ84AABoaGnD58uXQbKEiW0Y5U6UvqrJF7VBV193CKeUjJVtUyaYiQ6XsSrJPRv0No55SzwMZVJL/Ug31jrpOKuW6WqCS3tTrHhWdlNJJVR8po68W0SdKGH6AjGcXlU1ZRlR9sTDeRSnrC0N2GDqj2E+EAZW2SBOMcrZzbW1tQrpE9ZUSPyiV1Ba5EYZfFMV0pTS2W2591P11KrIpzXEEjSuqk3pb7kYU2yIq+lRTSXkjA0o2B7VF+qIRTflIJBLo7u4O2wyNRqPRaDQajUaj0Wg0Go1Go9FoNBqNRqPRaDRRxCyBgYEBE4A5MDBQ8F3dB83c37iFi8TlhYvIlUW5dbrpu3TpkvnSSy+Zly9fNg8dOmSuWrXK3LdvXyi2UJEto5yp0hdV2aJ2qKrrbuGU8pGSLapkU5GhUnYl2Sej/oZRT6nngQwqyX+phnpHXSeVcl0tUElv6nWPik5K6aSqj5TRV4voEyUMP0DGs4vKpiwjqr5YGO+ilPWFITsMnVHsJ8KASlukCUY527mf/vSn1r/du3ebx44dM59//nnzpz/9qbly5cqy2yeDSmqL3AjDL4piulIa2y23Pur+OhXZlOY4gsYV1Um9LXcjim0RFX2qqaS8kQElm3m2eK3pcCL9pJH2iz1C4UFliMiVRbl1eumLx+NoaWkBABiGAdM0Q7OFgmwZ5UyVvqjKFrVDVV13C6eUj5RsUSWbigyVsivJPhn1N4x6Sj0PZFBJ/ks11DvqOqmU62qBSnpTr3tUdFJKJ1V9pIy+WkSfKGH4ATKeXVQ2ZRlR9cXCeBelrC8M2WHojGI/EQZU2iJNMMJo5wzDUKKvlPhBqaS2yI0w/KIopiulsd1y66Pur1ORTWmOI2hcUZ3U23I3otgWUdGnmkrKGxlQsjmoLdIXjRzeclEoPKgMEbmyKLdOL31ejv7IyAjOnTtXNlsoyJZRzlTpi6psUTtU1XW3cEr5SMkWVbKpyFApu5Lsk1F/w6in1PNABpXkv1RDvaOuk0q5rhaopDf1ukdFJ6V0UtVHyuirRfSJEoYfIOPZRWVTlhFVXyyMd1HK+sKQHYbOKPYTYUClLdIEo9ztnOiGQ0r9R7n1UakfYfhFUUxXSmO75dZH3V+nIpvSHEfQuKI6qbflbkSxLaKiTzWVlDcyoGRzUFukLxrRlA+7k+9cPBKLxbBx40Y0NDSU2yyNRqPRaDQajUaj0Wg0Go1Go9EQQvUJ1RqNRqPRaDSa6GKYJXiLg4ODmDNnDgYGBjB79uy87zKZLGprC9eiuIXzEJEhIlcW5dbppu/SpUt499138cADD2DatGm4ePEili1bhs9+9rPo7OzEm2++iTlz5uCJJ55QbgsV2TLKmSp9UZUtaoequu4WTikfKdmiSjYVGSplV5J9MupvGPWUeh5E1Q5VOquh3lHXSaVcVwtU0pt63aOik1I6qeojZfTVIvpECcMPkPHsorIpy4iqLxbGu6gI5dYXhuwwdEaxnwgDKm2RJhjlaOey2SxM08Q///M/W9898MADmDlzJs6dO4eRkREYhoEnn3yyrPbJoJLaIpV2qPQVZdlCRW4UyzB1f52KbEpzHCrtoz4uK0IU2yIq+lRTSXkjA0o282zxWtPhRPpTrPzB20LhQWWIyJVFuXW66WOni7S0tGD
nzp2h2kJFtoxypkpfVGWL2qGqrruFU8pHSraokk1FhkrZlWSfjPobRj2lngcyqCT/pRrqHXWdVMp1tUAlvanXPSo6KaWTqj5SRl8tok+UMPwAGc8uKpuyjKj6YmG8i1LWF4bsMHRGsZ8IAyptkSYY5Wjnzp07h0OHDnHj2PeSHjx4EKlUqmz2yaCS2iI3wvCLopiulMZ2y62Pur9ORTalOY6gcUV1Um/L3YhiW0RFn2oqKW9kQMnmoLZIXzTSf31YKDyoDBG5sii3Tjd9pmkK30WpyhYqsmWUM1X6oipb1A5Vdd0tnFI+UrJFlWwqMlTKriT7ZNTfMOop9TyQQSX5L9VQ76jrpFKuqwUq6U297lHRSSmdVPWRMvpqEX2ihOEHyHh2UdmUZUTVFwvjXZSyvjBkh6Eziv1EGFBpizTBKEc753fcuKGhAZlMpmR9pcQPSiW1RW6E4RdFMV0pje2WWx91f52KbEpzHEHjiuqk3pa7EcW2iIo+1VRS3siAks1BbZG+aOTeT9wmFB5UhohcWZRbZzF9zPln//f29qKhoSEUW8KWLaOcqdIXVdmidqiq627hlPKRki2qZFORoVJ2Jdkno/6GUU+p54EMKsl/qYZ6R10nlXJdLVBJb+p1j4pOSumkqo+U0VeL6BMlDD9AxrOLyqYsI6q+WBjvopT1hSE7DJ1R7CfCgEpbpAlGGO0cO7latr5S4gelktoiN8Lwi6KYrpTGdsutj7q/TkU2pTmOoHFFdVJvy92IYltERZ9qKilvZEDJ5qC2GGYJx1R43X/T2dSLW5bNLfiNWzgPERkicmVRbp08fU1NTTh58iS6urowffp0tLW1YcmSJViyZAluvvlmHD9+HKlUCnPmzMETTzyh1BZKsmWUM1X6oipb1A5Vdd0tnFI+UrJFlWwqMlTKriT7ZNTfMOop9TyIqh2qdFZDvaOuk0q5rhaopDf1ukdFJ6V0UtVHyuirRfSJEoYfIOPZRWVTlhFVXyyMd1ERyq0vDNlh6IxiPxEGVNoiTTDK0c6dOXMGiUQCZ86cAZBbNHL//fdj5syZOHv2LEZHR2EYBqZMmYLf+Z3fwdSpU8tinwwqqS1SaYdKX1GWLUF5/62D+KWvfSqwnCiWYer+OhXZlOY4VNpHfVxWhCi2RVT0qaaS8kYGlGzm2eK1psOJ9JNGnv2THULhQWWIyJVFuXXy9A0PD2NwcBBA/grxa9eu4eDBg9bfo6OjOH78uFJbKMmWUc5U6YuqbFE7VNV1t3BK+UjJFlWyqchQKbuS7JNRf8Oop9TzQAaV5L9UQ72jrpNKua4WqKQ39bpHRSeldFLVR8roq0X0iRKGHyDj2UVlU5YRVV8sjHdRyvrCkB2Gzij2E2FApS3SBCPMds7PiSOU+o9y6tu0aROZ+hGGXxS1Nt40Tbz03/bh4MGDSKVSgWRFsQxT99epyKY0xxE0rqhO6m25G1FriyjpU00l5Y0MKNkc1Bbpi0Y05cHu2NsPi3EeHDM+Po6LFy+WzS6NRqPRaDQajUaj0Wg0Go1Go9HQgDd2zDt8vIQDySuSrq4uZLMmxsbGSvp9IpFAIpGQbJXGjbVr1wIAGhoa0NHRgba2tpAt0mg0Go0mmkhfNPLrf/yIUHhQGSJyZVFunW76TNP0tSpcpsOv8tllyJZRzlTpi6psUTtU1XW3cEr5SMkWVbKpyFApu5Lsk1F/w6in1PNABpXkv1RDvaOuk0q5rhaopDf1ukdFJ6V0UtVHyuirRfSJEoYfIOPZRWVTlhFVXyyMd1HK+sKQHYbOKPYTYUClLdIEI6x2jo0n28eMeWPMlPqPcuurues6Vq1aJfy7lpYW1NXVSdvEGYZfFKU2/p133sHQ0BCWfj53rVJHRwdaWlpKlhfFMkzdX6cim9IcR9C4ojqpt+VuRKktoqZPNZWUNzKgZHNQW6QvGunr5K+idQsPKkNErizKrVNEn59FJEFQ+ewyZMsoZ6r0RVW2qB2q6rpbOKV8pGSLKtlUZKiUXUn2yai/YdRT6nkgg0ryX6qh3lHXSaVcVwtU0pt63aOik1I6qeojZfTVIvpECcMPkPHsorIpy4iqLxbGuyhlfWHIDkNnFPuJMKDSFmmCEWY752cMmVL/UW5944PZkn63Y8cOpNNpaXaE4RdFqY2/evUqAGB8oLT8chLFMkzdX6cim9IcR9C4ojqpt+VuRKktoqZPNZWUNzKgZHNQW6QvGvngtfNC4UFliMiVRbl1iuhjp4/4PYVEpS1hyJZRzlTpi6psUTtU1XW3cEr5SMkWVbKpyFApu5Lsk1F/w6in1PNABpXkv1RDvaOuk0q5rhaopDf1ukdFJ6V0UtVHyuirRfSJEoYfIOPZRWVTlhFVXyyMd1HK+sKQHYbOKPYTYUClLdIEoxztnH18WHSsmFL/UW59nSfHAeSufs9m5SxIKIUw/KIotvFdJ5MAgp+4HsUyTN1fpyKb0hxH0LiiOqm35W5EsS2iok81lZQ3MqBkc1BbpC8a0ahnbGzM9U5FtmAE0PdQajQajUaj0Wg0Go1Go9FoNBqNpvhYsR5LLmTnzp24evUqhoaGwjZFI0h9fX3YJmg0Go1GEykMswRvcHBwEHPmzMHAwABmz56d993YSApTp99Q8Bu3cB4iMkTkyqLcOp36jhw5gvPnzyOTyWB8fBxTp05FW1sblixZgmw2i0wmg0mTJiGdTsM0TcyYMQPf+c53lNgiExmyZZQzVfqiKlvUDlV13S2cUj5SskWVbCoyVMquJPtk1N8w6in1PIiqHap0VkO9o66TSrmuFqikN/W6R0UnpXRS1UfK6KtF9IkShh8g49lFZVOWEVVfLIx3URHKrS8M2WHojGI/EQZU2iJNMFS3c5MmGzh//jz6+/tx7tw566SRBx54ADNnzsTp06cxNjYGwzAwbdo0PPHEE5g+fXpZ7JOBKn0rV65ENgXUTjawZMkSzJs3D11dXfjGN77h+TvTNLFq1SosX74c06ZNw8MPPxzYljD8oii18StXroRhGEiPZzFj9jTcf//9ME0TK1aswLp16/Dkk0+Gal859FH316nIpjTHodI+6uOyIkSpLaKmTzWVlDcyoGQzzxavNR1OpJ808sL/tUsoPKgMEbmyKLfOUvXZjxwcHx/H7t270dfXF4ot5ZIto5yp0hdV2aJ2qKrrbuGU8pGSLapkU5GhUnYl2Sej/oZRT6nngQwqyX+phnpHXSeVcl0tUElv6nWPik5K6aSqj5TRV4voEyUMP0DGs4vKpiwjqr5YGO+ilPWFITsMnVHsJ8KASlukCYbqdu65554rCBfZP0qp/yi3vobNwwVp1dHR4fmbF1980fo8MjKC7du3B7YjDL8oim38hc0jeX+/8sorJcmJYhmm7q9TkU1pjiNoXFGd1NtyN6LYFlHRp5pKyhsZULI5qC3SF43EWgeEwoPKEJEri3LrFNFnd2btn1OpFM6fP4+tW7eWzZYwZMsoZ6r0RVW2qB2q6rpbOKV8pGSLKtlUZKiUXUn2yai/YdRT6nkgg0ryX6qh3lHXSaVcVwtU0pt63aOik1
I6qeojZfTVIvpECcMPkPHsorIpy4iqLxbGuyhlfWHIDkNnFPuJMKDSFmmCobqdy2QyoiaVrK+U+EFRoS+bzQIAxvqyBd9t3rzZ87cjI7mFC6ZpIpPJIBaLBbYnDL8oim08y6+gVyxFsQxT99epyKY0xxE0rqhO6m25G1Fsi6joU00l5Y0MKNkc1Bbpi0bu/MhCofCgMkTkyqLcOnn63Bwg++kizrj236TTaezfv1+KLbKQIVtGOVOlL6qyRe1QVdfdwinlIyVbVMmmIkOl7EqyT0b9DaOeUs8DGVSS/1IN9Y66Tirlulqgkt7U6x4VnZTSSVUfKaOvFtEnShh+gIxnF5VNWUZUfbEw3kUp6wtDdhg6o9hPhAGVtkgTjHK3c6ZpWmPHfibXKfUf5dA3MjKCV199FQAw+7ZJAFCQXnV1dUXlGIYRePECIwy/KIpt/KzFtVLkRLEMU/fXqcimNMcRNK6oTuptuRtRbIuo6FNNJeWNDCjZHNQWwyzBg/G6/ybeNoj5iwvvxHEL5yEiQ0SuLMqt06nvyJEjqK+vRzabxfj4OKZOnYq2tjYsWbIEmUwG2WwWkyZNsj7X1NTgpptuQldXF2bNmoXvf//7GBsbw8svv4x/9+/+XSBbZCJDtoxypkpfVGWL2qGqrruFU8pHSraokk1FhkrZlWSfjPobRj2lngdRtUOVzmqod9R1UinX1QKV9KZe96jopJROqvpIGX21iD5RwvADZDy7qGzKMqLqi4XxLipCufWFITsMnVHsJ8KASlukCYbqdm79lrX4zGc+g/7+fpw7d85azPChD30I06dPx5kzZzA2NgbDMDBt2jQ88cQTmD59elnsk4FsfcPDw9iwYQOGh4cx1p/F1BtrsHTpUsybNw+dnZ3o7OwEADz11FMFv929ezcaGhowefJkLF++HKlUCu3t7fjud78byKYw/KIotfErV66EYRgY6c3gplum4/7774dpmjh16hQMw8CTTz4JABgbG8PVq1fR29uLT3/602WzrxhU2nJKPpcq2ZTmOFTaR31cVoQotUXU9KmmkvJGBpRs5tnitabDifSTRp558h2h8KAyROTKotw6vfTxThZxkslk0N3dDSDnIG3evBmbNm2SbktQZMiWUc5U6YuqbFE7VNV1t3BK+UjJFlWyqchQKbuS7JNRf8Oop9TzQAaV5L9UQ72jrpNKua4WqKQ39bpHRSeldFLVR8roq0X0iRKGHyDj2UVlU5YRVV8sjHdRyvrCkB2Gzij2E2FApS3SBKPc7VyxvaPXrl3D2bNnS9JXSvygqNRXv34472/7GDxbPMLYtm0bzp8/z43vjCtKGH5RFNr4dDqNeDxu/X3+5/nlVsaHh4dx5swZnDlzpiz2+YVKW07J51Ilm9IcR9C4ojqpt+VuRKEtoqpPNZWUNzKgZHNQW6QvGtGEi9ciEvt9iv39/dKOydNoNBqNRqPRaDQajUaj0Wg0Gg0tnGPFXmPHiUQCfX19qk2KPK+//nre31euXPEdVyOHvr4+7N69uyCcdzWQngPRaDQajcYf0heN/MoPHhIKDypDRK4syq3TTR/PyXdzglhcPyeTlGKLDGTIllHOVOmLqmxRO1TVdbdwSvlIyRZVsqnIUCm7kuyTUX/DqKfU80AGleS/VEO9o66TSrmuFqikN/W6R0UnpXRS1UfK6KtF9IkShh8g49lFZVOWEVVfLIx3Ucr6wpAdhs4o9hNhQKUt0gSjXO0cGy/2Ghdm39nHlin1H+XWt+SxqQD46cLo7+8vOGHELW6phOEXRbGNv/2xKa7f7dixAx0dHb7yJYplmLq/TkU2pTmOoHFFdVJvy92IYltERZ9qKilvZEDJ5qC2SF80Mj6SEgoPKkNErizKrdNNH8/RCboopFRbqMiWUc5U6YuqbFE7VNV1t3BK+UjJFlWyqchQKbuS7JNRf8Oop9TzQAaV5L9UQ72jrpNKua4WKKR3NpslX/eo6KSUTqr6SBl9tYg+UcLwA2Q8u6hsyjKi6ouF8S5KWV8YssPQGcV+IgyotEWaYKhs58aGkwVhxSbNnePKlPqPcuvLJL3Tqq6uDm1tbdxTRtzG5xsbG4XtCMMvotjG79+/3/P7jK24F5Tj8XFkMhlfeqJYhqn761RkU5rjCBpXVCf1ttwN1W1RNptVJp+nr5Kg2E+ECSWbg9oifdHI+2vPCoUHlSEiVxbl1umlT2SRiIxVziqfXYZsGeVMlb6oyha1Q1VddwunlI+UbFElm4oMlbIryT4Z9TeMeko9D2RQSf5LNdQ76jqplOtqIez0Hh4exrp168jXPSo6KaWTqj5SRl8tok+UMPwAGc8uKpuyjKj6YmG8i1LWF4bsMHRGsZ8IAyptkSYYKtu5NX+9zXfcWCxmfbaPH1PqP8qtr/3oOAD38fTLly9jaGhISOaOHTuE7QjDL6LYxp89W/g7e950uORXOp0W0hPFMkzdX6cim9IcR9C4ojqpt+VuqG6LVq9erUw+T18lQbGfCBNKNge1RfqiEU0wjhw5Euj3flaMm6ap/EQSjUaj0Wg0Go1GIx/TNIUHPzUajUaj0Wg01UUmnS4Y/7X/7RxDbm5uxvXr18tiW7XxzjvvWJ/T6bTU62s0E/CuElq1alVY5mg0GuKMjY2FbYJGQw7DLMFLGRwcxJw5czAwMIDZs2fnfZfoHcWsudMKfuMWzkNEhohcWajUuXLlSjz11FOe+o4cOYK6ujoAQDKZxOTJk9HW1oYlS5Ygk8kgm81i0qRJ1mfTNJHNZpFOpzFlSu5+P/b5d3/3d4XsU/nsMmTLKGeq9EVVtqgdquq6WzilfKRkiyrZVGSolF1J9smov2HUU+p5EFU7VOmshnpHXSeVcl0tlDu9h4eH0dLSghUrVgAAhoaGsGnTJtw4Yz5+41u/Gki2rh/lk+Emh0pfLaJPlDD8ABnPLiqbsoyo+mJhvIuKUG59YcgOQ2cU+4kwoNIWaYKhsp378d/9E6bMrMVnPvMZ9Pb2WmPJhmHgwQcfxLRp03D27FmMjY0hHo8DAJYvX46FCxfi85//vHL7ZCBb3/DwMDZs2IDh4WGkRkzcMN3AHXfcgblz56KzsxOdnZ1W3FtvvRW33HIL4vE4mpubAeTSdvLkyVi+fDnS6TTa2towODgIAHjqqafwzDPP4I477sCKFStw5513lu0ZVfqKsmwpxjPPPIMnn3zS+jsWi2HPnj24fv06DMPA+FAGc+bPwIoVK5DNZnH69GlrEcnixYuxZMkSNDQ0oK+vD9/5zncK5rSC2lcqVNpySj6XKtmU5jhU2kd9XFYE1W3R2ldfyGtXVEIlTWVBsZ8IE0o282zxWtPhRPpJI6/+xQdC4UFliMiVRbl1iuhzO0HENM2CVcyZTAYXL15UZosoMmTLKGeq9EVVtqgdquq6WzilfKRkiyrZVGSolF1J9smov2HUU+p5IINK8l+qod5R10mlXFcL5U7vwcFBNDQ0FITv/D/nA8vW9aN8MtzkUOmrRfSJEoYfIOPZRWVTlhFVXyyMd1HK+sKQHYbOKPYTYUClLdIEQ2U71/Tuq
PXZPibstX+UnVBdDvtkoFJfoy39SsWellu2bCkI80MYflEU2/gmgfxau3at63dRLMPU/XUqsinNcQSNK6qTelvuhio7TNPEq/8jenWdElHsJ1RCyeagtkhfNNLZ2CsUHlSGiFxZlFunU5/zehnmbLa0tLg6niy+/XepVAr79+8PZItMZMiWUc5U6YuqbFE7VNV1t3BK+UjJFlWyqchQKbuS7JNRf8Oop9TzQAaV5L9UQ72jrpNKua4WqKT3SCwTWIauH+WT4SaHSl8tok+UMPwAGc8uKpuyjKj6YmG8i1LWF4bsMHRGsZ8IAyptkSYYKtu5kVi2IKzYdeXO7yn1H6r1sdO6GX58bZEFIG1tbdbndDqNkZERX78Lwy+KShufn1/ZgrBSiGIZpu6vU5FNaY4jaFxRndTbcjdU2XH+/HmcP94EANiwYYMSHU6opKksotJPlAtKNge1RfqikcXL5wuFB5UhIlcW5dbJ08ccIJ6zX+wFwOk89fX1BbJFFjJkyyhnqvRFVbaoHarquls4pXykZIsq2VRkqJRdSfbJqL9h1FPqeSCDSvJfqqHeUddJpVxXC2Gld3d3N/bu3YtMJjeAPePm2sAydf0onww3OVT6ahF9ooThB8h4dlHZlGVE1RcL412Usr4wZIehM4r9RBhQaYs0wVDZzs1YWGudHGIfK7Z/bmtrQzabv7jEPm5Mqf9Qre/YsWM4ffq09fwyfG03mpqacODAAV9xw/CLotDGO+c/Ziz0zq9i8yWMKJZh6v46FdmU5jiCxhXVSb0td0OVHel0Gjctzl3fcf36dSU6nFBJU1lEoZ8oJ5RsDmqLYZaw/NLr/puB+AjmzJ9e8Bu3cB4iMkTkykKlzqeffho/+tGPPPUdPnwY9fX1AIBkMokbbrgB7e3tAHJ39GWzWdTW1iKTyVirpLPZLJLJJKZPn45sNmt9N2XKFJimiR/84Ae+7FP57DJkyyhnqvRFVbaoHarquls4pXykZIsq2VRkqJRdSfbJqL9h1FPqeRBVO1TprIZ6R10nlXJdLZQ7vTs7O3Ho0CF0dXVh0qRJ+Na3voU333wTfd0J/Mc//cNAsnX9KJ8MNzlU+moRfaKE4QfIeHZR2ZRlRNUXC+NdVIRy6wtDdhg6o9hPhAGVtkgTDJXt3I//9p8wdfYkfPrTn0Zvby/q6+utRSQf/vCHMXXqVLzzzjt5v1mxYgXmzZuHL3zhC8rtk4FMfYcOHUI6nUZjYyNGRkaQGjZxwwwDd9xxB+bOnYvOzk50dnZa8W+99VbccsstiMfjaG5uBpBbmDB58mQsX74c6XQara2tSCQS1m8MIyevpqYGtbW1+NKXvlSWZ1TpK8qypRjPPPMMnnzySevvWCyG3bt3IxaLwTAMjCcymLNgBlasWIFsNovTp09bC0UWL16MpUuX4vz58+jr64NpmnjqqacAALt27cIXv/jFwPaVCpW2nJLPpUo2pTkOlfZRH5cVQZUdZ86cQXdrDxpbLljtQSaTQW2t9+Iztsiypkb8LAYqaSoLiv1EmFCymWeL15oOJ9JPGvnHf7tVKDyoDBG5sii3Ti99zlWyxdYA+V1VW4otQZEhW0Y5U6UvqrJF7VBV193CKeUjJVtUyaYiQ6XsSrJPRv0No55SzwMZVJL/Ug31jrpOKuW6Wggzve2+/NmXhgLL0/WjfDLc5FDpq0X0iRKGHyDj2UVlU5YRVV8sjHdRyvrCkB2Gzij2E2FApS3SBENlO3fupWFRc5BMJnHu3LmS9JUSPygy9TlPZZHha/NwnvxSjDD8oii28ay8+90fbZomTNPExYsX88KjWIap++tUZFOa4wgaV1Qn9bbcDZV2vPM3F6zPAwMD3Gtq2IJAAHj99ddx4sQJnDx5EgCQSqWErsOikqayiGI/oRJKNge1RfqiEU35EXGGeGSzWVy4cIH7nUaj0Wg0Go1Go6FD0Hu6NRqNRqPRaDTVgde15jyfcnx8HAMDA8rtqhS0Xx4ODQ0NaGpqCiTjwIEDuHTpUkF4X19fILkajSZ6sJsa2O0OAPDuu+/irbfeQjKZRF1dHTo7O61bHTZu3Ih169ZhZGQkRKs1GjVIXzTyxX/9EaHwoDJE5Mqi3DpF9Hk5/k5YnGw2i927d+PixYswTROxWEyKLaLIkC2jnKnSF1XZonaoqutu4ZTykZItqmRTkaFSdiXZJ6P+hlFPqeeBDCrJf6mGekddJ5VyXS1QSG/DMLD4k1MCy9H1o3wy3ORQ6atF9IkShh8g49lFZVOWEVVfLIx3Ucr6wpAdhs4o9hNhQKUt0gRDZTt3Wwi+YiXUXzZeLsPX5iF6CngYfhH1Nr6vrw+Dg4N5Yay8+03fVCqFTCZTED7zvrHA9olApS2n5HOpkk1pjiNoXFGd1NtyN1Ta8aFfuaUgbM+ePRgbG0Mmk7EWpo2Pj1unizC6u7utvuLIkSO+9N33xfkBLaYF9X6i3FCyOagt0heN1N7Av/fJLTyoDBG5sii3Tru+8fFxpNNp17issXJbPMLCnf9ns1ns2bMHmUwGGzZswPHjx4vaIhsZsmWUM1X6oipb1A5Vdd0tnFI+UrJFlWwqMlTKriT7ZNTfMOop9TyQQSX5L9VQ76jrpFKuq4VypveePXtcvzNqg107mRNiYmhIzdHbbkSxfqj0Fan01SL6RAnDD5Dx7KKyKcuIqi8WxrsoZX1hyA5DZxT7iTCg0hZpgqGynTNKyF7nhDul/kOlvoGBgcIx9FqjYPzcidtJLqJX0HgRhl8UxTbeT3n3s8m2lHoTBCptOSWfS5VsSnMcQeOK6qTelruh0o6aSfw2es+ePbh69apvOSdOnACQm1/lndTV19eH3t5e1DfUlWYoUaLYT6iEks1BbZG+aGT7syeFwoPKEJEri3LrtOs7dOiQdSIIMOGI2nFbKGJ3WL2cpFQqlXd/pZstspEhW0Y5U6UvqrJF7VBV193CKeUjJVtUyaYiQ6XsSrJPRv0No55SzwMZVJL/Ug31jrpOKuW6WihnerPjUZ0LwwGg9UDwXXBbVx7Gvn37AssRIYr1Q6WvSKWvFtEnShh+gIxnF5VNWUZUfbEw3kUp6wtDdhg6o9hPhAGVtkgTDJXtXNuBcetzqdeoUOo/VOrbvHkzxsbG8sbOWw+MWekmkn7237j9TkReGH4R9TaeNx/CyrvftHVb1GOvN+WASltOyedSJZvSHEfQuKI6qbflbqi048ybHb7jFmtXkskkEokE3nzzTTQ2NlrhV65cQVNTEy5fvixl/IYS1PuJckPJ5qC2SF80olGPYRh5jo3zc6krmTOZDNavXx/YPo1Go9FoNBqNRiOfvMGK0sb+NRqNRqPRaDRVgttJGBo+stJGp3H4iJb98fFxZLNZlSZpNJoK5dlnn7U+b9++Hd3d3Th37hy2bduWF6+pqQktLS3lNk+jEcIwS1hqPDg4iDlz5mBgYACzZ8/O+66nI4F5t84q+I1bOA8RGSJyZaFS59NPP40f/ehHrvr27NmDK1euWAPG6XQaNTU16OjIrYy77bbbYJomamtrkclkkMlkYBgGMpkM
xsbGMGfOHCSTSWSzWaTTacyYMQPj4xMraA3DQCqVwqxZs/Bv/+2/Leuzy5Ato5yp0hdV2aJ2qKrrbuGU8pGSLapkU5GhUnYl2Sej/oZRT6nnQVTtUKWzGuoddZ1UynW1UM70XrlyJR5//HEcPHgQ3d3duOGGG/D444/j7bffRqx1AP/5h/8hkPzTh+vR0duMr371q5IsLk4U64dKX5FKXy2iT5Qw/AAZzy4qm7KMqPpiYbyLilBufWHIDkNnFPuJMKDSFmmCobKd+/u//CfMnD8Zjz76KHp6enD+/Hlrs+GDDz6IKVOmFExo3X777WhtbcXHP/5xfPWrXyXVf6jUt3r1aixduhS1tbVoamrCyMgIxgdMTJlj4I477sDcuXPR2dmJzs5O6ze33norFi1ahJ6eHjQ3NwPIjatPnjwZy5cvRyaTQUtLCxKJhPUbw8jJq62thWEY+PKXv1yWZ1TpK8qypRjPPPMMnnzySQC5k9gHBwcxMDCAWCwGwzAw2pfBTbfMwIoVK5DNZnH69GlrkcjixYuxdOlS1NfXo7+/H6Zp4oEHHsDNN9+M3bt346mnnsKmTZvw2GOP4aXnXgv8jiUClbacks+lSjalOQ6V9lEflxVBlR1nzpxBx9U4rnbkbnT47d/+bWzbtg19fX1YtmwZ7r77bmzfvh0A8L3vfQ+bN29GIpHAww8/jJqaGhw9ehQzZ87Et771Lbz44oswTRPf/e53sWXLFgwODuJLX/oSrl27hsuXL+ORRx5BJpPBgfeO4dHPP4QpU6bgYx/7mPRnKjcU+4kwoWQzzxavNR1OpJ808tZPjwmFB5UhIlcWqnSyhR8y9dnXBInep2iaJlKplDRbiiFDtoxypkpfVGWL2qGqrruFU8pHSraokk1FhkrZlWSfjPobRj2lngcyqCT/pRrqHXWdVMp1taA6vf2u6W/ZF/zo5L1rLgaWIUoU64dKX5FKXy2iT5Qw/AAZzy4qm7KMqPpiYbyLUtYXhuwwdEaxnwgDKm2RJhgq27nmfe5H4RfzN69evSqsr5T4QZGtz54uzXvHSjoxxH41fKnXAtkJwy+KYhvvVd6LkclkpMgpBSptOSWfS5VsSnMcQeOK6qTelruh0o6Tr7crk82jeW9lXU8TxX5CJZRsDmqL9EUjzXXXhcKDyhCRKwtVOp0LNHj6eM6maZqYOnUqFixYUFSHiLObSCTw6quvutoiGxmyZZQzVfqiKlvUDlV13S2cUj5SskWVbCoyVMquJPtk1N8w6in1PJBBJfkv1VDvqOukUq6rBdXpfeTIkbx7cN1IdKYD6+q81B9YhihRrB8qfUUqfbWIPlHC8ANkPLuobMoyouqLhfEuSllfGLLD0BnFfiIMqLRFmmCobOeGOnKnT7stXDhw4EBBGBs3HhwcVG6fDFTqS3SmrbQL87qZMPyiKLbxQx2ZvL/95BnL3+eff95VjmqotOWUfC5VsinNcQSNK6qTelvuhko7YleGCsLc2g0ZCwBljN9QIor9hEoo2RzUFumLRm5eeqNQeFAZInJlUW6dTn08h58dL+gG7zvRk0d4tshEhmwZ5UyVvqjKFrVDVV13C6eUj5RsUSWbigyVsivJPhn1N4x6Sj0PZFBJ/ks11DvqOqmU62pBdXqPjIwgmUxyv7P77tPm1gbWNfe2mdbnrVu3BpbnhyjWD5W+IpW+WkSfKGH4ATKeXVQ2ZRlR9cXCeBelrC8M2WHojGI/EQZU2iJNMFS1c01NTZg213vYny0M4cF8UUr9hyp9O3bsyPOx2We7r82bMCw2pi5roUkYfhHVNn7dunWu3xUr7175Yd/IW0yObKi05ZR8LlWyKc1xBI0rqpN6W+6GSjtmL5zKDRe9qcGvDBnjN5Sg2k+EBSWbg9pimCUsk/K6/2YkMY7ps6YU/MYtnIeIDBG5slCl86233sKJEyfwox/9yFXf7t27ce3aNWQyudXi6XQahmGgv78fs2fPxqRJk2CaJmpra5HJZJBOp1FTU4NsNovh4WHMmzcPY2NjME0TyWQSs2bNwtjYmLUQxTAMpFIpzJo1C9/61rfwxhtv4Dvf+Q5qamp8PfuVK1dw1113lfT8MtJVRjlTpS+qskXtUFXX3cIp5SMlW1TJpiJDpexKsk9G/Q2jnlLPg6jaoUpnNdQ76jqplOtqQXV6v//++1i0aBHuv/9+rFy5Eo8//jgOHjyI7u5uTJ48GV//+tdz9+3GBvFH/+mpQLoazl3C1dZGfPWrX827J1wlUawfKn1FKn21iD5RwvADZDy7qGzKMqLqi4XxLipCufWFITsMnVHsJ8KASlukCYaqdu6ZZ55BajSLabMm45FHHkFvby/Onz9vbTq88cYbcenSpYLfLVmyBC0tLQCAH/3oR6T6D1X6Vq5ciZkzZ2LJkiWoqanBlStXMDIygsw4MGmqgaVLl2Lu3Lno7OxEZ2en9btbb70VixYtQk9PD5qbm61JwilTpuDee+9FJpNBc3MzhoYmdrMbRk7epEmTYBgGvvzlL5flGVX6irJsKcYzzzwD0zTx9a9/3cqjgYEBxGKx3HzGaBazbpqOFStWIJvN4vTp01aeLF68GEuXLkV9fT36+/thmiYeeOABLFiwAHv27IFhGFi4cCEee+wxvPrSBvzxf/5D2Y/tCpW2nJLPpUo2pTkOlfZRH5cVQZUdZ86cQc/1fly4XA/TNPHbv/3b2LZtG/r7+7Fs2TIsW7YM27dvBwB873vfw+bNm5FIJPDwww+jpqYGR48excyZM/HNb34Ta9asgWma+O53v4stW7ZgcHAQX/rSl3Dt2jVcvnwZjzzyCDKZDI4cOI6PP/oxTJkyBR/72MekP1O5odhPhAklm3m2eK3pcCJ96eT/+p3NQuFBZYjIlYUqnSdOnPCtz23FG3Ne2Vog3jF6ftYJjY+PY/Pmzejr68PmzRP6iz37tm3bisp2Q0a6yihnqvRFVbaoHarquls4pXykZIsq2VRkqJRdSfbJqL9h1FPqeSCDSvJfqqHeUddJpVxXC+VIbz+7W06vTgTW8+Kf7A8sQ5Qo1g+VviKVvlpEnyhh+AEynl1UNmUZUfXFwngXpawvDNlh6IxiPxEGVNoiTTBUtnNnVhcevc9ob2+Xrq+U+EFRqU/U1/az49z+OZVK+TrpLwy/iHIb/8EHHyCRKMwbVt6DXiPhVW9UQKUtp+RzqZJNaY4jaFxRndTbcjdU2rH16fMFYW7th9uNDc723av9sfcpAwMDyGazSKeje2UN5X4iDCjZHNSW8p63pZGGaZrcRiibzcIwDGQyGXR0dBT8BvA3CJ3NZq2Vt2NjY0gkEujs7MT169dx7tw5OQ+h0Wg0Go1Go9Fo8mA+e5QHEDQajUaj0Wg04cPGgJ1jyMPDw9z47JSRaqTYhB/vqng/uP0um83i+vXrglZqvPDKEz/5FXTBiUajiRL+rh1z60fdwpxyeDJfeuklnD9/HgcOHPBtrUZTLibJFvi533pQKDyoDBG5sii3zlL02Y/Lc8OvIxSLxbB+/XrMnTsXM+9Nor+/nxsv6GISGekqo5yp0hd
V2aJ2qKrrbuGU8pGSLapkU5GhUnYl2Sej/oZRT6nngQwqyX+phnpHXSeVcl0tqEzv3t5eALmBhWeffZYbh/nwtzw0OZCubdu24aO/ugRAsqwDpFGsHyp9RSp9tYg+UcLwA2Q8u6hsyjKi6ouF8S5KWV8YssPQGcV+IgyotEWaYKhs5255ON9XjMfjWLBggeu4blB9pcQPikx9bCc5m+C79eEpRTdgFpsY5Omw+91+fPAw/KIotvG3PDzZykO2qVYEFt9Zb1RDpS2n5HOpkk1pjiNoXFGd1NtyN1Tacf8XFyKN7qLx3E4Z8Qtr5299mMbVJbKIYj+hEko2B7VF+qKRWfOmCYUHlSEiVxbl1mnXZ3dW3RxL+xGD9jh+j1Dixevv70d/fz8mz3BvIPft2+fxFMWRka4yypkqfVGVLWqHqrruFk4pHynZoko2FRkqZVeSfTLqbxj1lHoeyKCS/JdqqHfUdVIp19WCyvRet24d0uk0EokEstmsZ9wbZgQ7NLK1tRW3zLkPQBKvvPJKIFkiRLF+qPQVqfTVIvpECcMPkPHsorIpy4iqLxbGuyhlfWHIDkNnFPuJMKDSFmmCobKd4/mKsVis6O+WLFlinThCqf8ohz77ePsNM/KveueNlzsXgBSbWCxl4jEMvyiKbXzQdyPZcvxCpS2n5HOpkk1pjiNoXFGd1NtyN1TaMW3ODbBfdBVkYQj7vddCwBs85lWjSBT7CZVQsjmoLdJ7wbd+ekwoPKgMEbmyKLdOWfpM0yz6clCscWz5IJknTyYynlNGOVOlL6qyRe1QVdfdwinlIyVbVMmmIkOl7EqyT0b9DaOeUs8DGVSS/1IN9Y66TirlulpQnd6JRAIjIyPc7+y+esu+sUB6TNPE/pcvwzAMoR2nQYli/VDpK1Lpq0X0iRKGHyDj2UVlU5YRVV8sjHdRyvrCkB2Gzij2E2FApS3SBENlO9eyd6ykSbCbbroJN954o7C+UuIHRaW+Zkf6lTIOzjuphH32mzdh+EXU23he2rXsDfZuJFuOX6i05ZR8LlWyKc1xBI0rqpN6W+6GSjtObGovHunn+FkUWCxOc5nbFtVQ7yfKDSWbg9pS3qWTGinwVq35dTRHR0ddnVxR53fbtm1Yu3at66C2RqPRaDRRI5PJ6HtsNRpNVcHaPD87TzUajUaj0Wg00cI0TZw4cULoN5MmST+cnDRsXD3IWIDXApNiV9KsX7++ZL3VgjMNZc1v8Dh69GhgGRqNJhoUOyHE2T+UevWVRhMVDLOEnnRwcBBz5szBwMAAZs+enfdd97V+LLzjxoLfuIXz8CNjcHAQV65cwS033uFbrixEnkWEp59+GgDwox/9yFXf+++/j5aWFqTTaQC5yS0glx61tbVIJBJ5v503bx5uuOEGmKZp7VpcsGABAGB8fBxz5szB6Oho3tU06XQatbW1SKfTVmPI9KUHa/HJX/oo7rnnHuzZswetra148sknsWvXLnR2dgIAnnrqqZKeX0a6yihnqvRFVbaoHbLrerFwSvlIyRZVsqnIUCm7kuyTUX/DqKdh5sHmzZvx2GOPWX2lKlQ+Y7l1VkO9o64zjGesZlSm98qVK9Hb24u7774bvb29AIDHH38cBw8eRHd3N6ZMmYJf+7Vfw/bt29F9rR//5f/9DyXr+tnPfoaP3PNx9Ix24urVq4jH4wXvISqIYv1Q6StS6atF9IkShh8g49lFZVOWESVfLIgdlMs2lTSNgs4o9hNhQKUt0gRDVTv3zDPPYDiexk23TsfDDz+MLVu2CNk1f/58PPXUU6T6D1X6Vq5ciVmzZuH222+HYRi4du0ahoeHMdqbxbS5Nbjjjjswd+5cdHR0oKury/rdbbfdhoULF6KnpwfNzc0AcpODU6ZMwb333otMJmPJYtTU1GDJkiWYNGkSkskkurq68J3vfAcvvPACfv/3fx81NYX7e8Pwiyi28c888wxM08S8efMwZ84cGIaBwcFBxGIxGIaB4Xga82+fheXLlyObzeLs2bPWbxcvXow77rgDdXV16O/vh2maeOCBB7BgwQLs2bMHhmFg0aJFePTRR/HycxsxfV4tnnzySYlP7Q6VtpySz6VKNqU5DpX2UR+XFUGVHWfOnEHb5Riar1+CaZr4rd/6LWzfvh19fX1YtmwZli1bhu3btwMAvve972Hz5s1IJBJ4+OGHUVNTg6NHj2LWrFl4/PHHsWbNGpimmRfvy1/+Mq5cuYLGxkY88sgjSKfT2L/zGD79xYcxZcoUHDp0CJ/73OfQ09ODz33uc9KfrxxQ7CfChJLNPFu81nQ4kX7SyPtrzgqFlypjaGgIV69eFZIri3LrtOtzrmhj/2cyGWsBiSoMw0DHsRSOHz+OtWvXWotEAOR9LhUZ6SqjnKnSF1XZonbIruvFwinlIyVbVMmmIkOl7EqyT0b9DaOehpkH9gWTjPHxcemnj1SS/1IN9Y66zjCesZopR3r72Y3SfmQ8sJ6jm6/l/X3o0CGMjo5aC8ZVEMX6odJXpNJXi+gTJQw/QMazi8qmLCNKvlgQOyiXbSppGgWdUewnwoBKW6QJhsp2ruNo7opx0QUjpeorJX5QZOuz7zi3+9qGYSjbLf78888DAFatWsX9Pgy/KIptfPvPy7szn9xOeXHLTxnvWCJQacsp+VyqZFOa4wgaV1Qn9bbcDRV2tLS0AADq3u0qEnMC3tUzouPE5W5bVBPFfkIllGwOaov0RSOXj3cIhQeV8fbLe3zLlYXIs8jU197ejvHx/MaFNU7OcCc8R8jpKNlPG3FjsDWT9xuZyEhXGeVMlb6oyha1Q1VddwunlI+UbFElm4oMlbIryT4Z9TeMekohD15//XXr86uvvpq3O6icdkRBZzXUu3LrTKfT2LhxY9n0acQod3q7DU4MtARf2NFW15snv7GxEVeuXMG2bdsCy3YjrHepsGW4yaHSV4voEyUMP0DGs4vKpiwjir5YKXZQLttU0jQKOqPYT4QBlbZIEwyV7dxAczrw2C2l/kO1PueVJwMtac8rZ5x4pTXvO2dYMpnk/jYMvyiKbfxg88QGIL+Tubx4Mt6xRKDSllPyuVTJpjTHETSuqE7qbbkbKux48803AQCdF3I3NhTrJ53f+23rnZ/L3baoJor9hEoo2RzUFumLRubeMksovBQZAwMDGBoaAgDc4F+sNESeRaa+o0ePoq+vD0C+U9PT0+P6WxEnqVhc0zQxZXahs7xx40YpuxFlpKuMcqZKX1Rli9ohs677CaeUj5RsUSWbigyVsivJPhn1N4x6SiEPOjs7kclk8NZbb4VqRxR0VkO9K7dO0zQtn68c+jRiqE7vbDbrOhBhD58yO/ir3Kybp+b9bX8fyGazgeXzCOtdKmwZbnKo9NUi+kQJww+Q8eyisinLiKIvVoodlMs2lTSNgs4o9hNhQKUt0gRDZTs3ZU5wX5FS/6Fan3OCz+5r+xlfd4tjP71EZBEKIwy/KIptvFt5F1k4lZv7yMk5ceKEde2QSqi05UHqejab9TwBn4o/R2mOI2hcUZ3U23I3VNoxc97kgjBeu+y2GK
1Y2+Js96fMruH+5p133vFnMDGi2E+ohJLNQW0xzBLOW/e6/yadymDSDbUFv3EL51FMxqFDh9Db24tkMonTp87i6T9Xf/e2H/uC8vTTTwNAwV3iTN/rr79uHRedSqUA5DpldjXMjBkzCnZCz507F5Mn5xrAgYEBjI6OYsGCBQCAsbEx3HTTTRgeHkZNTY11ykgmk7H+Z2FMX23NJGSyaaTTaUyaNAnpdBq1tbWYPHkyJk2aBAC477778IUvfEH4+WWkq4xypkpfVGWL2iGzrvsJp5SPlGxRJZuKDJWyK8k+GfU3jHoaZh5s2LABn/vc57Bhwwb84Ac/wKpVqzB16lR885vfxMyZM8tmhwpU6ayGeldunalUCn/1V3+FH/7wh5Z/pVKfRgyV6b1y5UrE43GsWLECsVgMAPDNb34TBw4cQHd3N6ZOnYpf/dVfxc6dO9HX24//8Ed/WLKuZ599Fp9+9DFcaryI9vZ2xONx3HnnnXjooYdw+fJlxONx/O7v/i73t42Njbj77rtL0lvu8kqpjVLVR8roq0X0iRKGHyDj2UVlU5YRJV8siB2UyzaVNI2Czij2E2FApS3SBENVO/fMM88gmzExbfpUtLW1Cds1f/58PPXUU6T6D1X6Vq5ciVmzZuH222+HYRi4du0ahoeHYWaBmloDS5cuxdy5c9HR0YGuronrDG677TYsXLgQPT091uICwzAwZcoU3HPPPchms5YsRm1tLW6//XZMmjQJyWQSXV1dSCaT1sKSp556SskzqvQVZdlSjGeeeQamaWLevHmYM2cODMPA4OAgYrFYbh4jncWMmdOxYsUKpNNp1NXVWZO2t99+O5YuXYpz585hYGAApmniwQcfxLx587B3714YhoFFixbhk5/8JF7ftBk1tQaWL1+OW2+9FStWrJCdBHlQacuD1PWzZ89iaGgIn/rUp6TLDiuuLNlh2Ed9XFYEFXasXLkSjz32GAb6B1FXfw4A8Ju/+ZvYvn07ent7cffdd2PZsmXYvn07AOB73/seXn/9dQwNDeHhhx9GTU0Njh49ilmzZuHxxx/HmjVrYJomvvvd72LLli0YHBzEl7/8ZVy5cgWNjY345Cc/iVQqhePHTuChhz+GKVOm4NChQ/jFX/xFxONx1NfX48knn5T6jOWAYj8RJpRs5tnitabDifSTRv7qmxuEwkuVYZomBgcH0bRezX2CXog8iyp9sq6GcbuOxm3l85nnR4vKvHDhQkm2yEhXGeVMlb6oyha1Q3ZdLxZOKR8p2aJKNhUZKmVXkn0y6m8Y9ZRSHpSwtlaJHZR1VkO9C0un32uRwnjGaqbc6e3WDp16diiw7Oee3IszZ87khSWTSZw5c8bz+ks2gOJky5YtRXVSeJcKQ4abHCp9tYg+UcLwA2Q8u6hsyjKi6ouF8S5KWV8YssPQGcV+IgyotEWaYKhs507+LCFqTiB9pcQPigp9zPc++WyCe0KIG6Jj9fYTSLwIwy+i2MYXS6tTzw5xr/Vk//xeQXHyWX9XVsiCSlsepK5ns1nP/KHiz1Ga4wgaV1Qn9bbcDVV2mKaJ139YVxDO6r29PLu1BaLjxaxtKfX31KDYT4QJJZuD2iJ90YhqGhoarFMwrl27ZoUnEsEd4Shib1x4Ewu9vb3SdOUayGg3ZhqNRqPRFENkYEij0WhUwmuHLP9fQROVSCSs+9SZ7gsXLmBsbMz1N/F4HG1tbchmsyXtZNVoNBqNphiDg4Nhm6DRaKoQP2MCblcX2MP8XkuzdetWUROrGpFJ16hP0Go0GvX4aSdkLCLU480aykhfNPLoN+4TCheVsXv37rywG1fkKvKaNWt8yw+KyLPI1sdWxspydFgDxVtFx+PmD90gRS8PGekqo5yp0hdV2aJ2yKrrfsMp5SMlW1TJpiJDpexKsk9G/Q2jnlLJA5WDCuX2JVTqrIZ6R11nGM9YzZQjvf0MIiz8cOEdvKI67vvcwryweDyOlpaWvLBkMon29nYcOXIkLx6jo6MDZ8+exYYN/nYzRLF+qPQVqfTVIvpECcMPkPHsorIpy4iqLxbGuyhlfWHIDkPno9+4D1euXMGbb74JILeDee3atRgdHcXQ0BAGBgak64siVNoiTTBkt3MXLlywrkqZvmw88IJeSv1HufUt/EhpvrbTjxeZHOTlVxh+kYjO+vp6obGTUp/Hno68NOXll2EYwpOzdjnlWGhCpS0vta6fO3dOmeww48qSHYZ91MdlRVBlh2EYWP7Z+dxwJ2w+VqRtd7YdhmFg4Ucm+56DjQKV9m4TFEo2B7VF+qKRW5bdJBQeVMbUmyD9hbEYIs/il2PHjhXVx2tMVKxK85I5bb66VXAy0lVGOVOlL6qyRe1QVdfdwinlIyVbVMmmIkOl7EqyT0b9DaOehpEH2WwWO3fu9PytzFPNVD5juXVWQ72jrjOMZ6xmypHebv64PXz6guCvcjctnub6nWmauHjxIgBgdHQUfX19nnHZCSXFBkCiWD9U+opU+moRfaKE4QfIeHZR2ZRlRMEXk2EH5bJNJU2joPOWZTfh3XffRWtrK0zTxKpVqwAAly9fxqlTp/DSSy9J1xdFqLRFmmDIbufi8Tj6+/sB5MbMg0Kp/yiXPjY5OGN+rRXm5d86N2Ty4F114HfSMAy/SETn3r17pdgiAi/t7O9Gpc6hmKaZl+/lgEpbXmpd37dvnxVm/yxDdphxZckOwz7q47IiqLLDNE3ceFvhWIhbu2wPL2XBB69tcbZJUTtpqtLebYJCyeagtkhfNPL6/zokFC4qw1mZug4a0l8YiyHyLH45evSob32lrJLl4ef4PKaP0bw7GVivGzLStdRytm/fPuulSpW+qMoWtUNWXfcbXu5yo1oOlXynLkOl7EqyT0b9DaOehpEHpmmiqanJdfeKaZpSTzVT+Yzl1lkN9a7cOkX9vDCesZopd3o7fXX299X33K+M8cvBl64VhNnL36uvvlqg120wkFHs3azc6UepjVLVR8roq0X0iRKGHyDj2UVlU5YRBV9Mhh2UyzaVNKWus6WlBS89/R4AoKamBtls1lqUCKjZmRlVP4pKW6QJhux2zn46ddfB4GPHlPoP1frs4+SmaeLKe6OBZdrzg43ns13rfgnDL6LYZxVbCHJ1F//dSLTfuPLzd6xyXR9BpS2XUdfdTh2h4s9RmuMIGldUJ/W23A2VdhxZ1+ornshCP69FhFfeG/WUEbWrfin2E2FCyeagtkySZIemTKg+usjLea2EY5PcuH79et5AhEaj0Wg0DH3XpCYsdNmrXvzsWJSlo9j39neArVu34tSpUwVx7XEGBwclWajRaDSaaqO3txfj4+MAJhaNMGRe1azRVDK6npSG10l/ItcQuMlwi6ff+cTxmsS1z2vouqDRaPxgv4LG7zgJ+10QnbqN0lBE+kkj/+bvvigUHlTG4i+Vv2KJPItf7PeC+9En0qCkUinPO7eKNYR2Xff82pSC38hq3GSka6nljPcMHR0dRReSqCgL1GSL2qGqrruFl7vcqJZDJd+py1Apu5Lsk1F/w6in1PNABmHYoUpnNdQ76
jqplOtqoRzp7Wfw+L6vz0AikUAmkylZz2O/t6QgjOcXt7W1oaenB83Nzb7kevnQUawfKn1FKn21iD5RwvADZDy7qGzKMqLqi4XxLkpZXxiyy63TMAz84h/cZX12Lhph9PT0YGws+IlbQHT9KCptkSYYsts5tjghFotJGTOn1H+USx9ra+77+gwAwce9i23MLCY/DL8oin3W8q9Ptz67naBeDNM0seIbfDmqoNKWB6nrIyMj2Ldvn+v8FhV/jtIcR9C4ojqpt+VuqPQ3f+nJZcK/CQLrUyqFKPYTKqFkc1BbpC8aObLlolC4SNw9r54BkN9h91+E5/3aKhB5FhX6RBuonp6egjA/93M5MQwDsbp00XilIiNdg5QzZ7oeOHCg6JU1KssCFdmidsiQLRJe7nKjWg6VfKcuQ6XsSrJPRv0No55SzwMZhGGHKp3VUO/KrXN0VOwoZCrlulqgkt5dZ8exfft2rq/vl93rTgPIX8DOPUr1yhU0NTWht7fXClu/fr2r3Oeee871u7DfpcKS4SaHSl8tok+UMPwAGc8uKpuyjKj6YmG8i1LWF4bscus0DAOXPsj1STU1Na5XtB05cgSdnZ1SdFLp10Wh0hZpgiG7nbOfaNEvIXsp9R/l1td91t+J1H7G6Z1tmcjYfhh+EdU+y+sEke5zybw4xU4McDv9pftssqy7/6m05aXW9Xg8jmQyaW0aSCaTBddsUPHnKM1xBI0rqpN6W+6GKjtM07T8TdHf2RHZWN99NlnQPkX5pCmq/URYULI5qC3SF42cP8C/C8otXCTu9nUHCsKGWoxAu+tKQeRZSsHZwMjSV+zuPzfscfqvqktrGc9ZajlzrvBOpVLS9YlCRbaoHTJki4SXu9yolkMl36nLUCm7kuyTUX/DqKfU80AGYdihSmc11Lty60wkEmXVpxGjHOltH0Bw2ynX3xR8Mfdgc/Hfusl3LlbxO+BR7vJKqY1S1UfK6KtF9IkShh8g49lFZVOWEVVfLIx3Ucr6wpBdbp01NTVoOzdgfXY7aQSQt5kpqn4UlbZIEwzZ7Zy9ngy1BJ+MotR/lFtf35Xc2HDQSb2gbVUYflEU+ixnurJ3I1483nuVW770NqXLOpFLpS0XkTE4OOgaf2BgAAcO5M/hUfHnKM1xBI0rqpN6W+6GSjuYv2nHq712uw7LiVv7wfqUSrmSJgr9RDmhZHNQW6QvGpl50zShcL9xY7EYaqcVriit9S9WGiLPUgqrV6/m6lPVoIjIvWEav9GTYZuMdPUrI5PJFMRlZWtgYAAbN2709UwqywIV2aJ2yJAtEl7OclMOOVTynboMlbIryT4Z9TeMelquPGATnidPnsyLUw4HXrUvUU6d1VDvyq1z1apVZdWnEaOc6e01YHnDjGCvcoZh4Ibp/gZE3exoampCU1MThoaG8sK92tFyl1dKbZSqPlJGXy2iT5Qw/AAZzy4qm7IMqr6YbDsol20qaUpdp2EYmDKzFgB/0QjrX9x2iJdCVP0oKm2RJhgq2jmZY+aU+o9y67thurev7XXqhTOct6OcLWYotkAhDL+IWp/1xhtv5KUTL93Yew1Ld+cCkWLpzL6/YXp+/5JO8xejyIJKWy4iY+3atXnxnXXgwoULJcsWjU/drwzDPurjsiKotGPqrEnWZz8+pd+TQdxk3TC9pqB9ivICEmr9RNhQsjmoLYZZQskcHBzEnDlzMDAwgNmzZwcyoBimaeLs2bPYv38/4vE4Fi5ciI985CPo7u7GmTNnrHjz58/HU089pdQWlTz99NPWZ7dn2bhxI0ZGRlBbW2sdW57NZn0dyXnLLbegr68PY2NjWLBgAYDc0efz5s3D0NAQamtrkc1mrTtja2pqkEqlLKcqlUrBMAzU1tYinU4jnU5j0qRJSKfTqKmpweTJk3HDDTdY+qjnxb/8y7/ga1/7GmbNmoV33nkHhmHgC1/4AubPn4++vj5s27YNkyZNwuc//3krvTQaGbz66qv4zd/8zbDN0Gg0AFauXImnnnoKK1euxO///u/j2Wefxfz58/HYY49h06ZNVtj06dPxjW98A2vWrMEf/MEfIJVKYcqUKWGbr6lgmF/4J3/yJ8p9bQ0tVq5ciXg8jg9/+MPo6OhAbW0tfu3Xfg2HDx9Gd3c3pk+fji9/+ct4//33MTAwgEWLFuHTn/40Fi1aJKzr+eefR19fX8GCjwceeAD19fVYsGABYrEYfuM3fgOHDh3C9evXC2R87GMfw8mTJzFr1izcfPPNGBjI7dYxDANPPvlkaYmg0ZSZsbExTJ06NWwzNBoNgIaGBnR1daGhoQEzZ87E17/+daxduxYA8PnPfx7d3d04f/487rrrLtx7771YuHAhZs6cGbLVGg0djhw5gtraWrz99tuB5ER9nF2ElStXYtasWbj99tthGAauXbuG4eFha3LwjjvuwNy5c9He3o6uri7rd7fddhsWLlyIeDyOlpYWK/7UqVNx9913I5PJoLm5GSMjIwBycxyTJk3CbbfdhsmTJ2NsbAzd3d3WaddsDP7DH/4wHnvssTKnQuk888wz+IM/+AMAwU9lcWPlypXW53nz5uHGG2+EaZoYHBxEPB63FhLOmDED9957L1KpFBoaGpDJZGAYBhYvXow777wTZ86cweDgIEzTxIc+9CHcdNNN2LdvHwzDwK233oqHHnoIb731FrLZLB544AHMnz8fBw4cwA9+8AMlzxVVnnnmGetd7+mnn8ZnP/tZ7Nu3DwDw5JNP4plnnsGXvvQlfOpTnwrTTI2Gy8qVK/HpT38aw8PD1vzyv/pX/wq7du1Cb28vli1bhmXLlmH79u0AgO9///vYtGkThoeH8dBDD6G2thZHjhzBrFmz8I1vfANr166FaZr4zne+gzfffBMDAwP4yle+gqamJjQ2NuKTn/wkUqkUTp48iY9+9KOYMmUKDh06hF/8xV9ELBbD+fPnrXrzB3/wB5G+skZDF5E1HdJPGvmLr68XCi8W1zRNHDx4EABweV1hheGFqUbkWSjr493BVWwN0ZlVo1Zc2bg9p9+rYrxkOEmn0/inf70dyWTSupvduWJZpr5SoCJb1A4ZskXCZaSTrLT2I8d5lLsqW1TJpiJDpexKsk9G/Q2jnpYzD0SPHmxpacGuXbuk21EOVOmshnpHXWcYz1jNlCO9/eyEO/EzsWuMeJx7YbxoHK8jV/3srHQSxfqh0lek0leL6BMlDD/Az7Nns1lkMhk8//zzUmRTlkHZF5NpB+WyTSVNqes0DANv/D/nAeROGjFNE/F44Z3zrB968cUXUV9fj46OjpJ1RtWPotIWaYKhop1jfpiMMXNK/Uc59Nl92BP/MliSDN74uv2UJHs89p09vn2TLFA+vyiozpdffjmQLX5xe0+yvxu5xWHhXu9ax/95MO/7TCZTipm+odKWi8gwTbNg/s7Jzp07AQCnT58m489RmuMIGldUJ/W23A2Vdmz8b2cB8NuDUsY5eNhln/iXwaLjPNu3b+du1qFIpb3bBIWSzUFtkb5oJJvlVxy38GJx8yoiT8TPw8q5AkvkWUrB+ULM0xfk
ed2OZPPTGPIWmsjCLV2fffZZ6/Mrr7xSkgwe5s9POY3FYmhtbcXo6Cjq6uqs9GloaCjaSKssC1Rki9ohQ7ZIuIx0kpXWfuUMDAygv79fqS2qZFORoVJ2Jdkno/6GUU/LlQe8PswZpsq/UO1LlFNnNdQ76jrDeMZqphzp7dX2WP64WXobtX379kDt27x580r+bRTrh0pfkUpfLaJPlDD8AD/PfujQIfz4xz+WJpuyDKq+mGw7KJdtKmlKXadhGDDdxgQdsO+6u7utk65KIap+FJW2SBMM2e1c3oIFCdlLqf8olz52BYHfuUERn9ptjN1LRrn8oqA6/bbDQZ7Hc8PrxG1mBYt0eFcDuVHu2yKotOUq6rppmjhw4AAZf47SHEfQuKI6qbflbqi0w8zyw4OMk3i1Nfa2xdk2sXijo6PKr8SSRaW92wSFks1BbZG+aOThr94jFO4W1zRNNDQ0oK6uDoZhIB6PY849ZkFFmnNP+TND5FlU6VN935Wb4zpvxSTP74Pgla7sHtu+vj7XyfZiMpzc/7lbrOcYHh5Gb28v9u7dmxenWCOtsixQkS1qhwzZIuEy0klWWvuVc/nyZVy8eBEbN25UZgsPKmlVDc+oUna5628Y9VR1HnR0dBTs3ik35fYlVOp0k3v58uXAMmRQSWlNRV+1U870dg6M2v3uBQ9MBlDae0FTUxMAYP6KSUVi5uTX1PBfG5ubm7nhXu8HUawfKn1F2X314OCgZ1xRnWGkn0qdLPzYsWMYHx/H8PCwNNmUZVDzxVTZQblsU0lT6jpramqw9ONzisaTubA7qn4UlbZIEwzZ7Zzdd5QxZk6p/yinPtM0LV/bT1wn9jaqmK9erD0rh18kopNdZ29H5H1ExvPw9C144AbXuCy+HztvfuAG5fMudqi05SIy4vE4brwXRU/5YvMqH/7CEqETW1T5V15xWZ7v2bNHquww/F7q47IiqLRj2aMTG2HsbYTbhnvA5+YeF3jjN86TpqJEpb3bBIWSzUFtkb5oZNlH+fdpu4W7xX355ZcxNDSUN4gz/ZaJOKwysbByVi6RZ6Gkj3cUHu9vL2Yvrs2TJzPd3Z7TNE2sXr3a+vuVV15xdTRE0uq2+2/K+7vYMdtB9YlCRbaoHTJki4TLSCdZaS0ixzRNdHd3K7OFB5W0qoZnVCm73PU3jHqqOg8SiYTwcX+y/Yxy+xIqdbrJZceRBpEhg0pKayr6qh2V6Z1MJgH4G4yYY/PLS2XZx24pHsnDHnbNo5DOCNYPlb6i7L567dq1RWWr9CVkyFCpc/H9ubvrjx49apXrWCxmfe9nlyyVMkXlnU4UKnZTzwNVdlDXaRgG5i+bbg3Ym6aJm2++mRtXln8eVT+KSlukCYbsdm5wcBB1dXUA8sfRS4VS/1FufUF8bd5J3eyfKCrreim+4tatW/PGM0WfqdTnse/M5+mcfXvhYnivyV835iy5wdJTjvkmKm25qIxZt9VibGzMV9zr49fQ1dXlOz1V+Vdecdnp9ufPn5cqOwy/l/q4rAgq7Vh4z0xf8UppR3jMWVzruciwXG2OLCrt3SYolGwOaov0RSPr//IDoXAe//vJ9UgkEgWVpHNfYeXkhalG5Fmo6HO7jsatMXJzwK7uHM+LIxO35zQMo8AJWbVqlZAMRiqVwuHDhwEAO/9PvWs8+7MNDQ25xlNZFqjIFrVDhmyRcBnpJCutveRks1lrQNrLCaCS79RlqJRdSfbJqL9h1FPVecDb9VPqSvIgdpQbVTqrod5R1xnGM1YzKtPbfkqEF4ZhoPHd0cADC+e39nNll4Lzd++++y6A3IkOdqJYP1T6ilT6ahF9ooThB7jJ2Pg3h3Dt2jUA4J6g89JLL5UsW4Qo+oSyoGI39TxQZUcUdB5f11F0Uw87kVgGUfWjqLRFmmDIbueGhobQ1tYGQM6YOaX+o9z6GrePSpHjbM/c/nfD+YwjIyN45513hGxQ6SuKElS2W3o1ves/v7zedxq3jSi7ppgHlbZcVMaJ9V3WyfDFyvDpjddhmqa1MEOmLTLiHjx4EI2NjRgfH/cVX0R2GH4v9XFZEVTYwer3wTWFp6a61X22ed7v2AtPDutTorY4xA3K/UQYULI5qC3SF43IIPXzHXYMPx11OTvzqOI8ZikIvDsBVTZ2TtmpVKokOalUCg0NDXly586da/3NGm37c7344osl6dJoAODixYsAgLGxMbz99tsF3x86dMj3yuygmITuVtNowqbUleKV4txrNBranDp1yvU7WW1QsTZQ5BjnRCKRdzyxaZpoampCa2trwaIRjYYKegxBo4ke7ESurq4u7NixA0DunfqNN94I0SqNRlMJ7Nu3T5lsL39a1B9Jp9Po6ekJapJUBgcHkUgklOoQfQeyL9ZxzmMUk6XHfIpj4OfXOC1YkB/uUZ79nOgXBuyU3NFROQvFNNGk2EZCkbbabf61mAz2/cjIiG9dGo1spC8a+c6f/6JQOI97vjYNQOEdUrd9vnCC57bPl78TF3kWqvpYOrJdVvYwL+76ZX/3OZaCjOcUkfHLf/QgAKC2NnfcIJsIHBkZyVtYwjhw4EAgfaJQkS1qhwzZIuHlLjeicnbt2oW6ujrU1NRY1yrZJ50bGxutgS+Ztjg5c+YMpnwoVjxiESiltyrZlWSfjPobRj0NOw/KMUBQbl9Cpc5qqHdh6bxw4UJZ9Wn8Uc709lqodu/XpgeW/8nvLS4I411p6ec9YWSkcFfem2++WRAviu9SKn1FKn21iD5RwvAD3GR8+4efBJC7i91eXtPpNLZs2RJItghR9AllQcVu6nmgyo4o6Hzke7cBKNzU09vba50ce/XqVQC5Ez3T6TSSyWTekfIiRNWPotIWaYIhu52z920yxswp9R/l1GcYBu791eC+NuBvgYI9DsvD5uZmXLp0Cd/6s0cD26DSVwRyV3o4T2kQtcUvbu8lbD6JF1dkjOfeX5tRmmElQqUtF5Vx+xcNHDt2rCBteWn98G8tytuQwMbHZdgSJK7djilTpmDy5Mme8UuxIwy/l/q4rAgq7fjs795pfS62YYa3cMTPadX2z6xP8drEyPSvXr26mPmhU2nvNkGhZHNQW6QvGjm3p/BYH69wHj2X09ZnwzCso64Gr03EYRWIhZmmifPnz6OxsVHI3lIQeRZV+ljD0tnZKU2PvVF0+9zX6N2p220TpVi6btq0KbCMvMn6I9e5HQErbyw+4/Tp0wCAvXv3+tYXBCqyRe3wit/T04MPPvigaFyRcBnpJCut3eTs27cvr+y5fZZpi5P9+/ej+VRfYDmU0luV7EqyT0b9DaOeljMPSl0g0traKtWOcqBKZzXUu7B0FhtMka1P448w09vum8Yvl3bynp22M/6uw/HDtGnTuAtOnFB4l3JS7DQUlb4ilb5aRJ8oYfgBbjLOf9BmfbaX02w2i+7ubuvvQ4cOhWKfKhmUfTGZdlAu21TSNAo6289O7FY3TRPd3d2Ix+N47733rDBGb2+v9XnPnj0l6YuqH0WlLdIEQ2Y75zyV2T6OXiqU+o9y6LNPHPb48LX97hr3mlx0XhH
PbHj33Xdx8uRJnNx52VW+35O4VfqKY2NjnqckBpFdjN7eXuuqMvt8kh3R0wF6LiXLehodlbZcVEZX/QiGhoYAuI+rsfCOuiErrL6+HgcPHpRmS5C4q1atsj4vWLCgIN9l2BGG30t9XFYElXY0n+r3Fa9Y+bZjX/DsLE9ufYpTjv13r776qi8bw6DS3m2CQsnmoLZIXzRy5v2rQuE8ei+mMDQ0ZA2Us8GbxNVCB8oe1t/fr/woNEDsWWTpc15fUc5j0uwNlX3RiMiR1X7gpat9AUdXV1dJMtxoPHwdQOFuymLPU19fX5I+UajIFrXDGd++yGZsbCzvCEXR9oIXLiOdZKW1lxyvF0R7mVOV7/F4PK+9LBVK6a1KdiXZF7T+eslQWU9V5sHe1ycGM4IMBGzdujWQHeX2JVTqrIZ6V06ddp9vx44dvo4eDuMZq5mw05v5Db0XUzh79mzJcgzD8LVoxI+vX1tbi3vuuYf7G+fvw3iXciObzeLgwYM4evRo3nsHT4aMdx5VfaSMvlpEnyhh+AFuMur3TSwaqanJHw6x+wUnT54Uli1CFH1CWVCxm3oeqLKDuk7DMNB2eqJvKsfYV9j9eqlQaYs0wZDZzj333HN5fVkYY0CU/Lyg9FwKvkAbKO1qAzsN+9sLwtiCumeffdaXDJW+4vj4uLVww0ljYyN6enqsxQWisr1Ip9O4fv269XfvxcL8cjsFwysvei6myjrvQqUtF5XRe9nfBhcA6Dg3kf/ZbLZo+qryr+xxOzo6rEVX8+fPt8pELBbjxi/VjjD8XurjsiKotKP5RPHNtc4FfV5ll9fe2MP89ClDQ0N589vUriKzU2nvNkGhZHNQW6QvGpky7QahcB4pM4mRkRE0NjbmnRxi2ERYDtcN+X+XA5FnkaXv+eefL6tOgJ+mNTcUhslKe166vvLKKwULZkRlAMD69euxffv2PCd10tTagniiz6KyLFCRLWqHM35dXZ312blAQrS94IXLSCdZae0lx3nUpNtJIyrz3bgBnhMifqCU3qpkV5J9QeuvlwyV9VRlHiSGB4ru8ikH5fYlVOosJre3t9fqf48cOVKSjCBELa2du8X8HEsZxjNWMxTS2zAM1E4O3l5Nmiz3dTAej3MHjc+dO2d9DuNdykk2m8Xo6ChWr15t7Yx0e9/KZDLoT/Ri7dq1Smyh0leL6BMlDD/ATcbkaRPvgLxdvUFkixBFn1AWVOymngeq7IiCztopE30TO/HVjld93bVrl7A+Cv16KVBpizTBkNnOmaaJG2+8EfPnzweQP45eKpT6j3LpY/5BLWcM3InbTnPnd2zykDfe4LXY2jAM3DB1UsFv0uk03njjDQDA4GDxReAqfUUvLly4gDNnzmD//v2BZTvT1bnAt2ay4Zr2Trz6kZrJwRb5iEKlLReRMX/+fNROnhjP5l2vZEf0nVOVf2WPu3nz5gJb2XNcvnwZQ0NDUuwIw++lPi4rggo7WD7fYPM33RaUFSvbPNzaedanOL+3/3316lXf11SHTaW92wSFks1BbZG+aOS/rv+mUDiPu7+dq4w9PT3WPaUs3FmpWNxYLIbh4WFRc0tC5FlU6RN1XDo6OvI6cj/3b/EcrA99b6qQXhHszzk0NISenh6Mj48LPatb3sTjcfT09ORNxHzvx48GWjk8MjKitCxQkS1qh1d8lpeNjY1obW0Vbi944TLSSVZa2+WcOHECV69e5dY1Z5i9HKrM97u/bQZuJymltx/ZpRzjRqXuyZAho/6GUU9V5sFH/90sAO47ToqFyaLcvoRKncXknjlzBs3NuaPxTpw4UZKMIEQ9re0LXsuhT1Occqe303dnbRNrz4Lwr3/yGB5++OHAcgDv47H37dtnfabwLtXa2ort27fn2Tw+Po6RkZG8eKlUCi+++CLu/jaknGipqo+U0VeL6BNFREZ/fz++9t/vU6bzj1f/qvXZa8DObcesl2wRougTyoKK3dTzQJUdlHVu374dAPArf3Y3gFydLHaEvZOLFy8K642qH0WlLdIEQ3Y7N3fuXDzwwAMAJsbMg0Cp/1Cpjzf2LMPX9tLHm6Tk2fH7//IFT1l+Fjar8hV5J7Cm0/xrYkRlixCLxXDfbxVuDmUUm1uwj/089LuzfZ9GLgMqbbmojI//+xsxefJkX3G/+Kd3CKWlKv/KLa5zA9n58+cxMDAgRXYYfi/1cVkRVNrx9T9/EIC/a8REwt2wj0f7GZOmTqW92wSFks1BbZG+aORvn9gkFM4YHR21jldu3MDvmO3h7DsWBhR3SmRR7FlU6Su2SlaUYtfL8JzWc2vGuN/LsMmeriMjIxgcHHRdhe1HhhOnnDX/8WDB98WOmQKAgYEBAMD//J//U2lZoCJb1A63+PZ7Dru7u9HT0yPcXvDCZaSTrLS2yxkaGsLw8HDRe++ci0ZU5ru9vSwVSuntR3Ypx7hRqXsyZMiov2HUU5V5cGrVxKSfs/7xULW7pNy+hEqdpcjdu3dvnt9Gvd5R1xnGM1Yz5UrvYqcfsfasVB/cMAy8+qfHUVtba+1I5cnzI3/OnDloaGjw1MUIu37U19ejvr6e+1wvvPCCtbjNNE2k02lkMpm8vkOmLW5hpcpwXldU6rt5qXFlyEgkEviXH7ynTOc//uu3rc/F+ni3jSlU/OEw3ulkQMVu6nmgyg7KOtlJwy/95yNIp9Po6OjgxrO333PmzMn7rhTfPap+FJW2SBMMle1ckDEgVpco9R/l0PfBBx8AyD2/LP+PyfOCt5ud/f/P/55/gpLImLmMcR0nJ06cQGtra54NyWQybxELz99WVUbq1o4X6HQu/vCTXieeGyzrBG6p6dHd3R1YRhA7mjYY+MhHPlIQzpu/2vV315TZEjSuM68XLFhghcuwIwy/l/q4rAgq7GDtwBv/va7gO5ENhbxxE7dxZtl9CgUq7d0mKJRsDmqL9EUj46P8HWZu4YyRkRGcP38eAGC6RGXhhw8ftibu7XHLdWxYsWdRoc/PxFYxZDg82ZR7Ixk0/e3p6mfxRjEZbjC5qfEM99i/YiSTSSF9pUJFtqgdbvHXrFmT97dpmuiN9XMXe4m0IzLSSVZa2+WwOssmfOxlLZvNWrtVnd+pzHe3tlUECund0tLi+l1vvD+vjjIOHz7sWz6VuidDhoz6K9qvy6inKvMg49KPeaHCvyi3L6FSp1PulStXAHj7HZcuXcq7Lot6vaOuM4xnrGbKld6xWAzXr193PRI1k/ReAO6H1FjhXdS8nXVe7eBdd92FZcuWcWU4aWpqwvhIuPVjdHTUOlGEl3aHDx/G2NgYLl++jPfeyy1eKKXv8GOLW1ipMj744APE43HrBJVS382D2FeqDPs7QTpp4ujRo0p0Jkcnynyx62lOnz6N9vZ237Jl2KdKBiVfjIrd1PNAlR1R0GmmcmOEfk5bc5JIJKxrx/wSVT+KSlukCYbsds4+xhRkDIjJoNR/lFOfaZq+/D8/p3eLfm+apnVqkmEYSHL8dVFkjOsw6urqMDQ0hLfeeqvgO9M0C67FdqZRqXlmjeenUtxrb3lzFm7zFV7vNtlkeX
f8l5oeGzduDCwjiB3p8ayvsTLDMJDmpKnzhMdSbQkS1+v6eFl2hOH3Uh+XFUGlHanxbPFIAckbv0n5G18xDAOxWEypXTKotHeboFCyOagt0heNfOSX7hQKd3Ls2DHMupPfOc+600Q8HkcikcDY2JgVVm78PotMfV5H56rEedrITXdPHPXmXKUbdFGKPV3ti0ZE5Irkzd2P3Fw0TrF0VlkWqMgWtcMrvn0hhWmamLRoxKrLfmTwwmWkk6y0tstxOp72l6br16/j2LFjBfFk2sJDRntJIb2dL6bZbNa682/yrbmj5Z2Lkdhdp+fPn8fIyAgOHDigzD4vyp1+MuqvaL8uo56qzIN5994g3Ic6J05lLMIsty+hUieTy+7dfPfdd0uWoYJKSmsq+qqdcqZ3Op22TmljMF9q7vLgd6be9Yn5rt/5bSv97sbZvHkz3n//fSz/9C1l3b3nzC8/7fjWrVut60kMw8C8e+XcT6uqj3SGjY+Po6enJ/C7eSn2lSrjueeew65du2CaJm778EzLV5at84HPLrY+19TkhkNuuukmAP7LPAV/WFQGJV+Mit3U80CVHZR1snZ31p3em5K8TuEdGxtDU1MTAHA3E/CIqh9FpS3SBENmOzcwMJA3nhrGGFAU34N6enqsjQ92/Ph/fn1a3lXxmUzhYhDrZIZdu6x4dzx0kzWBWOomSxnjOnZeeeUVAEAmkxEeX7nr4+7vH8VIpVLcvAKAG5f5n+by2kQ6/77JnnFlQ6UtF5Wx9KGbrM9emw2Yb++M98ILL0ixRUVc9r4oQ3Y5/N533nkHwET6Uh+XFUGlHbwyrGrO1TTNon1KueZ7/WKapnUKII9Ke7cJCiWbg9oifdHIh35xqVA4APT19WF8PHeE2NGjRzH7jly4s6LMvmPipY9VZBYXyH03MDDgWZhl4PUsKvXJOM2jGMVezG9cVlN0RxYAtLW1CevmpauXPfYVtV4y7HLsaXjXxxdw44o4gyrLAhXZonZ4xXcesXjzimkwTRPHjx/3JYMXLiOdZKW1XY7XamXni6JpmtY1Kirz3d5elgql9O7v78fY2BheeeUV9PX1obe3FwtXTAWQm3jgsWfPHiSTSVy9ehVjY2PcI8ep1D0ZMmTUX9F+XUY9VZEH+/btAwDMu8fdSfdaBCl7oKDcvoRKnUwu25EfRIYKKimtqeirdsqd3i+//DKuX79eED7foz3zy50PzysIE72exm/72NnZmfs/edE6ca0c2POrsbER/f391oC727tGTU1N3oJf1ne8/vrr0mzxCitVBptw7erqwqFDhzBzKT9vVPoSpcrIZDK4fPkystksbnlwpjKd939mYtHItGnTAACTJk3ixnUr21T84bDe6YJCxW7qeaDKjijoDPremkgkkEql8Oyzz/qKH1U/ikpbpAmGzHYumUxiaGjIumIljDGgKL4HufX3XmMHpeoZGhqyNjXV1U1cjcC7Ip4xbXEaO3bswIULF7Bu3Tq8/bb/q/YYMsZ17LA06+vrc/3OTiaTscZ+20bOe8p2o9jVnW6LRtxOgXbGYf/LWizuFyptuaiMpR+9yffYGfPt2buKTFuCxGVlQobPX2pdKjWuPf61a9cAAC+99BJSqRRu/dDMgoWz1NtyN1TasfSjNwb6vVt75DZ36rdPKecGm2Ls3LnT9btKe7cJCiWbg9oifdHIS/99j1A4ABw6dCjvrtL23fxJGxbuFtbe3o7Dhw9j+/btAhaL4/UsqvQ5HZtSGg8/vynmPF19t/AYLx5bt24VtC4/XU3TxLFjx5DNZl0n2+07LnkyirHjp/Wusv2isixQkS1qh1t8+6p49n/95tygvfP4aZF2REY6yUpru5yampqCRSOsjE2ZMsWKx75bv369VFsY9hc4XhsqCqX0fvnll2GaZp4jfG5j7nmdx2HaYflw8eJF7tHFVOqeDBky6q9ovy6jnsrMg7feegtjY2M4d+4cAODS2xNXEvg9NUTFgs1y+xIqdRaTW2xwx4+MIFRCWhfbKRvGM1Yz5U5v5+lZrN269Jb7FSt+MAwD7z1zMZhxAvpZm1u/ubz3hNvzq6mpCX19fa6Dg/b2yu5LsL6DLXyRYYtXWFAZLK1fLuHdPEjcoDJY2h9/pUuZzg1/OXFl4ezZs/ELv/ALAMTqERV/OIx3OhlQsZt6Hqiyg7pOwzCKvrcWq68tLS3cq6XciKofRaUt0gRDdjvHFkQCwceAxsbGSPUf5dRnGIbl/8mSB8DaNHvw4EEAhVd0ODfbGYaBI2vacPXqVbS1tWF4eLikxdcyxnXccFvw0t3dbV2ZmE6ncfbsWQAIlK5e4wrXduSf3OKcuPXr6118M39zmeoNvFTaclEZ+57jn/hih427ifr2qvwrt3cmJyxchh3l8ntN07ROc3/+v+4oWNBFvS13Q6Ud+567mve36FVWvM02XvH9tn2GYWDBgsLN7m1tbThz5owvGX64evVq8UgeVNq7TVAo2RzUFumLRkS4fv26tbpWtANmTocdWUfGU0TGFTC8Y+/ssv3oD2qDCBcvXnS1OQheqwC9nEi31bDsnklNcQzDQH19Pbq7u7Ft2zakU2kMDQ0hFoshkUiUtNiIKuxKGtM00dnZicHBQeu7hQsX4tOf/rQVT2WdYkdGVipBX94qtc/QTBCPxz0XETF0WZBPGL5D1NmxYwc3nHeVm6b6iMVieQvtGbLrl9vAmVd8v+8T2WwWBtzfS1Ry+PBhJBKJgtPg7JimaV1Z4hanv78fDQ0NaG5uLoPVpRPFdndiobU6HcNDQ9YAmdsVsKZpYu7cudbfbMxCo9GoZ+/evUXj+NlQxXbdajTVhqwJ7ng8jlWrVkmRFTWcGxi9EEnveDxuLRph8E7qABztHAo3o1H189hiGCB3Qvi5c+eklcmBgQH09vb6imv343mLWvycEEDpqohUKmWVlcuXL4dsjRhUyyoQnXqlKR8yyoDoxn2/OhOJhO/TevzArjVi7N69W5psTbSRvmjkiR9+xlf49u3bsWHDBrz55pu4dOmSdecoANzyWX5FsYezCVh7mH1nv0rcnlG1PnsnFtRxKfX3S7+o7og2e7raT6MoVQYPu7xf+vf3BS4vT/zwM9Y9k7JRWc5EZIva4Sd+NptFJpPBvE+OW3U/m82itbXVUwYvXEY6yUpruxznYpDR0dG879ziqcx3t7ZVBErpzbCn3/2/cWNgeVTqngwZMuqvSH0UlSFiRxDsZeTur0zz7APLNTCgopydPn267Dr9yC3W116/fp18vSuXTrcBoGK7ZcN4xmomzPQeGxuz3gmW/fI03HPPPZg6dWrJ8j73u/dYdXT+fP794rKup2E+z/2/cSMuXLiAgwcPYteuXYFP7ygGy6+2tjYr/QD3xTHsn33C4O6vTOzafeWVVxCPxzEwMFCyLcXCgsi49dZbrc93//K0griiOsvpu7Ay8tFvLwzcH7vpfPR7/KNanYuE2OIhILfgyI9sGfapkhG2L1aqbJV2U88DVXZQ19nS0oJbPuu9GNFPv2NffFJsJ2VU/SgqbZEmGLLbOXv9CGMMKKwxc
5mwhcF2/88Nr8XWImPOXn7px/7VQk+ZfnwmGeM6o6OjGBoaKgi3n5Ziv37ceSIKO4GapevIyEjBdeVe8DYQ27nlM9mCOH5PGGFpuHHjRtzzKzPyfq8aP2X4+vXr1pXLvOsiwugPPvX9OwAUjms702x4eBgf+/bNymwRjbtz586i9ejatWvo7OyUYkc5/F5nHnzo8XkF7QL1ttwNlXZ8yvZeqKqu2/PB3qeIvOuOj49j69atVj6PjY2hp6enIF4mkxE6aY/BFqQ1NDTkhdvT5I033ij4XaW92wSFks1BbZG+aKTpFP+4KRY+NjaGgYEBNDY2WiujBgYG8gbdRn4+buh0LkY444n2sHJN9Lg9o0wuXLhQoC/o9TRuv/WS5ZzcjjeNYHiYf0xb0MbVnq5M1sjIiNBzi+RNe0N/QZhoGWo61SV1hZ9TtipEZIvaIRJ/pDN/gRBbMV6sHQlin1+5QeRs3rwZNTU1ud20hoGZM2dixowZeXGdV9ewlyeV+c5rQ0WhlN48+pq9r3Bg2K8mOX/+vJX+AJ26J0OGjPorUh9FZYjYIYuBtond7UFO+gnqc6h4xgMHDpRdpwy5GzZsKPABZJ5CUA6/TaZOXtnasmWLZ1kN4xmrGRXpnU6n0d/f7yvunj17AACDrWlMmjSpZJ2GYaDzgvjCByD/BD5e2WTf2/tXwzDQ19eHniujyGazaGlpwZUrVwreLWTD8st+l7lo+2/vO2TYUixMloyB1jQ3XKUvEVSGaZrouTJaPGKJOmON+eWtFD+Aij9M5Z1OFCp2U88DVXZEQad9nEAEt984d1I6iaofRaUt0gRD9ZhbUCj1H+XUF4/Hpfl/QYk3BbuSEpAzrnPp0iXuFc92hoeHrUlLdqoKe79lp2eydB0bG0NjY6MP6/0x1J5Fe3s7du3a5XqaHMOelvb3mkQiwfWfjx07Js1OJ1TaclEZ3Zdy83Ze5dI0TYyOjiIu6Nur8q+aTnUVlDme/ffeey/mzp0rxQ4Vz8I2QzSd5DfyvVdzdW3Tpk1WfaTelrsh2w57fndfLlwExxuPcyvjXtcb8XDrU7zGl+PxODKZDHp6eqy5s87OzoJNDUCuTS1lc/u6detcv2O28U68rbR3m6BQsjmoLdIXjRx/h79DkoVfvXoVx48fL5hot1eOgcsuFYsTbg9jJ42omsRnuD2jTOy7Io6/cxm1tbW+jtcXoRRn05kHdhlBF4/w0pW3gllUhh37aS0X9nUFtp+nb8OGDQAK758XRWU5E5EtaodXfOdJOQOXJzrXbDZrlfFi7UgQ+/zKDSKns7Mzz3G4+eabUVNT43lSELurTkW+s7JoT+9SoZTeTgzDQOfp4vcDOvOhtbUV/f396OrqQjabxc6XjiqxDyh/+smovyL1UVSGiB2yiNXnFhbx2n8nzvoi87oV2c84Ojpa1P9Rla5uct2OhC0mo729Hdu3b1dun0qC6OTtoBofH8f69euV6NOIoyK929rasHHjRl9x2clssfOpwAvYLu6/XjROsfbOrw01NTVIpVJoPj6Qd4qHyBHgpcDyy+7rs39+2n/DMKy+g/d9KbYUCwsiw+6Dxur5OzJV+hKlyHBO6LacGHSJGVxny4kEbr/9dgD862mcx1SLyJZhnyoZlHwxKnZTzwNVdkRBp9uYIMPvxiK/bXVU/SgqbZEmGDLbudtuuy1vIqlYXQqqT0b8oJRj7KBU7L6E/fRhkd8ahoHm44OeJxb78cVljOuUyp49e/LmNGL1SaRSKXR3d5d0cp8b9vLOSx+vReOxWMx6x+o6m59Xpmni6NHKGSeUJaPp8MRJB8X625YTCW54Op220r1UW0qJ6zxhkPn+b775phVumqYUO1Q8y9GjR9HX14c3nt2DTCZjPQerZ02Hc5ty6+rqUF9fL2xHKfFVIdsO+wKJpkO5Muzn/c8PxWR49Sl+fNbx8XHP00D8wE7/Hx4edh1DPnHiRN6BBnbsbXalvdsEhZLNQW2RvmikpoZfMVi4rxVYbnWLFx7C9XJuz1gMtivQjp9JkZoaA21tbWU5Jql4ZCUmAMhP11KfVSRvDJfSL+J42/WxRvP69dzg+7PPPuvblmKyZeMl+/z58xgaGrJeaETtYPF5Kx4ZVroaEw5N3krP7m7fdstIJ1lpbZdT7Hg+tzKuIt9feumlnxsF1NfX+74DlAel9OYL9/6alw+sPL7zzjsYHx/HQELeS6uTcqdfqfXXjwyRcBl2lErhhFBhHOcLo9vvS706jYfseuDnZA5VdU92uWYvMbJQ2uYo0PmTn/yEG+48gU+WPo04KtK7ublZ6CjR4eFhKX65mz9cEM/FJ16+fDnmzp3r+Vs2GDExYDuxUMQ0Tezfvx+XLl3yb7QgPP+s2DVl9vY+t5CA/30QW7zCpMlwyV+VvkQpMtjVERP5kwt/6623pOusra3BTTfdVBAukqdU/OFy+4SyoGI39TxQZUckdBYR7VZfneF+N15F1Y+i0hZpgiG7ncu79k9C9lLqP8qtz89webFxY+bbFNsU6Zy8i8Vi1mYCowZIJpPo7Oz03NjiRZBxnWLXwvBwXrua71sDb7/9Nt5//310dUncme2RFG7pZreT5RHveVWebi9ahnlX7oTRH/jdfDV//nzXd87R0VHufJkq/6qmpnBDk93e5uZmxGIx67oOGXao9BUz2Uxe3WL1KZWaWJzAfCHqbbkbsu2wn+7qdyxEFLf5oGLNiFs/4eb3Xrlyxbq2yi3O22+/DSB35RIAnDlzBkePHs1fyGc7HRbI1Ut2UpRzjs6aY0LlvdsEhZLNQW2RXjX+nzeecA1vbm52PX2B3Rsdj8dxz2/xCzkv3B7GJutV4/aMxaivr4dpmnlp4OcYNqbPuZhB1iISP3LYsdHOPCg2wSYCL11nzZrl+/5BNxl27IPE3//Jo65xeJ+L6VuzZk3ed0FPhim1nPk52txL9u7du3HmzBlrRaGoHSw+b0GU84SHe34rl7dLly7F1KlTrXi3/Qb/mHKeLaWmk2wZTjml7AJoaGjAn73+bSm22GGTr/f8lolsNut57Fgx/NSxYpO9stKbx2f/ZJHn985V5Ax7/jz072cBAHe1e1DKXV5Lrb9+ZIiEy7AjCHn5+4PZUmWXiugznjt3znPRgAqdMuWK9KeyUSmbis4wnrGaoZDera2tVn/lxO/ga1NTEz7xBzcWhIu8Y4i8k7B+94HvT877TSaTkX6iop3/540n8NprrxXoEHlOXt8xPj4ufPyrqj7SGcaeza2MqPQlgshg5ekrP7wLQG7gVqbO0dFRPPXClzz123ErI1TeP8rtE8qCit3U80CVHVHQycYJ3PDbfpumidOnTxeNR6FfLwUqbZEmGDLbOWc/5ja+LkufjPhBUanPz9iB30VsfojH43lzBiMjuZN0v/JndwLITfo5T/EMOl7uJ/y5556zPvv12V955ZW8v69cuWKFP/SD2RgfHxc+Kb7YfAOvvDsXgzs5d+6cLzluv5eB6HhKPB63To8QkSHDDjvf/tuPAPA3D8TKsBN2tXsQW0qJa984xttQwMbeZNih4llY
uXV7l7f7Uex/6m25Gyrt+NZff7ggzOsqGj94zZXy+hT7vCebf3WT4ZxrSqVSSCaTaG9vz7u1wg57n2aLR1h98/M81TaOGxRKNge1Rfqikf/9/S2u4Xv37nU9js0wDOtuuyub+YWWF24PSyaDHRvnF7dn9EN7ezu2bdsWWJ/KI5x7enq44YlEwkpvlld+T0/wg/05S22gRfJm/Z8dFz7Oz0tfkNMbiskGJlYEFuPll18Wlu3E7lSLlneR+Fc28++Vv/gaf3EZT3aQ+ihThlNOKXXjgw8+CGyL14IQlt5BKGbf+fPnceDAAU89stKbx6F/Lr540OsF2zRNnF2TW9m7detWucah/OVVRv316teDyhCxQxZn1kys3HYbQCh1944Ios94+fJl6wWChx8bVaUrkxsknf7397dYK9ntHDlypGSZdtnlplSdXgNnXukbxjNWMyrS+6abbsLs2WKL2s6uzbUJzvZLZKJ93z8VP93Eq09PJBIYGBjgHtnsdh3pxddSyGazuHz5srXAReXuvb/49joMDAx4njbGw36FDa/vyGQywgtMVfWR9rD29na0tLQAgOXTBNFZTt+F5cfuH5e+WMRL5/PPP4/n/5g/oGanmL9M5f3DS0ZPT0/eTllKvli5fVlZsinoC0N2GDrdxgQZItfTHDhwoKi+qPpRVNoiTTBUj7kFhVL/UW59p18MtmnD60RTN3jzJrv/oYUr2+sKbCdBxnXsk/q8UyH8wMYyGhsbS07XYv6Zvby77b73czU9r96ofFcppQw7N+uF0R9s/R+5hSte49/sM68MA+4bEFT5V3/6lX/h6nSetMDeA2XYoepZDMPAqdWD1mc7sV2zrc25pmkilUrh77+72bdsUVtUosIOlv9v/sX5gjCv+H5x24hgH1MQldnc3Oy6mCSVShWMF7OTPFVSae82QaFkc1BbpC8aGerjLwoZ6htFTU2Nr8UAGZdr/njhbnFV4vaMfhA5NaOYviAOi30HIk9OPB7n2qkyve3PWeqpKsXyxu5Ujw7yFxmJ5E+QsiAq23nXt0zZTux1VfQZveI7X2gyo/wBe7dyxpMtIw9k5WOpddVe5ob7xgLZwI7Q48HSOwh+0iqTyWD16tWBZJRKatj/TmVneWT/p0bkLYZzUu7yKqP+evXrQWWI2CGL1Ei2pDuAWTyRgRkvRJ9R5AXGbfePqnRlcoPUlaG+0bwdTIwTJ06ULNMuu9yo0Ol1dUkYz1jNqEjvOXPm4J577hH6TWo4W3B/MZCri34HCVIjwU74SKfT1mLyYjBb0w5/pKOjI5ANxbhycWIBgp92itdH2NOJnZBZSj+gqo90hu3fvx9AoU9Tis5y+y6maWJ8OPgVZW46h/sLFygCuQX6vME49vfY2Bh2797tKVuGfbJk7NmzJ2/hFiVfrNy+rCzZFPSFIbucOhcuXAig+LhTKTv7eTuzGVH1o6i0RZpgqBhzY8gYw6XUf5Rb30j/OGKxmNJrFAH+JPuMGTMwefJkAMD4EH9jnQgyxnVkkRwuzY8uhqw5C6ccNg6kagMvlbZcVMZYIuVrExYAjA/xfXu3dFXlX8U7cmPm9rkIpp93O4IMO1T6iqmRbEEemKaJ5M/Hxj/+8Y/j3nvvxYkTJ9DeLHZDAxX/QKUdY4nCPHebK7DjtiCQV5btcexzFjLGlr3aJLd5RLeNMxcvXhTW78yb1157TViGX9lRgJLNQW2Rvmjk/k/f7hperINl381cwo/DC3eGqTyBg+H2jKUQj8eL3mto12d/PpF7z+0408htV2tfXx9GR0fzrjtxprdMJ09GuorIWPoL84qWl2LPx9PH0lP0mD0/smVRTDYbDC/FDmd8r2O5Ft0/AwsWLACQXy7d2gCeLeUuN37lONs7t07ZGXbfpxZLseXkyZMFYTOXmHj//fcDyfVTdgDvxSsyy7YzXeffO9UjNrjOtJMbl03K+9vP6T1+KXd5DVp/vWSIhMuwQxY33XVDQZifHUDORZ9B/Q2RZ+zr60NdXZ2rfc4y6rZoS1W6+pHr9aLlR0aQE0dUlicqOsN4xmpGRXqbpim8SOrGZZMsf23VqlWWHNM08wYJ3E56BIAFRfpNhlv7U8q7wJw7+McQ9/T0eJ6oVAqXLl3C3GW5dt9+up3bUcQMZxtv7zuKHXPthao+khdmGEaBT1OKznL6LixtF62YoUzn3R+/GWfPngUwcVKpaZrWkb0AsHlz4U68VCplnSxD5f3DTQbbBWwv35R8sXL7srJkU9AXhmzZOk3TdF3gzMrszCXim63cdDEGBwfR3NyMQ4cOFSysjKofRaUt0gRDZTvnNramSl8p8YOiUh9Lv4GBAdc4fnxhds2MiIy5c+eipiY3bbPQ5heVeiqqjHEdWUy9JVnSO0Sx37iVdz9jsTw5XteOy4RKWy4qY/GHbuSG89KpmG9vPx1P1BaRuDcum5SXl7yxe9l2qPQV5y6bzA1fcO9UmKaJmpoaGIaRu25nrtgJP1T8AxV2sLq9+ENzuN+7lYlYLIbR0VFrrMVtM6JbO+32bu7F/PnzuTqYHlltabF5ZudpPIODg1jhmMNic6G8BViiUCl/IlCyOagt0heNPPIby7nht31sutUQF1t1dSNfBDfcGcbkXL8utnpOBLdn9AOvUrLjg+3YF2oE0cfDbTI3Ho8X2OfcOWhP75GREYyNjXEbxVIcKftzljox5yetmL0rPr8oL5yVTdn6SoXJPnjwoDLZbthPGhF9Rmd8XgfK/v/Il5dizpw5BU4aK2fr168vareMPJCVjzw5IrugDMPAx792txRbDh06VBDm1raK8MhvLOe2WQz2XOz+RzcZsmF6Fz80PbCchR/Kd7q9BghEKXd5DVp/vWSIhMuwQxYLPzyRv16DB/F4nIwvYZ9EHRkZKTiKdHBwMO/v1tZWrpOuKl1LketM+2Iyjh8/LqzDr2wVFNN56tQp4WstgujTyIVKek9ePIyGhgYA+ZPfDNZePP/88wByd4k7rzu87aHik/NevrHfQQp7uzXvgVquzDNnzghdreOHnTt3YtFHpgDwf3Ug76QJt75DdJBGVR/5iV+/lxt30Yen+JYhok8UvzLYO8HSR/gDeDJ0fuQrS63Pa9aswZkzZ/IGqgcHB9HZ2elZXqi8f7jJ4J2mQMkXK7cvK0s2BX1hyJats6mpCfv27fPsW25c7v+UPb9jR6x9YeNYdh75jeVS/bJyQaUt0gRDZjvn9EtkjQGpjB8UGfrc/Dl7+vE2ZvlhaGio6KJo52Sck6Ufn+X6nd9JQxnjOrK4cTmsBTEiFBuvd5b3zs5OYR08OX71lwqVtryYDOfz3/uZ3CZQP6ct3PEI/xpWFnfVqlV574uq/Cs23surN7znkGGHSl/xll+YmrchAsjZbR8bN00T4+PjmLxYbHMGFf9AhR3pdBqtra249zPzC75zzm06ywVb+FzKXAHv3bxYG+5c5MT7XgQ3WcU2+jF6enoAAFu2bMGDX+RvfGYbmvzawoNK+ROBks1BbZG+aOSFP93FDf/Zf3rXc1WmvWC27eQXdl64M4zJ37Bhgy97S8H
tGf1w+PBhX06G/SWW6ZPtnNjl+Z0Yc8sbN7ki2NPVudDAr0yvvHHKefd/5wbTnCeCiNgfpCz4lX3q1CkAuZcItivOyfHjx9Hb22v9ffr0aV+y7dh3pNpPGhF9Rq/4NTU1uOOOOzBjxgxMmTIFh1dPrGK0pzsrZ8684cmWkQey8tFNjshVGGv+bI8UW+ww3X7qbzFe+NNdeOuttwLLCILXzorT63qd0blx7deMMJns74tbvHeBBKHc5VVG/XWTIRIusx0RxVnfLryR/6LEc9DZVQluK61LXc1tp9RnfP755/Gzn/0MQM5XYCcIOK93uHLlCg4fPixFZzH8yC3Wt5ajPy0nxXTKPlEhjGesZmSnt9uO62Kwft05Ycb+fv/99612LJ1Oo7u7Gz09Pbh48aIV5/S6ngK5pfry9913n+t3vb29Vrt55e1UgY6mpqZAJwp50bA5V9f8Lhqxf8/ae17fAYgP0qjqI3/8exu5/r/T7lJ0ltN3Yel5ZDX/yqJDhw75fm9107npL45xw531KB6PF0w2MPuovH94yairq8t7pjB9sSCyVdpN3S9XZUfYOrPZLJLJJF566aW8zVJ22nb6O6W4GCLjSFu3bvUVlxJU2iJNMGS2c06/RNYYkMr4QVGpz55+sVisYPMGUPw6gmILQtzk2WUcfbHLLXpBXDeCjuskk6WdDsKjbaeBuro64d8V088r78lkkptHQ0NDrj4lb65J5Ukjon5yEBmidoyPT1zruGbNGsRiMWvTwvvPXLa+K7Zh8vBq9wU8hmGgu7u7ZL9VJO7FN0aKni5i51/+I/+KDxE7VPqK5zflb9i8/fbc6QKnX+3Ne6677roLwyfmCsmm4h+osGN4eBgnT57E+880WWH2cV7nya288uI8HcfedrudOnJhc7AxwGInjfide3IjFothbGwM/f392LdvHzcO03HlyhU8/3/ttMITiYTV37G+0rnBkcE2I69bt64gHRlUyp8IlGwOaov0RSNupJKpvNMLnIie8BBFvO5Q9YO9U+vu7g5sj73jZxQ7Ms8vshzJUsvF5cuXcfny5YLwYoPFMp1Adu2KTN599100NTUVhLe3t+flXSmnkzh3nhqG4fuOei+cZWHSpEm49dZb8dBDDxXEsR+5VSk470gsHtd/+TNN07UDZvA636BXJ7nx/vvvS5lMLwbPISpVJ9fBinhXdP36dZw4cQIdHR0YHMiVj0rvX4uR9/xm8SMonXc/Xrt2DbFYTHnZ9kNPT481uFFfX28tJnQuGnnzzTc9r4lSgZ/0KTaoVoxYLFbyjiGK8AYeNdUHOwlEBnbf2d7WrVq1yqpr7733nuUL1NbUFpwEpxxOl3Tt2jXrtBRlaou0P36f3356JoV+AcgNfL377rthmyEFN5/l+PHjiMfjGBsbswarVWDP03Q6LeTLU4DtJtRoqMLa0MHBQbz88svo6emxNsrYKeWkERG6urqQyWSs0wOotOcaTRB0OS4Nv+2I3f/w85tSjuj3IzcMn+S5554ru04ZDA4Ocv2i0dHRgjkae/0ZGBjA8PAwWf+P2cquJFTFqlWrYJomMpkMhoeH0d/fX3D6tDONRDZE2H9b6kYKPzz99NO58XFjQi+b3Oe9OzOc42xhMz4+bp30cPbsWcDw1x6wa2o0OYqlhWmaBeOpV65c4cqR4ZP69XlnzpyJadOmef7O7dnY+E+xMYzdu3djfHw8b3O6m03JZBLHjh3DT37yE25/99JLLwGYuPr1zJkzAHLj1UBu8Z6GJtIXjTz+Xx7lhi/6lIkLFy4A4Bfoa9euWTvhFn2KX1F44c6wcnTmbs9YKsUmDJg++7PJuBuKR7Fdr255A0w0lM4Gc+vWrdi4cWNR3fZ0Fc3HV155JU9Gf39/QeNmtw8AHvv+soLvRXUzfUwXW3ghoxzyytno6Ch3IYdox1+sDDP7//Zv/xbf+M+flCobmLD3Fx5fyE2r+746Jy+el2wZ9VFWnebVVaCw7DFGR0cLFmr9+n/8hO/d5yMjI3j99de53zFnwH5cmlf99cvj/+VR150SH3zwgW8ZMuAtHrnvV+d4/sZtIZo9f+784lTub2VM7pajvI6OjqKjowPDw8O4/9duwrVr11xXCPuR7abPT3g6nYZpmsLPLbuftXPXF6fl/c36U9M00dbWlvcdKyvpdBqxWEx4l5AXsp7x6NGjAArbHZ7zrSpdeXKZPQcOHOC2f+3t7VafGYvFitrGft/a2lpwD30p9qnGTz/b2dlpndwUdIAkjGesZmSnd6mLJZz9+quvvorx8XHuwJf93YGFPfjgg/jsv7mnIFz0RA5RFn9uUoFMVQtaAeDuL88EULyeOdPN/v9dX5joO9iJfNlsVooPLqOPXP4rs7m2zH2Iv3hARKdq38W5I8o0TXz46zcXfG8/ieb999/HiRMnMDo6ag0+1dXV4fz58wU6r127lqfjy7//oKetbpsI7H0ZlfcPnoxEImFdE2maJl599dWS9KnsV1SWP0plW4W+MGTL1umsY0NDQ9ZYIKtjiz7lvXmo2I7mYvEAYMeOHRgZGcHRo0fx9T95pGh8ilBpizTBkNnOOX0BWWNAKuMHJai+TCbj6s+5pZ9pmujp6cHZs2ddfdhsNltw8mepPPjr81y/8+uLBhnXYcjaaCqjXKqUu+hTJlKpVN4kscqTRoKUYbbwRWZ/YJq5q9zYIqmLFy9a471sQ/jo6Gje2H2xifOPfKP4xlr772T6V2zRUCaTwV1fmOZ7on/jxo1574Cl2iHzWTo7O6125dChQ7j7y7mrZp3X06z46o0Fv33o27f6tsOPLeVCpR2P/FbuZBa3K2BExijc2mJ7+NyHx5HNZq05clEmT56MZcty85lO3xmQ48eeOHHCd9yPP7EYsVis6IZFtvjqwIEDvmWXku/Xrl0LdSEKlToDBLdF+qKRzqbCQhKLxTD282D7YJ3bqQJjLuWMF+4WVyW8ZwyC26Qvm6DcsdnfZF858JPe9tNQenp60Nra6utkFLd09eOYscaps6kPiUTCGsh1sw0AeluHrUE053d+YTazkz3sE1lBVnCuX7/eVzkrdTWjSBnubBQr707ZvAUUjMHOpBVmjzcc50/O8+yWUR9l1WmeHLd86ezsxPDwMC5fvpy3W7XrSh9efPFFKfY4F5/IaC/9ppXXqTGy0pu3y3eo29+COvtxcfb/TdPESJw/qSRjJ7jK8ppMJrFx40ZMmjTJ2hnbfL4bhmEUPZHGS7abPj/hf/M3f4O33npL+Lll97N27O2LYRh49tlnPeOz8lHs2i9RSnlGXr/CFi3yVoE7j1tVla5OuQMDA9YR3yzdeCcZ2Be1+bWtlL5VZXkKojObzVoLkfze+RlEn0YeKtK7FD/U2a9fvHgRIyMjrruleAMhfW2Fg7+XLl0qCHM7NlSkTrK4Yz3Zsk7Mjbj4lcVssKeXs++wf967d69vW2T4srz4iev8/HHz/YTeBcrgazuP8R3sGrfCM5kMXnvtNes75vsdPnw4b0B7eHgYw8PD2LJlC3bv3o2ms22or6/Hjh07kMlkkEqlYBgGrl
/j+0T2/GZXZjjrT01Nja/n8YMqGcxGYGJSqxR9KvsVleWPWtmWrS8M2bJ1Ok8g5i0uLvbe6rcP4cWznyzCdHc29bn2Z6o2bMmASlukCYbKdi7oGFA6nSbVf6jQ5zXO45Z+rG81TRMnT54sOAGtr68P+/fvL9km0zSRTqfR09OTO5mpO4kFCxbg/vvvL1lmkHGdgYEBxGIxbN++vWT9dlTN5bjJTSQS3BOtROSwPkJ0A4sf/JThvr6+gj4tk8lYmw9k9gerV6/OW5jNThphmKaJ4eFhnDsyccL70qVL8cADDwDgj02x8X8eLG2z2ax1qqxM/+q5557D4sWLAUyMB9p9D7eNE5lMBl1X+wPbIfNZnD7TcCyDLVu2AADeeust67uh6+mCZxzoEDtNnop/oNKO/vbRvL8Nw0AsFvO9qcs0Tdf5R6dfefHiRYz15RYxNTY2cn/Dw+1k1I6ODteTRkzTRCwWy1voN23aNBw/fjzvlB2v5+L97/y+36VMyRjrKSXfz54963lCimqo1BkguC3SF40c2ly4UmrDhg3ob/A/oOgWlxfuDCvHACTvGUUwTX9Hxv7FX/wFAKDuvXbMnDnT+m2Yux94ecB2MtkxDAMbN27MW5RRbNJy10u5lWzZbLZgEMEN53eHNl/AgQMH0NbWVtBoOwfO69/vKji1QzR9nWWBd1qJ31Mj7Fy4cKFoOTNNE2vXrhWWDfDLsNvijkNviJV3kfpx5VA/N7z7NN+Z5MkOWh9lybDLcXa+9rI3NDSEN954A83Nzdb39fX1VvyjWwuvVXLj4sWL1me2Wx0Azp07BwAFi09E2mE3vNJq/vz5XMdIREYQDMNA23Hv4yOdtvHidp/hlz+3CTMR3J79vffe8/wdO03i2LFjeTLsiw57enpQX1+P2tpaZDIZpNNpNB3oxZQpU3wfUy5Sx/yEZzIZjI+PC+e5zDLizHPWvrAjNU3TRDwet67+4p2IAUzcRexs551X2fhFxjPady7x6l1ra6t0nTyccjOZjPVy4rXw026znz7P+dnvwI+q5w6is6urC/v377fKVzKZLLks+dGnkYvs9C7Vt2f9uv33xfxop39ycZ/77hn7Lg2ZV8fE69QdPcyj81TO37efHsJbQMPDWgzv8A3sC1dFrh+V4cvy4refGOXEdPf9RHSq9rWdp7qYpolrh3OLCtnfbIOAsw9mf2/atMmS19bWht7eXhx9sxF79uyxZDz33HMwDAOntjXDifO0Ezusbbb7uFTeP1Tmo6p+pa6ujozd1PNAlR1h6ty0aZN1WpOTuro663N/g/8d3aJ9qHOnvGEYOPzGRVe7ZF4hJxsqbZEmGDLbOWcfFnQMaGxsjEz/oUpfKpVybUf8pt/o6Cji8Tj6+/sxMDAQyG9mtrBJ+rq6OjQfGbTaKOeYm58xOCDYuI6MsTA7MsYmS5Xrp89wyrH7n0He2d3wKsPt7e1ob2/Hli1b8vqv5uZm9PX1WXMtsvqDjRs3YnQ0917h3GDHYOH2dJoyZQpmz54NANYiBmAiva8eHsAbb7xRoNP53squsZDhXw0ODhZspug+k3Q9VYJXj0TKqmgdKyWus/x2nhpDe3s7DMPIG+NsPV542kLjAbHJdCr+gUo7nGMh7NRKuz+YSqVc2/TOzk5s27YNQP77Ka8sNTc3u5YnZ/xSNuawBSHARP9hn7OdOXOmdXKnfT7JT5vonNPt6uoCAJx8m7+Izs8JLUNDQ57jxaXme5BN/EGhUmeA4LZIXzTiB5WLHsJcUOGXM2fOcDtKN5LjScyaNcv6m01sUcQ56Gdn7dq11lH0PGKxGNLpNNasWeO6gs3JmjVrCo42Yruriq/048v0GjR0u57CuYP62LFjSCaTGB0dxerVq7m/KZVt27YhlUrhpZdeshp/GQ2ivUFnaTB9+vTAct0m6Z3pnJfHBgriRBFePchmswULidjEby4d/Mt/8803rQlH+/FhbEEKe6FgcSoNbtnyURW8JrBN03RrGkqiubkZ6XTaWqHf3X2dG895NJ3zaLfjx48DyLUtdpijdujQISuMpcvkyZMBALW1tUrvBvVDMpksWMBQTux5nkwmYZqmdQoH+469BPhtT1k8r35NJVeuXLHabZnX5pTC0NBQ3ukGdpwnAsViMavcBoWd8hVFnHUyHo8X1G9N9SDrxXbZsmUFg2BBfCk2YOg1GF1slwqPTCbjazGZPApleg0Qug2MOr/3O0hfHgptoWObN/Z05JVZP2WY26+YE+WTTbLk3hP914l4PG75ZLTym4/93vW77roLM2bMCNEaPiIn82gqDy8fkC0oYfjdPOS1UNL+9+XL7pszTEy0P2wDBoPySSMajWqiPibnF5H+3WsyjE0uBr3GRXRMwjCMUI/kD4rzmmAeQd9rGE1NTULjJ2wS2W2uQzXt7e3W6Rt27GFsoYUM2EnxnmP2NkzTxNjYmGcfy2A+tX2RKJMPFG5+CMprr73mOs7ndtKI28kKVDBNs6A88G3Nv66G+jsMBUzT5M779fX1uZ48bZom9xpg5+dSF5ulUin09PQUnCLN08H+ttdXXnk2TTNvbrmUMs82bNgX0zjLWEtLi2dbu2bNGmntOoNqvY0i0heN/Nmmb3PDlz3h7+QIZ9xi4W5xgdyKaBUveG7PKIqzk3Rj2RMmEokEiYLvld7FBlwB4O2333ZdePGx35uFd955R2hF9vDwcF7D/Webvp03OAhMTLY67fnt//Vx7jN4DTo4j4//oxd/hWvX6Ogo+vv7rYliEdgEv1s5Gx7OnaRgv1sRAN59911s2bLFlyPgJtv5vHPmzMGfrv+GH7OLyubxyz9cxu3gPvZ7s7jxebJl1Mc/2/RtKVdPMFvcJk6TyWTekdE8/su6XxfSaW/j9u3bl/eC6lyc4lV//eKV3s7FU6XIEIE3YfPZ/3iLrwFG5ySX3bH62L+fJa29fe2115BIJKzFbff9Ti7/Wf11DkqyY9ReeeWVvL8ZPT09+LNN3y5wvNgxy8BEOzZz5kx87UfLhZ5FpI6JhE965FpBmyVqR6lcvXo1L52XPWHmvTg6HWR7HWX91aJFi6TZwwj6jO+++27e384jcYHCtkhmutp56PdmY2hoCOPj4wXH8PLaA3bKl92+//ra4546grzkqnpuWTp5R1Oq1KcJjuz0Nk2zpHttnf26fXKc/V2sD/jGXzwgrNeJaJ+57An+Ma6qePSp3D3wxQZVnN/Z/7f7pkEWjcjwZXnxH31qPtcWN99PRKcsX9sN3o7ZL/+3u6y/s9lswZUrvIll54Kp33/uiwAmrsJgv/vdZz7HtcHOnDlzACDvpDa7XtVpUqqMN954A6Zp4o477ih45/i/N3zTUxbbFcpOv1PVrxiGgf+28Vu+48uoH7JkU9AXhmyZOt0W5TnbLxnvrUB+3bYvsm9vb7feTQzDwJ+u/7pl2759dK6HLgaVtkgTDJXtnOoxIBnxg6JSn6y2SASeX/r5/7w4b9LQvkiUUeyaaxnjOrJwpuuqVaukjMEVyy+/7x92OfbFENlsVsnku1tad3d3o7GxEbt27SpYdG+3o
6urS8i34jE6OlrgK/IWddttWPaEibfffhtDQ0M4efIkDh06xL2qPB6P4yP/Zpr1tz1N3eZgVPhXbL7B6Yt4TeiLtAEy6lKxuKZp5i2y+uST8/K+A3Ll9HN/ckvBb7/2o+V58YLaUi5U2vH4Xz7IDS9lE4Mb9sUoy56YeJ9k14/t27fPOi2cMTg4iL6+PmzatAnZbLZgPN3LnqamprzvY7GYp29rf3/28yx27M/jxG1+1+9muVLzPcwFUlTqDBDcFumLRv75D9/lhre8Uziw47Y61x63WLhbXADYv3+/klM53J4RQMGpF2709PT43mXT8o5R9MiecuGV3gw2oOZ2vOdzzz3H/c3510Zw5coVGIbhe7GPMz2ceVNfX28dvWSPH4/H8ebf5E/WuskEYN1rzY7kYwsM/u77r+XFY3az5zZN09eRTHZeeuklABPPYr82gWFfvMImmzo6OtDW1uarjLiVYfuRgwMDuWOhn/3jnQLWe9cPJx/8n5a8v1nDXr+ef6UPT7aIPjf++Q/f9V13i8nhweou78XOGW/Vn3hfUwLkFhzwTje4cOECBgYGuJPHgL/6W4xi6R2k/Mng2OqYcDvpnHg4/1qwXSF2RkZG8o6DY7LZ8XUffPBBng32K73sf9ud/3/+w3fxzDPPAJjYBW7H3lccfeF6nvxiiNQxr/CxsbG8K3ea3861j+fOnSu4FkxEdimYpoldu3ZZbbG9HrS2tnpeIbZr1y4AKHjxldEXl/KM9vLhXFDE2m/7KTXOPlhV3TvyfLe1wp63OMg0Taxfvz7vb/v/APDMk+8gHo8XvcquFFS2OTJ07tixo6z6NMFRkd6llH1nvy6yaGTBggUwDAM7fzwxaOfls7rJmTp1qqjZaHmnvO80p1/uB1B8l5Xb5KVhGAW+aan2y/BlefFPvdRXEHbt2jVX309Epyxf2w37uyNL+/3/3FrwvSjrfjhxGpV9Yej6/36kIK6f/GR2bN++HT/9/beKxi+GinS1X6Vnr//pdBo/+PRfc2WMjY0hHo9bA9BsAZvKfkVl+aNUtlXoC0O2TJ1sEZcd3iRYsfdWt0mmYnWZvZfZFzAbhoFn/3hnwS7yoP1UIpHA22+/HUhGMcpdhjVqUNnOlWMMKGj8oMjQ51bfZaSfKGyhq53Dq9xPaQoy1uMWns1m8X+eyk2k866kDwovXVevXo3BwUFXfX6es1h++T313SnHPn6hYkKSlwdDQ0NYvXo1xsbGrEXMXqd5BK0Hb731Fv7x93ILiHn9Ku+5WTo5F24DwMKFC/N+f2x18dNd7ItyZPhX9nFUZiN7p7O/f1y9erXgt8yO6+9PK/hO1A6ZviJ7pnvuuQc1NTU4/Up/3kYIwzCwatUqHF19veC0593/dBW9vb348z//cym2lAsVdrB03PG/LwHItXP2K1vc4nt955xbSCQSGBwczBsDt7ct2WzWqtu9vb3Ys2eP9d2lS5e4ssfHxwts4S3qco67837n5xmdYbt3784L543ruM3psr7t6NGjeeX1rbfeQn19fcH8BpXyJwIlm4PaIn3RSG8nv3NP2YKLFdKUiz/CC3eLy/So6Mx7OxPo6enhfnf69GmYpomnn37aU0YsFvO1mODv//7vrWcMe8EI4J3eQL6Nbicq8BoP0zQxNpC1JtadO6j9Pjsrf6zxsTe4zkGA6639Bb9PJpPcfHn55ZctGcCEozYUz181x8Lt9t50002+bGewht1el5zXi9hXmJcy2eRWT+315dKlSzAMA72dQ0Jlr7fDvZDYnTIAGOlLFTg3pmlifJC/+ptnt9uziCBDhl2O32N1efR1Dnm2W/39/di8eXNeOfV7Ok+x+usH9oy8o9H8rk6Vld48PaP9/JOM7LgNLDJ5rC2Shd3ORGwMmUzG92Iynh329LMfQcqbJBzpy29v7dcY8RCpY17hly5dsq5JAnJlb2xsDK+//jr3hcyvbBmkEt7ll4fbZG6QclLKM9oXhLid2vWTn/xEqk4/jDrKme8+27bwpb8r9/LOXhTY8eV+ZBVbaKqyPIWp075jIYxnrGZkpzdvcNgPvHcE52Ix+ykNvP5vqGccxSg2SMKbYOPtMrPbbZomFi9eXFS3DMYGcvXEns68NOf9zf7ZfVP784q+a8rwZZ3x+/r6MDaQLXj36unpcfX9RHSq9rV57wIjfbmFkGwhlNeCHh6GYaC/e8T6bPeRBq8XLrh1K+PTpk3Li2MYBrq6utDbGfwIeFXpytu0kUqluGWho6MDHR0dOHr0KGpqaqzfplIpdF6LY+/eva7+RqkYhoE+gfQLWj9kyqagLwzZsnSm0+m8cga4L24UGXfy20cBuWs7nRiGgb6uwk1ar7zyCtrb24vuTG9sbOT6ops2bSq4clQ2lMZCNKUjs51z9otBx4AmT56stB2Wgeqxg3KTzWYLxovY+zbvZELR8XI/4Rs3bkRTfauQfBF46drS0oJt27bhH/7hH6TKFYFN4DI5zvqkcp6JwTYKXbp0yXPDlfP04b6u4L7pQPfEBrpi70kAf66P/e98F3Qb57enKetv9+7di46r/q8Q4pVhu9ze3t68+QZevvJ+DwDjA/5PxxQdOy0lrn0hPACM9efbZxgGkskkxvoz2Lp1a953w73JvM3CQW0pFyrsYGXg2oXcfNvQ0FDeohHnafT2Dek83y4Wi+Ut3gdypzg2NDTkjduX2kbxxnO8/mZ1iY2v2n1Zv2MgPJxtkp/nYWVu7dq1BX2bYRi4du0azp07V3CtWyn5rqqN9guVOgMEt0X6opF7Hr6VGz791onC5lzp5hW3WLhbXNHTHUS45+FbC3aD8/SyHeSZTGFD7ZehoSHc9sCNJf1WBW7pbcc+6Of2gm1fzAEAzz//POYsqYVp5u6aFunEGDt27EB2lv8KkZpe2MgPDAzknTjBW70HTDS2C++dkRd+22235f3ONE1rYELU0bbXJa9rHUTuYuTJ5mGapjVAe/dDi6zTT/yQnN6X14G6Pbdpmrj5Hv4923OWTOKG8+wu9ix+kCGDyUkmk9xnfv311z1/G4vF0NfXhzs/erNnB9fY2IihoaG8OsKcgGJlzF5/2ek5orC02rhxo/Bv33///TwZMnAuVJm/bKqvumZfvGQPM00TNy6dpGTRiGmaBW0o01Ns94Y9v93Sjzd4ytKD2WB3dHmI1DGv8J07808omn6rmee8dnR05B35zmCrqb3KCLtyzI22tra8BSuM2bNnY3x8HNNvNa2TvuwTbOxEJ94JZfY7Q3mn/JSCzHoQts55yyZOGnBbZc/j6NGj1uc7P3ozgInFMOxkJmdbxWsf//Iv/9LTPippfe3aNVy8eBEtLS1SytHmzZutRa1hPGM1Izu9S1004uxTWD/m5r/a47CwRcsLrwScNaswLOjL94IFCyy7/LxPyGTunVMK0sQ+4GHvp3kT5JcvX8Zsm28aZBe6DF/WGX/dunW4/9Gl+PCHP1wQ1y2tRXSWw9d2lq8Fd0+3wnlHgTsHpnnc8ZHcYDU72cAwDNTU1GDxg4UL+t3kTJ8+PS8OG9i66+d9VhDsaWKapufJ
Z35kMNh7qnMCnFcWnn32WetzbW2tVf4HBgaAOcO4ePEistksNmzYIGybF+PTcotG/Sx89yo7vPdgamVbtr4wZMvS+dxzz3EXjfDGjET6iWKLRniwNuX8+fNob2/HlJvTaG9vz4vf39+PnTt3ciftrl27Zm0k279/PwYGBnD8+HH09/db78mdnZ0F+tnmKPu44fXr13H+/Hk/j1oApbEQTekE9QMYvCPfZfhcsuxTRVB9Xj5uuX1WO3a7itnhx08XGddpaGjAwuUzi8osFbfnCbpQVVZ+OeXYFx2omJC050Fvby9M08TkyZMBFM5vsUWKznmTux8qvI7EL6wfvuMXFgAofHd0g6UTbwGoc1xnho9qam1+GxnB4gf8b8DlleEXXnihwK5sNosbl96Qp8vrMwDMXpp7B/TjC4uOnQaNm81mMffOyQXvRaZpYt5dUwvKq9scjAxbVKLCDvu4BHuf8Ptub/fZxsbG8sb2im2mLbWNssurr6/P+85+Xb3TDrYQhr1P2+sEG5Nnc3h1dXVF7XCmkdfzsLhsntW+eMa5YYQHlfInAiWbg9oifdHIL30vf7CKZfy9vzQXDz74IAzDsDo+N+Z/xH+4W1ymW0Vn/vnvfqhALm+V2bFjxzA6Oop//Md/9Jz0L8adj6lz1ETxSm+Gs4HkDVzv3bvXaixOnz6N4eFh3PbIFADApEmT8q538UtjYyNm3jdm6XDmvzPMz7M4cT7TfV/MXzk7d+5cV7v9Hv/FsNcl3gQoz65SZNvhOUqf++0HPI9MTyaT1o5wALjjsZm+74lc/oXc3XvOzoKVBQZzinl2uz2LCL/0vQ9LWWj2S9/7cN7gKzCxOMHPkY7pdBqf+c0VvvKzlBNm7GXenmcisPS2d/aMYi9RbFeEjDxz6mXc9ZnZrhMJXg6J/bvbP+n/+EG/9jHZ8z+Sbxcrd8WubGHXRQET6ee1OpjpW/a5Odzv3RCpY17hznJgf+6enh48++yzaGlpQX9/P/r6+qwVxa+9lrvy6+NfvxMA32l9++23PQcTuru7uVdBDQ4O4ujRo3n1wG4ju9ap2IIqPzsu/CC7HoSpk5Uzt7rvJ41WfCk3QOFsQ91OJ+Nx9epVbttEJa17e3vR3Nxc0sSgk4GBAZw7d846uSeMZ6xmVKR3KW0Ja8/YAiw2+eb057xOY3jwKwvzwlasWIFHH320wLZiE2/FsO/uvv9LNyvZtcjDMAzc8encu5Q9La5du4bW1la0tLRYYT09Pejt7UVLS4t1eiAblJl+9zDi8bh1QlKxASE3ZPiyvPfth35jCQ4ePFgQ1+19R0SnLF/bDZ7/cu8vzc373qt/cS4EZp8/+a27C3w/wzDQWVN4Ralb386rS4sXL/adJjx/my1EtctIpVLWIkAReHbw6pZhGJj/kdyYhds7SU1NjeVf1dTU4I7HZlphvBPaSsUwDMy5P7dYxNnn8/BKa+Y3+o0fJK4MGaXoM03T12J/Kr6OF5lMpuBdzPn+GGSsBijelzrr9oEDB2CaJhKzr+aFX7x4EcPDw3n22ndAX7161arfNTU1iMViePvtt9HZ2Ynr168jnU7ntUvsyku2ON1+bePAwIB1PZTohGm5y7BGDUH9AAZv0UipdcmPPlnxgxJUn5cvJyP9SsXenk25K8ENF0F0XOfBryzC+Pg4t1wFxS1d+/r6XMeTvSYXi8kVhcmxt/9uC5llYM8DwzDQ0NBgTWQ7r2xg/YWXDFFeeOEFAMCj37rH1/N59dUszdj4L7sSaP4v5MezX3Ph3KALAB//xp2+7bc/O9M/Ojo6MQ47f771efEnp+b5HsXGsG77RG4O08+JyaJ1rJS4znfQJZ+anvc3Y9lnC8eC7/+y2Ds4Ff9ApR32MlyO8QlnnSk2X1xszME0zbwxU6ef65zHtD8j2+zJ3kHZ3LZzPAnInb7Hs8urDWD4mePlPR+V8icCJZuD2iJ90ci//FH+tSKrVq0CAJx/bWKAfMqU/AlhJy3b+BWBF+4Wl6GiM//ZH21HLBbD0NCQdVzRK6+8gpaWlrx4iUTuiP5EIpE34SfKyZ/fUV3qLkSZFEtve6PktmiEOT5///d/D2BixRkrI/ZdTk65bjBdDRtGrMaIDUq47XZ2PoufwRjWgDI5e1bynQZ7YxyPx4teCcHjX/7o3aKDBmzCmXUQfsuHs57yYM7wqv+8y7UenTp1Cj09PTh48CAaGhrQ0tKC4y/GCnYPufHBP7dww89vyJ9Qe+6551zt9vMsxWAynCfglCqHd4KFHwzDwOo/3Q3DMLBv3z5uHF4es5eGYgtBitVfP7BndL64MHj2DQ8PY3R01Pru6X/1IlKpFI4fP44LFy6go6PDegbRNs7+gmGaJo69mBvULpbm9t8424i61xJS21q2szWbzRbNAz8nB7E88GPj0ee789qjYukiUse8wsfGxvJ02Z+b7Sq9fPkyduzYgcOHD+PcuXNIpVJWm/dXv5NzWt0mNryuYLHjbEPT6TRathXet1oKQXeaiLRdzmP6SkVGe8nDWc4Ybgu1eGV381+ccpXPXlp4i1DtnDt3jruIV9Vze+Gm89ChQwX+Yik4r0kM4xmrGdnpXaqPz9rWM2fOWD6v26IR3kJqANj5j415MqdNm4YbbrihQNeNN97oarsbCxYs4IbXvZbwvcA4KKZp4tTLvXlh7FSrdDoN0zRx/fp17uJh3nuDfbC4lD5Ahi/Li//m35zlxrX3eaXqlOlrO2loaEBdXV3Bu+OBn00MiPN8NXt5d042M179fw/n/cY0cydaNr9TaAdv4byd8fFxxGIxS++q//Se67Oapmn5L5s2bbLC2cILdvqgPU3stre2tvoeO+ClK28nGZArC9u3b8872t5+dLizjTj+Ysw6pQX4uQ8lof8CgIZNufcIP4v2ec/Y0tLiOnFDpWwzWlpa0NeXO4lTVN+6deuQzWa5p9xu27ZNyA4VlKKTt2iE1x+IvLd69UOmaea9XznjsiuoOnZNTAqlUik0NDRYE07Mvueffx5AbjFhTU0NNm/ejObmZly9etUqi5lMBrFYDIcOHcqbbLQfec5jfHwc2WzWGj/1SznKsEY9MvwAgD+pHnQMyDRNafapIqg+L19OxhiaDLzs8Jr4tiM6rrPrJ434m7/5m6InvZaC2/ME3cwnK7+YHGe6Bhn/8cLpE27YsMF1TNg5eQvkfMx/+N3NJemuq6uzxu9e/uGBvGsw7Cfz8p7bLZ0Yd9xxh/W5+W2+ft6ikebmZmz4/47yf8DBnn6rV6/Os9k+9pfNZlH/2pCl124Dzy4gN88EuI+Bu9nhJ7yUuE5bz7zSzx2LO7q62/q8bt06mKaJPSuvCr2DU/EPVNphbzNkzAMU2/TtbKPsp5Z4zSW5Xcl95swZ7tUz7P9bbsmdQOSV77zvurtz5Ydt5HS7bpHX5voZ//Uznl5Kvqtqo/1Cpc4AwW2RvmjESXt77m4oGLkC0dvbi3Q6jbvvvlu16oIOoLu7O+8o9KD09vaioaHBupOJXb1QjSSTSe5gHvts56abckeMLViwwFrVZo9jH5wSxshfAex
0cLzksqNC7ThX/LHJzolGOD8+r6MGCid3/PKzn/3MV7wzZ84E0sM6A9M00dvbi5ER++Kbws6H7WQ8dOiQlc/xeJy78MZ55Q9g7/iKrxRPJpNlm1hgV1YEwVnOYrEY99QDL2pqanDuXOEuSIC/qryUq2Jk88Ybb7jm5enTp/MGysZGR5FOp3H27Fm0t7djYGAAW7ZsAZC7486N06dPuzrqvMVhPEeFtxPVS4YXfstlKpXKKxdO2V6n0Hi9NPtxxBhBFziIwhbt2WF5x5zdbDaLhoYGXL9+He+++y7+6q/+CpMmTSqQw4N3HzkPNrjLYC/eMhaNMFQv5vzZz36GdevWKdURGE4y8k78YNjrOXupsvfXDJa2W7duhWEYeOutt/LC3eJThV1bKAOZ/qwmfGQsDLdPTrkt3nJO1InAq6Mi8AZdVbN9+/ZAOt12lLG0pN7mMLx8q7BJJBIYGhpyXfjB29XJWzTitqvKuXii2G5Ct/DBwUHs27fPVz3IZDLcU0Psx1r39PTAzOZkx+NxqzzV1dXhjTfesN7NSsHunzoHLi9evIj+/n5uG1GYjhOLng0jdz/6e++5L5bxi/NdiXddYTF27tzp62obCrz55pu4fPmydXqRHzo7O5FOp9HX14dMJpNX7lh6BTnJNkzcFo0E8cmLTfzYw53/33XXXQCAmTNyR7YfOHDAOmEEmBibOnbsWMHi/kwmY72rsXpnf89hz8d2cG/eXDihx975T548id7e3rKNe2gqEz8nMchidHS04Ij8qBLmBFMp2Nsxt4m8IDjHYIqdilsuws4ne3/19tsuKyAk6fHCbYysra1NuP5ns1lrjMIwDMD23lhq32y3wW2i286pU6fyFl4CuTmK4eGRkjbgjoyMuC4w5/m/9nBu+nGSgPnuYVDsncbZDzjt5PkZfvKpGvCzmbwYXuOgxfAqU2z8zxnn6tWrGB0dLRgfZPFm/Ny/5S0s8QO74t7Nj+bBizs6Olowv+FHliY8pC8a+dofftz6bM/0pZ/L3Xd/8uRJXL16FY2NjQW/Zdz8CX5h4YW7xQVygzGGYeDv/u7v8OKLLyKRSFhHCotiLX4B8NUnHwIwUeHYCztvNZmMgn/vl2cHliELt/Tm3QtuH9ywp4N9so8tPjBNE0s+O3ECjdtuyGLc9fkZeSeNOAf27E6BV9lhnD59GoZhuDZsH/o1/g5KGYtHfvWph6UeBWzHXk+BiUUHzDFsbW21HKtlX5i4827NmjUAck4di+/8/76v3GTF379/P5LJpHUMs5MP/1ruPnD7hLZpmlZ7AUycMsKz2y1MFBkyAOCrf/BQ3iA1SxeRwdAv/NsHPR3zs2f5O0n94KfMF4OXVqZpWguXgNyzZzIZq0101uEln5uaV2ZqamqsZ+ZN0MTjcXR0dGDLli0YHR1FKpVyTYf7fnkuV6fTXh7sN3d9foavNoctSGALXuzys9ms5XivXr3aGnS/+RMTk0ysrRKZzJk/f76VB/byZT8yH8gdu2+aJlb8Si492G5qls7vvMPZZguxOuYW/ss/+Gje36lUCjd/wrQWPLH6wHZ6O3nttdew5OdtgNsL8T333MMNZ7DyyJtQCFoPnO1Vqfhtd9zaT5U6RVnxyxNXs7Eyxu6Fv379uucAOLuq5fP/ekXBdyx92RHfzc3NMAzDWqQbi8UK+mde+6nqub3g6WQvoc5y4+eEIQZbZObcsRrGM1YzstO71AUIzvbMuTiEffZaFPmxx4vfdepln58BAyd3/ZJ7P9vY2CjlykAgdw2AYRi450uzLXuK7fa2Y29/7Wlt91VEB1Nl+LK8+J/6rWXcuDd/wuQO9ovoVOlrOxcssP8f/NUFeXGc75XMd2SbBHiLTVi/wiZ9mR6eH+C2ENcebl8Q+8s/+AXXZz148GDRRR979+7Fp38nt4nmrbfeQnNzMxKJBIaHh9Hb24uTJ09iz549ePPNNz3l8NI1m83iypUrGB4etq70u3LlCm7+hIklS5Zg0aJFePHFFwHk+zR2vyaTyWDFL9+Ud8oC++7KlSu+dlu60dXVhbkfTVl/F6uTvGf0WmhApWzbYWnrV9+ePXus62Gz2Sxqa2tRV1eHrq6uvHfjZDKJVColJFsmpejk1VX7NUMiYzWMYu92wMQ4jFtdf/BrC/J+45xwaW9vRzKZxOnTp/Mmi9i7Cqsn9v9ZHNYGs40kbCylp6cH7733Xt5kmegCTUpjIZrSkeEHAPwNeLLGgJqbm/N2RI+MjODcuXPcRfHlLlNB9Xn5cjLSr1TsvoTXWLzfdwivcZ26ujqr7WPXhvh5RygVVekqS66bHOYzXbt2TYoehj1vir1buI2t3PwJM2/uyg/Ozbaf/f5y7mILgO8b+0lvFn/hJ/nfHzhwoOCkEQBY8tnJOHz4MP9HDpxl2+7bsr6VPdOdn59esKCct8iC/b/05+PX9lsTNm7cmDfm62ZHsXDRuHv37i2o73d/cZb1HPb/2dg4wzRNzHhgKO/3LL3ZKWfORRNU/AOVdshui/bv38+9Zt2ur5QNy/YTgHiw8XW3+UjeGLbXog1nGCtXzoVcvPTjtVHOeTHn3BnPBirlTwRKNge1RdqiEZa5iZ6JAQT2AjU+Po7UsP8dzmmXMQheuFtcANZujtHR0byVhqXA7mADgMGe3NFUbvLYlTXAxIqsIIwN0lnx55XeDHvnbA+zYz+u2tq9NMy/0sSPA8x0poYnVsMWO2nEz7O4wexz5g1b5OG2As/rxIl0Om0NDgHAKy8U3tEsgle62eup22/Z70f6k9ZntmLSubPPnt/jQxMDJPaFBDzGEvyynRzKcu9l59ld7Fn8YJfR19dX8kTwYHyk6AkIxWQP9owqW0EfpMwzeOltmiZuvvlm6zOQa3ft1/3Ynzs1nLUG/Nva2rjPa5q5e/l27NiBjo4Oa6Ehu1ud3avqbIeTQxlfbT1vxzMrz6lh7zxiDhBzetjk+OjoKPr6+rB161ZcvHgxbwDFMAy8+uqrSI/mfrdgwQL8j//xP/Lksr6j2A4zlgesPrIdavb2hw0ejP88PS5cuJDXJrrtnhapY27hWzbmdl2wNI7FYkiPTvgE7DnZ5Le9XTQMA7FYDL1duWPZRe/1ZnhN1tjrQZBFH0Hx03bJPsFMRnvJw97uu9XnYrz9xvaCMKdMnhz7pK5b3Vf13F6o0skWb9ppaGgI5RmrGdnpXeqiEWe/zvwzu29mv6qCwfoCwzAwOpAukOGX22+/3ffJT/l9deE9uYyrV69K3zWZHAre1jvT2plnfnf9y/BlefFHBvinLqRH+W2niE7ZvrYd+8Ime1vP3hGcg9Zs8NdtQNvOUO/ERIq9HvD8YXsaDQ0NcU+BY3FaW1uR6HVPE/v7HMBfwGqaJi7VT/hily5dshZRArndonv37i1arnjpyvzI4eFhmKaJZDKJnTt3Ij0KLFq0CDNmzMjTxVs0ks1mkRzO5k1AssULp06dCnT1LpBfn3inLxR7RvskgJ/4IrJF8SODpe3hfeI7ZtlzdnZ2YnBwMK+8Hz9+HOfOncMLL7wQCV/H6c8NDg4WTHAxH7
7U91a3RSHOv+3lGgBGB1NWOO89kdHV1QXTNPM2KvHksbaN1SF7WT106BCA3ESZvS6VsmikXGVYoxYZfgDA3zgTdAzINE10XLuOgYGBgkW9LS0teP/99wt0lrtMqdQnYwxNBrwxDPYe3N3dXeDr83BLp/7uIWzatAmbN2/Grl27rJOine8IMlGVrrLkuskJejKWGzLKcHrU/QoJHqlUCr29vda7XE1NDYZ6x1BTU5N30lwxnYD7HE7eHIzLjcv2fpdtFgKABbNv8f0szvSzj306r21k471uY0fOMSgWf/bs2ejo6MDZs2ctWfaFvDw7ioWLxq2vry9ob9l7tb39N00T44l8/wTIzWHZ393ZRkjmyzhPSqTiH0SxjXfbGCaqj+WN15ioPZxtDPAaWyrmK4vAG2vgzYna65rfDUKl5rt9Xr7cUKkzQHBbpC0aeemllwAAe9flVlMNDg5iy5YtuPXWWzE4OIiO4+O+O9bec/x4vHC3uHbmz5/PXURQKh+sz62udqtUW7dutT53dXUF1nfhPfc7WMuNV3o7jzpydlg85s+fb8XtPDExoFaKE1ZTU4O2Y6N5uzt4i0aYbD9lh/1u7ty5BWEAcHlP/sk1bPLW7bm9JoOvX7+eN0HYfao0B51NurJjUHmwemrn9ttvz7ObOVYN7xV2dOx75mzYHbCrBwZ9L/S5vKfXOh2B3Rm8du1adJ5I4tSpUwUdC89uXpgodhmJRAI//vGPrR10IsdD71tfX7SN8UobwzBwZPNl1/If9Ehov2Xei9d/Wrgilg2E2e121n/7d53Hk1b5aW5uzhtsmz17NpLJJP78z//cOq0kk8n4LlNX9w+61j+nHc42AcgtAGk7NsrVl81m0dHRUXDXtGmaSKVSWLt2LbLZLEZHRwsmk2pqanKLIc4ZWLVqFVf+yEjuLYotJmloaMj7/tq1a8hkMnjtH3LlgB277xykZLvZstksruyfGIg0TdMaWHBbjCFSx3jhLS0t6DyexKRJkyYW142Nofec4arTOZkSj8eRuDgZb7zxhutv/PaFhmHgQx/6UF6YjHrgPA6/FPy0XTKuM7HbKKO95MHKmdvLAiubXgsn42cmfsvaX7s8584JHm4v/qqe2wsvnUEWFPNesN5///1QnrGaUZHepbQn9vbMPolun4jn+eT2etLwXje6u7vzyhbPlsbGRkyfPt36e/78+Xl+IA9e+KJFi9B2tLCftOs2DMNaHBoEpqPlcOnHxDKcfcepU6fwwQcfWGm7bds2X3Lc+lmR/N+7rq5gsvH0O62+7Payw0tfUIrJcLbxjfsmBr2di0bsg7+86/BYvKNbmri/4aVJsQE4px0HNlxwfRb2/r9+/XqkUin89V//NTKZjLXYNx6Po66uDgdtMtjuMHZdKlvUUeydmJeuzGb7O0l/fz96zxncd1H7aS32fvvqgYQ1aQBMTGizdEilUgWyisEmM3rPGb6Pu+c9I8vTrq6ugt2+1Mo2MJGPx7d4LwLKZrPWeAL7DXtn4k3esDwbHR3F3nV1gU6AKYVS0o/ZfPz48bx3AFaeWJiIv+42QeVnITP7m7U5br4kqwes33NbLOJcNOJsf3jvD+zvdDotPF5ZrjKsUYtoHrjF55XdoO++w8PDeP2fcrvb2QkUQG7HO2tzzp07l7cYsdxlKqg+r75WxtiBDOx2lDoGYU+nxsZGS86+9blrhs6dO5e3ee/8zuDzGW4US1feTnw/8wSy8osnxzCMvOvLAHlzNPa8KXVRiteYG4/h4WG0trbmbSY4vuVqwcJs5yIK++di6W2aprXZrsfl0GznopGamhosWrQIF/f4P6mdvUs533edfXA2m0X7sTGhNG4/Nm5t0Nu0aRPq6+utdBkfH4dpmtYpTCJjp17P4kYymbQWtrNnaDnMX41z7cBgQVjsjIlJkybh/vvvB+B++gPzyXa/fDb0eUhAbZ9S7jbeSx+vXLINBH4XjbD3Tec7H/u+p6enIN95MkXm8NmYBBu/5vm5kyZNwm233Yb58+db8t3Gqxil5rt9HIldca7ySjE7lHzqoLZIWzTi3FGTzWbR2NhoTbgmx5NFTx1QASvkbHd6qeQN0v+8DFfK/Y2ycF694rVDwx7HOWjs5+WeJ8c+sMJ+w7ueppgcPxR7Nt6qOj/6ZdQRtprx+vXr1pUUfrEPehSzmQ0c2n+TG3B0P8bq0qVLuH79em5y/+cnHbBBdrbClXU27KhdFau4vbh+/ToSiQS6u7tx9OhRPPvss75/a5oItGgkF6Hw/nWGyJHqxSh1h+DYWOFVO7wFWpYD29JSED+ZTGJwcBCJRALLly/HjBkzrOedMmWKVYdPnjyJjo4ObN26FYODg7j77ruxevVq7uIP629jIszPBIAdwzCsq2ZYubSnUyqVwjvvvJM3wGkfDKytrS1weuyyncd7O3H2UWwR2aZNmwAAO3bsQCaTwfjP8+DChdxkg/OFmvW72WwWicFEXl1mC77YoL3se7PZqmb7IiI2uCpyTZNp5hbwZDIZ7sIJr8lEkTaj1PbFftygjDaKtwtZFjIWr9rxasPc2mz2m+PHj/uSx+qdc4DCbYIZmOi3yt1nuGH3iS5dukTiZVtDF9Y2B5Vh72uAib7HebWHs57YJ11PnDhRcCcuO3r33nvvFbbJbovbxB77noWbpokTJ07gL//yL4X0OfHzLkKF1atXC8Xv7OxEX18fEolEUd+TSrvIw97Ou5Vhe5m1L1qwDwSzuAynX+q2uIThp/7lTdp7FCX2TBcuXLD6s4MHD3ouknCOpdjp7+9HLBazTifwYyfAv6agtbWVWy+c7x65Z833X50bBpy7K/1g38HoXBwtCvOb7ROVKhgdHfU8ZtovftqfVCqFw4cP56V7JpOx3jGK1XWvTSMUcBsUBnLvfbJ0OP+2bwByjnE46yXvHY73O951NLz/2fPajxV382tLOWlEo3Gi2tcxTRM7duywFrgB/z977x0nVXnvj7/P1N2d7TvbKwsssJRd6b0oAoogoqgYG1iuJfl+c+9NL/rTmJue3PtNbnpMTIwlsYslogEVLIh0QZq0pW7vbWbO74/x8+xznvOcM2dmziybhPfrxYudM2eeXj79E44Y9dhjj0lTtv8jYCjTSdHCzEmDx7p16/DEE0+YvtPbZ11+YzfsuHcTgUceeUSzXsiB2k5YncN4EQwGEQgENLSdzNhChKqqujYaGdJoziODbWZkNBLtfbhv3z7Gw/IG0TwvLLZNI0s2gKqqOHbsWNgJsKkJfX19bKxaWloQCoXw1ltv4ejRoxp+wii6czxobW3VOSApsKZ7UlUV+FTmEGlsSQ9y6lQ4yt3777+PY8eOobe3Nyq57gVEBzEdkxFvawVm+iNy3o64XmAsi+V1JbTW6dz+05/+pCtfVVW43W7Duki/EQ/6+vp05X/44YdQVdX2lGL/CrCdI/nCn69if/f09DBmvvJq64Sr0buy51bKVRSFGY2Il8GJEyfw4YcfmnplkDKb8NnfXQbAPsY2EqiPqqomVLEUTVvMILNC5Z8bYfxNA96LsRDtDocDU+7I1
nlY8mXybZD1xchTzahPC784TNoWUZhACIVChjkGxbKj2TMy9Pb2YsOGDfjlL3+p+47fp6qqoqGhQSekJQLxkv8sl5bPK8l37drFjAPm/3spK0f0iG5ubkZrayv6+vpw7tw5LPpyOP+6KOytvTVNatSw5CsjWZtlfYkVfBlkfbh9+3amtLaKz/3uMulFHo0l+tqfztMIbvmLzWoILyPwa+rVV1/VGGxRCMpImH6XX/eMZyb4vdLT04Pf//73+PDDDzW5Hi9am46enh4kJyfD5/PpBIj0Llk3Z2ZmIiUlhVn20/sygnX+50vwl7/8RVOeqICm9vHgwz1PuSMbhw+HvVNJuM0bdIne2KSMIC9AEtKLBF53dzcqrzZOQRAMBuH3+5nlLYFnmFVVxaQ7MjXfk7A8LS1N089QKITKq1Vdrm3eM5TuN+qvbD8Z7THZ85ycHNSuSdP0X1EU6XnGh0UU9834m5LR29uLYDCIXbsMXBIi4MSJE6x+Hnxb7BCqxVoGP37f+c534t7fRuDPdTvOy8cee0z3bP7ni6X3rjg2snD/BNkaIYa4p6dHpzjkGZCkpCQ89thjUFUVBw8eRE9Pj4aZtqPf0aJy5UB/1q9frxGs2o2Ghobz0sd/Zdg93vx9HA1k+4Y3/KXPPF0h7tOlXx+lK0M8jzZt2sTaaRVG4ZEVRcGk2zNNjUaIxhTzRFvFG2+8gcbGRmZgMOveApw+fRp1dXUxlQcY0+V79+5l9+ypU6fw1ltvmZYjrp1QKISbfzyTpVIVozfIxpzKOHDgAAuRfN13Jhm2W8bfRLOG7aa1ech4LFVVsfALw9jzaNLTUHkOhwM3/3gWkwPw+yBWHot4ppKSEtz72yW6744ePYotW7ZIhf1kCMwLAyuvVtHe3s4Ed0b485//jO7ubmmYY9m4ihER+PpkDhVGRiMX/0e5xvCEzwkfq6Ebre/Kq1VpKlIZZH2kdWD1/WjKFtHV1WWqvLJSBuNx14RpdSOZTjAYhNvt1kV44Q3TefDP4tmnbW1tOgcgK4ilTr5vZohnnxLMnAgodDV5By/49zLDcvg72sg4ROS3eKMRM8Ub/38gEEBdXV1UivdEns8XMHiIdg6M3pets3jligBQsqwX69evBxD2epY5UJ45cwYvvviiafsShXjrM5M/2zF+diCSDIPug2eeecawDBonVQ2nMz5w4ABef/11/N/fXyF9P39x/JH6jBDtuHZ1dSX07rBSDk+LNjQ0oLu721C+Hy3s2DNGNL8RAoGAxlhRURSs/sFUjbGFTGFMRhRUpxn4341cZdw2qqe/v5/RpGv/31zLffnCn69CS0sLo8Flxi/09+Q7stg7otG2bG+NuzFJw5MSH0LvNjc34+zZs9ixYwdu+tFM9t7LL7+saV80fTGDyDtVrNTSw/R87v8t0v2O5NFWDGVUVcWMu3OZHL6rqws7duxIqFzLCIm8Uwb7jI+mPn49ioZPjz/+uOlvN27cCAB46qmnAIDJDMRyZZ95RForlVerSE1N1TxraWnRlMuvWVVVkZOTo5PvAnq9VLTz/uijjzIdUVtbG+O3iIcfDAwlmjretthuNPLHr20AoFeW171h/eIyelf2PFK5tACNjEZeeOEFvPXWW6aeuKR4amtrQ39/P7668n/Zd3Z6/huB7yOFxDpfsDLeBDOhhuw3B9cNeHlEazRC3hsfPdPGLjfeo4MHWa/J+qKqqib0toiSkhJN+957RE4gmhmNyLyyDh06pCM2o9kzZjh37pxufdM+BbT50fgDndr+3iNaAXtzczNTzPGCxN7eXjz77LP48NGzutzURgLwd/+gD6Xd2tqK/S+Ew6v19PSwqAnHjh3DI18Np+Ugq0WxL7FCVsauXbuiFoY+dv/bcUcaefa7H2gEshS5wcpvI0FcU9u2bcPmzZvxySef4PXXX7cUZvrt3x7VPZNZKquqyow1kpOTkZSUxL775OU+eL1eZGdna7yveKHb8OHDkZ2djby8PEyYMIGdfbxwu62tDdu3b2fl7t+/H1v/fI4pqmi8eK9GI0Oy1157jX2391m9t+emTZukZ1goFEJnZycOHjyIY8eOYc+ePWwsDh48yN4j5oXmQJYGwEj4TcYgQFjp/sGfwvvZ5XJp3iPiiBdW1r2h6MInB4NB5OXlaX5D4dBle8Fojxk93/9Cl2asqB0ixowZw/4WideD63rQ3t4eV4QwI+F3vGcrKXfMBNFWII4fEfV2g1dM2HFeEgPA48M/a2kTIyZENIjiwc8LvU8RZQ4eNE7bpaoqkpOTmUHZhg0b0Nvbq9l/dvQ7Whx4sVtjtJloQ+Pz0cd/Zdg93rEqYMXzjDcMoc8ypTov9HzrN0djbrcZvS6e3ydPnmThUvc+0254fqqqqvltNLm5Ca2trXj00UdZ+3Y8Gb+QwOzu4OmXSO2ltUNRxHp7e/HjfwsLc06fPs0EjGfPnsW2bdukEUjE9acoCl79n72G7ZYZMUSzhq2829nZaXpnG5Uh8o60hrf8caDNsvQ0MoE2D0VR8MIPtuvKUBTFkAeMhFAohOPHj6OnpwdPPLBZs2eDwSD+9re/ae4eGSi1IBCem+3bt5saVDY3NzOacffu3SxEMUE2rtQunv9LSkpC3Rt640txPHlaceufz2joczLmsprn3gx1byiWhXeyPvLtEoXXdq9tGXieyaiMzs5OZmxCY7v/+bChtyyS5bp16xAIBFiKR9FYR/SUFYWt8dxLn3zyCX72s59F/btY6lQUxRLNGyu9bqRMNfseAF7+SXgd0b4xomWjMRrhzzSZjEBmNBIMBqOKEsjPQUtLS0wKzAs05PlHtHPAv887QMr4UzvkinVvKMxR5cyZM9LoWKFQiEXuGew1lcj67JLLxgu+HUZyDofDYfhdf38/G6ddu3YxOdC+ffvwwzvkEUcS2fdoy37ttdcspQ63q81Wyjl06JBtURfsWMN1b0SOmsEjEAjoIo28/ONd7O7idWgy5x2qU4TRnXvstciKaYo0oigKnvveh2hoaEBzc7MmNRYPihT9Xzf/CadPn2YOZ2aRRvY+087OypaWFqnuim/ToZd6NTyOx+Nh8mi/3w+Hw8GicT773a1SOiMe2rS3txdPPvmkrn2EQ+t6pbLqbY/Xa/oBhOfLCg1Pup6df9HKPp1OZ1wy2ljxz3TGR1Mfz28wfcWnxs5WnWvMIl3Gq2MCwPhLvjwxyrrouMTzOvxaJINuQizzTnX/9Kc/ZU7YdkdYN8NQoqnjbYvtRiPnjrUA0Huo9bVYL8PoXdnzSOVGMhoBwgrNQCDAjEPE3xN+8pOf4LnnnkNXw+AekNGMXaIRTVuMlOdGebW6m+QRQvh3jEBETXdTkAneZcw5X45RX6xY2NH/7ef6pL8RD0zxfxEnT57U5deyY96JsROtzWmfUptycnJ07aR5oD5SWR999BG6urrw+uuvh9vZpx2D9vo+3f4n4yqRyKSyxXHtbgzvsa6ugdx8f/jDH9B6pgeqqmrCvPF9iRVGZZCgz6rnVcPxNrbmjIQ9kdZXY12HztvPrLxoIK6pnp4e9PT0MELE
jJg4d+4cOjs7petSlp5GVcOpZioqKlBeXg6v1wsg3P/u5pBOkMb3NxAIoK+vD6dOnUJnZyfq6upw4MABlm6JH5f+/n4cP36cneGd9f2sDbyipbOzE3v37tWN63vvvac7k7oag+wd8uoUBX5HjhxBT08P9u3bh87OThaJpKGhgRHYvACHmMm+Fv0aoHVvdEeR1e7p06fhdDqh9ITHcuTIkZr3xOg8oVBIU58ofObrJhzcqfdsM9ofRs/pHKayjdaNjAAm9DSF54SYxq1bt0pTmxBkKbWIQBTL5ttilUjm3+MNleKBOH7iOpTRJLGAT+Vjx3kpQ0d9PzMoFJXT/P+i1zP/nZV5aWlpYeuGPy+oXqq7r69Ps74S1W8ztJ7uxtmzZ9Hf34+mpiZbwzXLDIjPRx//lWH3eIt3m1Xw+4b3oOL3RKRII0f3Wg+BHCmNotH9RiB6iu4Joz7ziodIXjQyKIqiEQKeO9oa4ReRYUaX88LFSPNIa+fs2bN44YUXEAqF0N00wFuqqoqzZ8+is7MTZ86c0dDCfBmiscXZI3JjFaN2R7OGZe8GAgGNMdz//M//MEOYaOoTBbM0hjyPEE2kEYKiKGg62SF9TzYmVo1GgDCt13C8TWOUTPvq0KFDEcsh9LXAUAguoqmpCYFAAB0dHXj22WdZNC3ZuIrCXP6elaWuIfqWX7+hUAgd5/p1RiMOh8NypAgZZHc+YB7CW9ZHRVGYYeb+/fs1fY53bcvqEh2FeKMPozK6uro0KXjWr1+P7qaQ4Tl67NgxpryRGeuIBtliZMN47iV+/UfjQRptnbQfZQbIIuyQhcjOZKO9bkSHimWIxiE8f8U/N4s0IosaJCvDCvg5OH36dExObRdoyPOPaOeAf5+PjC1b83bsJb4MkgPKoKoq6uvrTfuzd+9ejQLUDsS7hs0UqENFHh+pHZHOjd/85jdsnPg7MxAI4Ng+uawxkX2PVHZDQ4NGKWolpYaVcq1CVo4ot47HeFYEzU08ys1o+x4MBtHf38/G1eFwoPlklyEPKTMaoTrNHHzcbnfE9lH0ohMnTrD2NJ3sgMfjQWtrqzRlNQAWRfLU4Qb09fXpjGtlfAPxXVZ5756mENszo0aNwuHDh6EoCv74xz8C0BqJNtV1aJxc29ra0N3dHRdt2tjYqEndITvje3t70dTUpOGrOur1TqHECxjtJfo90aFdjdqxojE8c+YMDhw4YLlP8SIRdIoRT5JoxFofyftFwwqClfNIfId3zo30G9k7JSUlpny1qGMk3Y1YplH9x/dHNhTk6+zr62NnKBnFmZWfCAwlmjretthiNMJfHuXjwt7LvFc5ACTnWS/P6F3ZcyvlknCDlHaBQCB8UXyq0Dt79ixSUlLwwgsvsN+Qxw9dAoQTJ05E1Rc7QPXZpUCKB5H63t7errlMZMJjI6vT1EJj7wsrcDgcSC92GxqNiHXL+mIkdDRCdnmStJ2idR2BPvf09DBvyyeeeEJ62dqxzsgDoL6+XmMQQPuU2pSUlKQbn1AohN7eXmSWeNDQ0MAsdvft24fu7m6EQiEcO3aMhSynccos8Rq2h1eOA0BOuTyqi69QbuyTnKeNkiD2JVZEKsNIsCOieEw2W3NbtmzBm2++qXtHZhTFo7AqU0PQEn71q19ZaoMZxDX1+uuvIxQKMQLcbL/t2LEDp06d0pRBIbb5vaaqqua51+vVCDWDwSCScsMhqelsbmtrY4Z99fX1CAaDLPRdZ2cn89rlDQNoDPfs2YO+vj52ZmeWetj3vBEFCW6JWNm/fz9LqSSWSeeICH7uXnnlFfT29uK9996D0+lkxlOBQAC7du3C4cOHNd7G1dXVbA5UVZtbmxAMBqXrg/+sKApyKlLYePCgiCQUgSQUCrH6+PdFjwEegeTwOPJGbEb7w+h5WqELqqpq+i87z3ihptjnlAKtp8GWLVtYflIZ/v73v+tyFBoJ5fm2WL1jZFbcvb29LNRfLIh07lB433jBKyzsOC9lyCrz4qmnntIZgFkBf76L4Nc4rRHymKd75KWXXtIZjWzfvp3tbyBx/TYD9efcuXO2W7b/4he/0D3LH54mefMCEoVErKlYmFlx3/BGI/xnUSHMn7mx0ptW2mv0juyeZQbZ7e3o7+9nxhKqqkbNA4kGCHbQ1GZlkLCQv8M//vhjZixGxm1A+J4l+vzEiRMIBALILPVqypEZX/DGBbT++Dp73HKjEWq3mAIlmjUse/fXv/615p4KhUKmZ51RfTK+UVVVZFcks3e2bt2qmVMxzLSMrnY4HMgfkS4VfEezHoz41vwR6ZqoHxs2bIhaeRBNO8iTEgjfg2RIy48r0cJGDhp8fUR3q6qqSztDZeQMS9E8ozQpJEim55SyIBqIfRdlLjxka0cca/6MiHdtR8KpU6c0huFmZZAAkwx4U4uc2Lx5MxRFwccff6yjVSnSiMPhYOfAvn37UF9fj7a2Nqxbtw6AVuBPY1E2NjfqfOcyGClmZIhl/CJF4yHYdTeJn428wmX3KTDQXvLspLE3izQiU7a1tbWhvr5ew4OJshHiN2RnaSAQkDp4DIYs5AISj2jnINK5wyPRNBCPrq4u/PznPzdt34YNG9DQ0IC6ujo0NDSYGg1aRbxr2Oz+Hmz5vxEitUOUIzU0NOCtt95Ce3s73nzzTaiqysaJXyNtbW1R6WDsgpWyeTlGJDlqNOVagVk5iVBElo/Lw1//+ld861vfirmM5Dzg2WeftRyBQBZpJHd4qmF6GpkMUTZOIu3M6NB883eBcHRWp9OJbdu2IX94Gvr6+pCUlIRx48Yx+qS9vR11dXXsXgaA1MKwTNbjCcuDZVEJ6T5OL3ZHJbNKKdBHbxb5FqqrcFSmRi+2c+dOHDx40FbaVFS00xyQjonmKassSfOZeOJgMIi6ujqcPn3asA6iczJLPBojWPq/qakprrSv0SKRdMr50vHaDSvno7h2xDQyPEieL9v35NhqxFfTu8S/MjlIIMB4KFFeQwgEAtiwYQP6+vpQ3x2WpZATuxlaW1tx9uxZDQ1tpJtNJIYSTR1vW2wxGvnwww+ZUvWKz04BEJ6YzMxMjBs3DgCQN9X6ZWr0rux5pHLPnj3LvLPfeecd9Pb24tvf/jYOHDjAhBOkaGtoaGCRBeg7OnQJ7e3tUfXFDgx2fWaw2hZeeSMeXCIBQd+XznbHZbHrcDgw4pI0Q6MRsW6rfWlubmYey2L7xlymV/zy9YgHEwmGDh8+zIRH586dk4aNtmPeeQbse9/7Hrq6utDf38/2KbVRFBTS/6dPn0bZ3LAgm6wBxXdI8EJjMmpxZsR2kQFKzYoC6felc8KEXlJSEoYNG8bqypuqslzPBL4vsSJSGbxg0AyXrB0Hh8PBIq3I8lVHIkxdwxvR2NiI06dP236xydbUu+++y8LY8nnWZWhtbdWUQSG2RYF9W9tAmiiRUN+4cSPypoRw5MgRHD9+HF1dXXjuueegKAP5V59//nl4PB7G7FDZtK/5c4PW31tvvQUASB3XyX7T3d3NvF0IZ86cYftQ9MSkMkcuTJOG3OPrVZSBCFYul4u1lc4
KozWTNzXcfmJmqFxqj0z5oSgKPB4P8vLyoKqq4R6jM4WMNkOhEKuP728gEEBDQ4Nc8Dgv/FveAMNofxg9pzL4cTc7zwoKCsIMam4ue1Y8M2z4QjlNeWaTwH/u6+uTemPX19fr9qFdd2pTU5Muuks0EMePF4oA1o3VosElt41LiJV19dJw2hlRmMMzB/xnHvQdPy9mbaRzkfeqE41GKLIRwY57IlpQf2Qp6RIB5whrEbEuwB7Yuab279+Prq6umPYmv2945bhoNMILyvh75tSpUzGfiVajsMkwbEGKodFIeno6+vv7NWc3v9+NwEeNFOl1O859szKIln744YdZv1pbW3Hu3Dn09PTg9ddfZ/dqxbxkDV8RCAQwekmWphwRqqpq7ghaf++88w47/yLxz+J8RbOGZe+KKQ2DwaDOeDOa+kTB0bilftTX1+PEiROaHMxmRiOike2M1ZWa92hsY1kPvDF3aWkpqpfmaL7fs2ePlJ8zQyztEI0M+HEl+YVIw/b19aGnp4fVR/Q5r/Tetm0bMyKhZ7UrCnWGJLJII7F4Gsr6HggE8Pe//133XLZ2RFqZb0+8azsSPvjgA42MyKgMGq++vj4mmC+fO+BY1d3djY6ODrzzzjuMFuKNRoiffOutt3D27FkcPXoUDQ0NqK+vx6uvvqoxylcUBYvvrNHwyNEg0v1j9H0049ff3x8VbRvruS1rK79ejPapWB95PcvK5M8DmdEIH8mH9gvxhrwsRQybzRuNiBEWjxw5whxSeJ6dn4NYafzzQSdfgBbRzoH4PsllZIrQRNNAMhzofZsZ9j/55JPYvHkzPv74Y4384cMPP8SBAweYbLClpSXmdOzRjp+YFs5M/jxU5PGR2sHLv998801s3LiRKd7oDjIap2h0MHbBStnPPPOMxojcip7ArjZTOXydZCwsGhCYQXQmM8LSeydrjCBiAbXZavq/YDDInLSAcF+nrCpjdAUfodjIaEQ23kYG1wXT9G0wU1b7xg2kxHI4HIxG/NOf/oS2tjacOXMGDQ0N6O7uRtq4LlRUVGDYsGFMTktt53mAvr4+DL84VVO3TMlM4wEAJbNcEY1GSD489+ZR7Bk/drK919zcLI0aIb5rplMDBuaAInWeOnUKiqKg+lN9FS+zJJ1KQ0ODzhmLX6dEu4xakqmL6szLFQYLiaRT/ll0vNHoUcW54+UuYhAIGXh5hSiPAgb2FMkLtm7dKt1b/DNKX9/f348jR45o9O8URZ509mZ9kp0/oVAIycnJ0t/ZjaFEU8fbFluMRnjC73/vCnsnf/LJJ+jp6WGM67F11hev0buy55HKpdB5DocDEydOZO356KOPWLqHESNGMIWjmCIkmvYlCoNdnxmstEVU6orgLxw+rOe+v/ZoLmC+PCtQFAXb/tjEBOIUypX/ni9L1hdZm3t7e3UWm/T/y9+VE3Wy/gFgxPru3buZAYcRAZmIef/Rj36E//qv/2L7lNpIBz7tZZ5o+uD3Dey7+vp6Jswg4p0IDBqTd38TOY0Klf/3//lE81tFUZCeno69T3ajsLCQEVpUx7F1CjZv3oyzZ8+it7cXR48exQ9u/mssQ6EBPx4ykGDGzJI2GAzi9/+5wZKHl9mafudXp9iFbVcaDEKkNfXuu++afv/SSy/pyiAvep4gB7TnAO09h8OBzZs345MXVGbMwef6PnLkCEpKSnDq1Cm4XC5GYJB3rEioyrD/ab0Sg0BWtKIRGLWZ9uKHjzQzYxSK2kHv0W+am8PvhEIhuFwuJuAwUo7T78zOHXEvid+npqZCVVVs+Z25wJX6FwqFcGyd1kOU/u/v72dMKU8g7nlCayj56quvGu4Po+d7nujQ9d9s7cn2w4Gn+1hfmpqaGONrxICqqsqifojl8aEcxbYMJqMj1mV27phZ/ceDr17xK7z22mu2l7v5lwP502XKI9rnMu9SivLGz4vozSD+DWiNekWjkfLyckyaNIl9H+mMj4RISo7e3t6o1nwisPH/HY1Z0HoB0SPeNcXjiSfCecRjSUPHrzMjA0uZ0Qjd2WfPno1rrYpCd6vY+Wir4T1JSla/38+idGVlZZmW19vbi7a2NrzxxhuasohWtWM/mpXR1dWlybPe2NjIhG2/+93v2Jn26quvYsvD55iA1uVyoaenBx883MAMSGRpvEiRTPjfu15iBjJEX0Xinym6CV+GVcjeJZoqEAiwdcB78KuqqklfIyujtXVgHYhGIy98W5sig/+eDzNtlNrP4XDgmf9vm9TLMJb1wBuv1tfX4y/fDEc/e+eddyJGWTFCLO345JNPNMoAflzpuWg0wvNRQJie55XawWAQLS0t6OvrY8/6+vrwyvf2aZ6RAJ43GmloaIjJ2UPW929/+9t4++23dQZOsrUj1kkeb0bvG0H27u7du/HAAw/ojNdVVZWm/jCqj3fMIHz0RKdO4XDw4EF0d3ezM8BoPHlDrQMHDmjmEAB+ee/f4HK5UFdXF5GnM0J3dzfq6+tx9OhRFnUUGEgTFg0tK+LnP/95VG2J9dyOlbYX+QPaN2J0SFLw8HcsAE36RJpbRQlHvuTloryxCSlyaF3xRiOUguro0aPo6enRrIsPP/wQQJgW5ufgr3/9KzOkP3bsGE6fPm0pUpedNM0FxIZo50B8//DhwwDkRiOJpoGM3t+6dSsefPBBfPzxx3j99dfx5JNP4nvf+x7bX/v27cOJEyewY8cO/OpXv8KRI0diSq8ERD9+ZGRpxZBtqMjjI7WDl1Vt3LiRrYPt27ezv7955e+YMY+VshPZd6tl//WvYXmvVaMRu9pM5cicyQiivLa9vV3DGzU2NuK1117Do48+GrG+B655JMaWDoDa/Morr6C9vd207UD4zuE9+BVFwYv/tTuqSCOy8Rb5Q3r/kxd0r+oiIQIDeok3fxqOMvD888+zujs6OnQRv/r7+3FsXfi+TU9P1/EKTU1N7Gx8+eWX8e5vTuuMPsyw/6k+qaGQzKj68a+9K+Xp6Ix655132LOOjg5pCuNI55nYbnEOurq60NPTgw0/PQpgIFKaqqo4tk4fNeV3v/sdVFXFY489xsqg8f3g4XqNbI6ncRwOBwKBAHNCTSQSSaf8s+h4rZyPVpx+ROdSMi7ieV2SK4h8teyMIPDPuru7dfoP+p9kC6dPn8axdQpeffVV9p0smj+BDFBFI7CysjKoqgqfz2fab7swlGjqeNtii9EIjzOnz+DEiRM4e/YsfD7foCpkjDB8+HB2aBPjy1snORwORmTzmyPSBXsBcjQ3N2uUNyJEYkPG2IiKIiveL2EB+IBAnELoiu9FgtlBK/NkM2oPoDcaIezfvx9bt27Fyy+/HLE9doLac2D/Abz66qsAwgLbjo5wzm9KgcIrLqjpMoMSQJa2wXiMSYDFCKNPX+XzGofHSits4eusq6tDIBDAd7/7XTzyyCM6IXgicO7cOZw8eZKF1vrggw+YkIeI1hdffBHHjh4z9bKMFqTcNUvLYTeMGHUjZeQvfvEL7NmzhymgaE2TcJ0Emm+88caAQis4IFwvLi4GMBAmm/Ys/e/z+Vg0KH4/GY
UWpkVFdfFnOq1r3vOLL5cZh3Gl8YoqVVVZ9B4SDFKkEQIxSKJyXPRWk50hZCBBBDqfW1QrgJT3vKqqCgA06XyAgTnljUYAwO12o62tjTHiMtDdaPaOiL7ePktnrc/nMzQQpL/6+vo0c83fyzzDZaasSSQdwisJedTV1Wk8Dvv6+iwJC4BwWHCZp208eOCBB8KGDSE1YkSheBBprK16mFLoUVl6GoKZ0YiRt36siKTo+P3vf69Zp8bnU2JBCoYL+NcEbxjC70Ve0MPTVHZEMyIltRFdbPQ8GNQr2XkvN4oy5XQ6oSgKu9cCgQD27t2LvXv34pFHwsLVgwcP4uWXX8bWrVuZUo438BxsXnTnzp3485//rOk7CV3DObDDKVOdTidcLhd6e3vh+LSvvb29jKbgaYX+/n5dBAGiN6ymRaG0Fnbj29/+Not0BoTpHVVVcerUKbz44os4ePAgtm7dir6+fqiqimeffZa9+9///d/sfSC8nnbv3i2903keTIw0IvOCVBQFKlTNe2b8qRVQO3t6etDeFh7/9evX4+zZs5bC5toJM/5HTG/ER7cLBALMqKmvrw8dHR3MaUZVVebJ9fTTT6OrqwuNjY2sb7LolI8//jicTic6OjosedJawe9+9zts374dJ06c0EW0AaDbX9Q2u0DGTm+99Rbeeecdxku89tprzMgPiCwrCgaD+OijjzR9ICMcvv0ul4tFedm+fTt27twpVSKkpITTU+bm5qKrq4vN5bPPPhvmjfp60dLSgvfee08XLTcSaD6///3vAwAeeeQR/PKXv8Rvf/tbtLW1obm5GcFgEI8//rilsZYpgWRzmQjIzvxo9n1RUZHms9/vl8qqdu/ezRRTwMCdKq4LSpdIvK0YoQQYkE329/fD6XQiFAqx+5XmkzfUItB+5XHy5El8/PHH2LRpE+rq6nDq1CnGi1zAPy94I8yhjsbGRuaAQ84dZ86cQXd3N7Zt24ZnnnmGKX8Sjb/85S8AYot0PdQgyiVIUbx582aoqoqGhgZ0dnbqPL6HOo4ePYr33nsv7igcsULGL5GSnGS/qqrioYcewrPPPotnnnkGBw4cwI4dO/Czn/1MF50NgLQvzU320FBA+N5Zv359RIcSMt4ihyUxgpkVoxGC0ZrS8KUW1x0vx6EIsjTWP/rRj9DY2Mja4vV62d/bt28P81VcpBFFUVg6zf379396R2v7YUTX8M59ot6D+kWp44hvVdUBw1Pi6RRFQUdHBwKBgMbQyKrMShx3qzSvKJMi5wpRWU98N/3bvXs3jh8/jm3btqGjvQPr1q2Dqqo4d+4co0FobXR2dlqWXQ6lc6e+vn7Q6NLBghV9pdXUVYlEKBRCS0sLTp8+LTXc6u0N8zPEz5KMk9dViRG/z549i40bN+rKOnv2LDuDae3GEy38Xw22Go088MAD8E9SsXPnTuzfvx9JSUnsMs2dZP1wMHpX9txKuQ6HAwcOHEAgEIDX68WwYcM034tpID7++GM0NDTgoYcekm6oaPpiBwa7PjNYbYuRIpD/TMYdhOLpAwIt2SFn5pFInyvn+Vi5FGmkvr4eXV1dOuMDO8bVqAxqK13SYnhRghnhm8h5z52k4v3334eqqvjNb36DxsZGTV5uHkXTwgIOXvlOYaMcDgezVKT5GD4/zXI7ci4KEyt8ipv29nYUTXczgvXo0aNMMDpxRYku56l3RCtCoRAOHz7MIgtFi8V3TIz4zm9/+1s0NDTg2Wefxcsvv4z//u//xksvvYQf//jH+Na3voWdO3cid5IqTUsiS5shg6IoyJ2k6gTPr7zyirWORICVNfXEE09oohzs378fgUCAMaGyMt544w0mvOTR2dmJYDCITZs2MetuKoMIVT5/JjCwn+m5y+VijALv/WhkzZxdMxDeGdBbyQIDRDZ5aZESnequXKBfw7zRIQ/R0KW1tdU0NDSNn9k64Q2mxM+qqmLkJZnSskmYzOfxzJ2kMsaElAJivmz+bC2Z6dXU6XA4sHDNBDQ1NSEQCGD37t0sD+Hi2y9ivyPDJlVVLd/VWVlZhmOVP0VBfn6+hpBvamrShOd/4403mPCd+vCLX/wiolCUb4sdjEtDQwO6urrwyiuv4MiRI3j99dfxyiuvoLe3F9/97ndx+PBh/PCHP0Rrays++OADZh1tdO7s3r07IQzVd7/7Xdb3eIxiZeFOR18aDtMvtjsapQU/L7TXZEo4mVJKVMCEQiFs374dPT09eOedd7D4jonMYj0RzOHZs2c1Rkz8WA8WcicNePVv3759UAwq/5VhhW6wAqv0gRH4daYoChoaGhgNTBEARG93Pky+WIbdMDrLMsfro/NQ+zo7O7Fp0yadgv+BBx7ASy+9hI8++ghPP/00Ex7W1dWhra2NGVb8+te/1vEhiaT5ZQgLDQfeJ6UwAIxZlKMxGunp6cGYxTnMaITuRaIv6LnH40FdXR06Ojp064/ox2jaLZZhFuEq0nonQz8AeOqpp9DU1MRSc61fvx4vvfQSOrM/wYMPPohdu3bh6NGjeOCBBwCEjZLb29s1a1I2ZzydKEtPIwq3FUXBxBWl7LsXX3wRzc3N+OSTT6KaS/FuIfBl/PrXv7ZcHo941mVfXx9effVVFM/w6Hhqs/pIme9wOPCrX/1K079gMIiOjg5G1+ZOCpf72GOPweFwYOfOnXA6nRoFCK3N/fv34+zZs2htbUUgEJAK5Hg62qzv3d3deOGFF/Dwww/jt7/9rWb99fT04NChQzojd3J8AKI7n2XvEj/Q0tKC9evXswhqdK9SpIkf/ehHePLJJzFrdZX0LA8EAgiFQhr+KHeSyujU9evXM3lFMBhEMBjUhSk3OkNDoRBOnz7NFKuKoqA37wT6+vpw8OBBTZo+K8YCMqVFfX09Ghoa8JOf/AQNDQ3YsWMHurq68Oyzz+LEiRN47rnndHw/Oe5QZBJ+LKJFIu4mPhWmUX1WHJZ4kNHcgQMHmNGj0+lES0sLi0jJQxSQ83vw73//O/vs94dTP5KD2+uvvw5FUfDkk08CAPt/8R0TNe2tq6vDiy++iEOHDqG7uxsulwvHjh3D3/72N8N5sIumuYDYEe0c8O+TURAZ1Inrd7BpoFjrfPvtt9HS0oLdu3fj2WefxXPPPaeJUm2GWNYwyZMiRbAYKvJ4K+3o6OiQnnPk0JRdGz4DSA4WqexEy6StgtIvWeHj7WpzNOU8+OCDCAaDOHLkCI4ePYrHH3+cORyeOXMGLS0teOWVV5gR4V//+ld8/PHHbG2/9957tu/T3bt349ixYxoHRDG1QyAQQGFhIbKyshjfWHNFERRlIBU37Y3e3l7293PPPaer0yi9Bb9/rfaRZEC5k8JRAgCtoykwQIsVFxdDVVUmwyV+gCK3kWEayaXa2tpQONWJQ4cOSY0w+EgM9D3Rw/wzOptaWlrw1FNPoaWlBcFgENNXhdPjEC1Av+kvOInW1lY2hjt37mSRJEl5TrJro/OM2ia2xWhc8ydrz7W2tjbkTlKl0bF5p09ytnzjjTeQO0nFsWPHsHnzZmzYsAFbtmyBoihoaWmBoijo6
emxlPajo6NDZ9QeDeKlU3p6enRriPDPouONRs4Za1QvGaz2h3eq5CHqP3p6ephOI3eSioKCArhcLjzxxBNMZkAOmbSGeedtKr+2thabN2+Goijs7AVg2ZkzVgwlmjretthiNMITWK3NbYwA0QhZotBPGL0re26lXN6bXMa4nTp1SnNYEgMGyBUW0fTFDgx2fWaw2paGhgbs3btXR4CLhhu8IlQNDbxjZnQiPte8GxqINMKnp6F5pOgaRn2JJCSguojYijQekYQ0Zt8nct6p7AcffDDiu+1t4bHr6elBZWUl+vr62DiMGDGCvUfPTtZZT6vQ3CS38AsFwnPQ39+vEW44lPDenTBhAoCwED4UDCsRH330UWzevFlqJBAJwX7rg817PYhW+qFg5BDqkTAY8x4Jv/71r/HAAw9g7969eOKJJ/Dtb3+bEdlGZdTV1WmUubS2Dx8+jMmTJ2vezckeYGRFa1jRiESMHkDgvSZlfTTLK0/EiijgZRbcwXD5DQ0NGkMVERUVFZpcmUVFRfD5fHC73YZ5AENBMOaCQOcTEXnNzc1obm5mAneRiKL9oStbCI/8zDPPIBQc6C+FJ6b6mpqa0NfXx8Z83bp1QCjcvvr6emzevBmdnZ14+aVX4Xa7sWfPHuzcuRMdHR14/PHH0d3Zw5Q8W7duHfCci/Gu5qEGVCZ45fsnKiG+853v4MEHH2T7/ty5c1i/fr1p2Xxb7DLO+MEPfoAtW7bgj3/8IzZv3oxTp07h8OHD6O3txZYtW9Df34/Gxka8/PLL2LhxI95//320teiV+g888ADq6+tZhBe7QX1/6KGHcPz4cezatQvPP/889uzZg6amJmzatIkJOwiyMfrTn/6kUf6EguZrMpq2GYHWvyyak6qqOH78OFPokNfHli1bsGPHDnzw/lZ8+9vfxjPPPKNhkOw0ICGlKQkwzhetGAwG8cILL2Djxo344Q9/iAceeIB5ilOUpoceegihUAj9/f1oaGi44H0aA6KhG8zAR6GKBfw6UxQFJ06c0KTK6Ozs1HmJ8Z/FMgYLdBfyoLs0MzMTEyZMYCFv6XzPzs5m+4togI8//hhtbW1wu93o6elBT08PTp8+rRGGt7a22tLHaMrweDyaVHhk+OF0OqFAQXd3N9xuN4s0AjX8Tk9PD1wuFzvDXC4Xe+52u7Ft2zY0NTXhvXe0EaMURbHEPz/55JN49tln0dXVhWB/OJ91MBhET08PXnjhBXR0dOiMs4Ho1/vPfvYz9jd5aPLtoCgxhG3btmlSKBw8eFDXH95JgPceFI1G+PcDfQH2XVtbG0sTMdjrIVFlvP/++9i1YzcefPBB09DQTMAtnBeiMTSNH5OdcO87nU7s3r0be/bswd69e1l9fX198Hg8WLduHTo7O/HGG2+gvr4ejz76KA4dOqShZ37/+9+zCDlW+37u3DlsemszWltb0d7erkl5ZATZepWlxzN6l/q/e3c4RRJ5ronOHt3d3fj444/xwnMv4oknnsDOnTsBhA3wA4EANm3apCs7FAxHCTp37hxSU1ORkpICp9OJP/3pT5r3ZApMkWdKSkpi/IbD4WDnal9fH1pbW1mUwIcffpiVsW/fPvT09OjufSv08Lp163Dy5EkcOXIEDz/8MHbu3Ikt72/F448/jqeffho//elP8a1vfQsPPfQQM2AkZVV3d7ch72aEWPcH3xdxHM0Uw2J948ePl75XW1sLRVEwbtw49oyMqXw+H7Zu3Qqn04mjR48yp7X8/Hx9fSaezSSzfPzxx+FyuZjBPp13xKcC4TX8xz/+UWoQvmHDBjQ3N+OPf/wjXC4XfvnLX2rSDhHsomkuIHZEOwf8+2QMePjwYWasxuN83Fex1CnK8Xbu3Ilvfetb+PnPf86iKv3lL39hEco+/PBDHDlyJHzu9QzwdPv27cOxY8eYkoho37Nnz2L37t1obW1FQ0MDcwp78MEHozobzhestKOlpcU04iWVIUYliUYHYxcSVbZd5drdvi1btuD5559n+/PJJ59kDpx/+9vfErJP//a3v+GVV17BAw88gE2bNuHpp59GKBRCT08PM1YFBqJKK4qCQL+exgbCDmgbNmxAf3+/5oyRyv4EnQ+LeqlGp4IMBeVGrUCYB+zr68Mnn3wSPgND4Xbm5OTg7NmzOHfunOZe5PWACgYiFIrt5c8hMqjnZap8tDD6fWdnJ06cOIGGhgb094WN/knOylKHc+P09NNP47nnnmM09PPPP4/e3l488sgjePfdd9HbmRqFsQABAABJREFU3Yeenh5WPsmZKFIIPSeDGKO1o4b051ooGJYTiykOCWfPntWkF5KV3dvbyyKJk55g06ZNpnLY5uZmTXTJaBEvnXL69Gls3rxZ+t0/m453MCNnHT9+XCrXkUEWVaSzsxMHDhxAMBhEXV0dgLBRI/EpoWDY8M7lciEjIwO9vb0sKiIQjrhHjhQEoo9JbqQoCv7617/C6/Vi06ZNaGhoQG9vb8Jk7kOJpo63La7Ir0QG73XauENBdrXeSp9/HglG78qeWynX4XAgJSWFCVH5SyeS14NMoRBNX+zAYNdnBqttsZIqgCwZaZ2c2tKPnLEe3bsk7DM6hPgD8ejbHSibmsrq59PTiAen1b7w3lRGZYgCLFpX5JkUCxI577Hsx56eHqSmpmosjOl/EnDHWrYYFeL0BwEUTdQbbB34exPylw7UO2zYMOz74yEmnC4oKMD3vvc93H///fjLX/6C/Px8pKSkwO12Y+zYsWhtbUVmZiYLd054/Q87gUmWmhyxP+MvTYkrPPNQmXdAnpIk2jL6+vpYdAhaI8c2d6FkRfh7cS3xKVn454D2XBFTT0XTPtGLj0Dt27e+HmNuGjBeobsjOTlZ4wlLigvR0EVVw9awMuV24w4FhRdp20fGK8Tg8+HPqVzeaOTQxhYULR+oz+v1ore3lzEyfJ4/2XhQGNa33noLwMBYHj9+HJ+82c5Cx7/xxhsoKSnBrnfOwF3ajrNnzyItLQ3PPvssPB4P/vbwNnRlnMSECRPg8/nw97//HaFQKKq72oigPbsthKp5ik6obRQ9gY9AEgmDdaeSouzQoUO671599VUcfExBfo1HE6o/0eD7/vvf/549l+2JkSNHapQsEyZMwIwZMwAAn3zyCR555BHU1tZi9OjR2PrsUYy8IT4jnEjzYpaTnfYKRcyis2v79u1oaWnBwacaMHJ12ODk6NGj2Lx5M86cOQNFUXDrrbciJSUFiqIgOzsbgUAAbrcbXV1d8Hg8bI02NzcjKyvL1BvtL3/5C1uL54tWfOihhwBocyzzClzCt771LWk5X/nKV/Dmm29i+/bt+PznP8/GoKenB9/73vfw2c9+Fh6PB2lpaejq6mIRjkhR5fV6dWNE9N7Bgwdx4MABXHHFFRpjQDtTCQ0WXv/DTsxcOSbucuINEcqvMxpHUtYBYQElGbM2Nzfj5MmTqKio0IRajnetHjp0iKWas4LOzk407lBwvPq4Jg0A3YXknUX3b3Z2Ntrb25lShFfckqF/RkYGWltbUV5eLq3Tjv0YTRlOpxOnTp1C
aWkp+9zZ2YmkpCR89EYzJl7ejuTkZCa43f9GM6ZUpbOQyjQGJJAkIxOv14u9e/dix7pTqLhmoL6jR49a4p9JuLhr1y5g6yi8deAooz1SU1PR3t6OP/7xj5g+fTrmzZuHP/3pT5g5cyZe//1uzLhqNAAwY5ZEjt+JEyd07/P0omgocvr0afT29mrOE0VRsP3Fk5iw1qv5raqqhm2JJrVY4w4Fl940UeetGw3sXJcPP/wwCgoKNClGCdT/xh0KqubmsDOCUqIQiI6kMWzcoWDEbGieKUo4FPXGjRvh9/vR39/P1kNfXx/cbjeLAvfnP/8ZAPC5z32OKexi6fv2F07gv1P/O+J7FL2mY2MxZq4cg+effx6nT5/GJZdcgldffRXDhw/HsWPHUFJSAofDAafTKT3Lo0011LhDwYnqEyz6BhBO25SRkSF9l/rd3d2NQ4cOQVEUtLa2ssgSgJ4+5nlXkWeiZ2E+NBsnT57Etm3bmAwjGAyira0NL7/8Mnw+H5KTk/G3v/0Nt99+O9544w2UlZVFFZ2MH5/GHQoOVMvlHURnVFdX49SpU1Eb6sa6P2KlRSPVN27cOI2XNtE6hYWFaGhoQHl5OfMC9ng86OrqQnZ2Nnp6epCdHZ6X6upqTXQDQO60FAqF4Ha70dTUhNLSUnR2dmo8l71eL3Pkefanm1F1gyrlNwAwJYnT6URjYyPeeust7NixA/fccw8z5Hn9Dzsx46rRtigaSEFoxKsPZUSKOJFIREtXGr1/4sQJ5OXlaZ4NNg1kV52EhoYGDS8j4/sPPqbg3SMvaWQ1ZHBlBWb3/1CRx0fTDr/fL40OGo2uJdo6o0Wiyrar3ES0r6mpScODnz59mq3tRO9TMjjg6586dSoAIC0tDW1tbdi8eTOaXvNjzuh8RmOLxgXifSWr0+j9+u1AxujI/UhNTUVHRwcadyhQVhqfyRTpq6mpCcff6cKwVQMyY4r0ImvHyfd6MbJywOAuEt3QuENB17Iwn0rG9bIoSEeOHEHHxl6M+owD//u//4uKigq8/fbbGD9+PLoP+LBz5060t7ejvr4ew4cPx7vvvouenh7U19dj/fr1OHPmDAoKCvC3H27BtlOv46abbsJjjz3GeGRmJCLo24zm/dyHIWSM0vclu1qFy+ViNEVXVxfa29sZHcobjcjKprXkdDrx6KOPorq6Gh999BGSk5Oxb98+tLW1Ydq0aVDVcITDRx55hNEbDz74IMaOHYt58+Zh27ZtmDRpEnw+H86cOYMzZ85gypQpCIVCOHnyJPbs2YPJkyejsLDQ0h3JO10QFEVhaYF2796NSy+9lI1jpPFLFBJd32DTMeH+hPmIeNJ0k4Pojh07WBTGxh0KJl8xDP39/ToH0nfffRf19fX47ne/q3ku6hBIRuzz+dja3blzJ1555RUsXryYRcspLy+Hw+FAdnZ2TLIOQrT0nJ20p1gW0fdmPIcZ4jIaWbduHbKysqTKBsorlogQ69HC4XCgq6uLMTA8g93S0oLi4mJT75wLsBdmITkJ4jOzdSQql2meVVXVMKuy3MCR6hWfWd3I4iX0z4Cenh5kZGRoDmDaT7zRiOJQAIT/zs7OluY0FkFRF+rr65GSkoJgoNvA8EgrLBM9amgtkNBw3759TKEuEo08Du5XMNIGoxEgPqVpmDgL940I5X90DBs2jBlPDBhfGb8vGo2E39dGGvF4POju7kZaWhr6+/tjii4jAwv3FwxBVQesz+lMSU1N1Qgi+FyZBN4Qzgjid0QQiUYjBGJM2PkjFF1RUYH9+/czj8hocfDgQQQCAWRnZ8PxqSfpsGHD0NDQ8KkCeMBoLhQK4cSJE6ioqEBdXR1Ongzggw8+QHZ2Nrq7uy2t/4KCAnYni2OlKAqKiopwSD3DIqHw+EeKiMCH6zfCYBqMRAvRK3fXrl2aSEunT5/G6dOnP02fdX4ErATxviDFBIvoIyxLUqqpqqoxnjHD//t//y/iO9EYLw1V8EyXyIABcgOUaGHFUzxe0N0PAHl5eQiFQkhPT0dDQwMyMzNx/PhxZGVlobi4GOnp6aivr0dhYSEOHTqEnJwcFqZXVVX4fD44HA4kJyejqakJ3d3dOHOmHm+++Sb27duHqqoqBAIBpKenY8+ePcjPz0d6+oABQGdnJ/M4Ki4uZgrv1tZWlh7NDtBdtGPHDlRVVbHnzc3NmntKZtAYL6Lho8LKSTkdB4AZKZGQ0ul0orS0FFlZWYY57knhOVT4OUVR8NFHH+HEiRPweDxwOp3o6OgIh+9V2tHe3o6kpCT09fWF18mn0Uj6+vrw7rvvsqhAu3fvRltbG0sxkJSU9GnUg+g89mU4eOAARk4eOBw7OjpYmpX33nuPrc2nnnoKBw8olqITxgqig8wQCAQ0KZd4oxHi8Xi6oqmpCW1trQiF/Gx92C2biMdgJBEwGkNZBE9ZikKdkwR3tRNP7fF4mDCZzlg+tWRSUhL+8Ic/aM6cX/ziF0hOTkZJSQnzIrMT4ryePnWK8YIA8Nhjj8Hn82H//v3McDozMxMtLS2oO5GCn/3sZ4wHKC8vN03VFA3McmYXFRWhvb0d+/fvZ8Zl/JiJ8g3eaITnlWTGJYRQKIS3334biqJg48aNzEuPIlK9/PLL6OvrQ0NDg8bg3G786Ec/AgCpEU0iINvj0QhjeZmS2Tt03uTk5KCxsZF9zs/PZ2nG8vLycPToUZ3BD18+GffwjkrBYJAZhqiqivb2djidTgQCATgcDni9XrS3t8PtdqOzsxOtrf0RaVBaGzTX3/nOd9h3B/drz/jc3FzU19ejuLiYrT2v14ukpCS0tbUhKSkJH3/8MUaNGmVriHMew4YNw5EjR1BVVRWXI1YiUFBQgJycHBYdrLOzE36/Hz6fD42NjVAUBS6XC8XFxejs7ERzczPS09Ph8Xjg8XiQlJTEIsGRrKGpsQnvvPMOM8ju6+uDw+FgyoyOjg7k5eXh3LlzSElJQUtLC7Zv364JuU4YCjL48wEx/W80htmJuJvOFyhF5QVcQCRs3boVubm5yMvLQ1ZWFvbs2YPGhkYoSgGA8FpqamrS0JdGDsK8g50YaSRa8CmsW1tbsWjRIuzYsQPnzp3D8OHDdRECNm/ejEB/eL/zBqpZWVnMoVNsR1VVFV544QXTPvGgOunOP3PmjNQ4MhQMQfk0QjqdQbt370Z3j1MjHywsLMSmTZs0NOCCBQs+jSJ3GrW1o7FhwwadPiU9Pd2yoW+4ffJz0OFwYP369SgsLByIAvjpOPBGI2Z4//33AYRpmzNnzmDChAmMzqbID5MnT9aV99FHH7Foq6Is5LXXXtN8JrnRwf2KhraPlafjaZ8LGHpIS0tDe3s78vPzcerUKbg9bqlRRXNzs26tRAKvXwvLsAdSn0UCL6vw+/3o7OxEXl4eent70dnZifb2dmRkZMDv9+PMmQb86Ec/YuniiJ6WyamSkpJs02fJINL3gD6qmxniMhqZMGGCxlMNACquDG/atLQ0jeUdPbcCo3dlz62UywtRJ06cqPNoHDFiRDiMlBBmK9r2JQqDXZ8ZYmmLqqrMQ5c
Odj5EGb0zdnUSAGs5K3nw8zn1Dr8m9DZ/iRutVSMUFRXh1KlTpkLMRM7NUCi7srIS/VeGrfJIYA1ohSnkqUXzOfueIpxpCjNdkQxGZO1QFAUVV6oaJo8MKOb/3zLsO9QCVVWRnZ2NUCikKUOWkkS0dCwrK8Px48dRW1uLI0eOoLKyEiV5zag7dzTScERExZWqJod0rGUkCnaUbVQGEc8iyBuqp6eHrZFpd+Si7lzYS81IMGcWaYTKJAULEGYs+vv7Mf7GVPSErBG5Rqi4MnxG+f1+FkmENzgkoR4pK5xOJ5xOJ9sPfFsJ5J028prweyQ454Vtvb29qKysNIyoQEzYiKsd6AoM1EF7RebBJ5sv8rqhMaPfNzU1YdEXRuHwif1ITk5GMBiE2+1GxXIV48ePx8GDB1FUVITe3l709vZi7OpwBC8SYAHhuam4UtUwaNSOlJQUeL1eNnY88ScSgZVXya2UKfR3PEj0nUqE7vlux2DWmcizJVZ4PB709fUhKysLFVdGF/2JiHsZEV9RUYHi4mLmuVldXa0LWQ/8a9OKwIBCzgiiYWROTg7GjRvHlBkXX3wxQqEQNm7cyM6Y8vJy5OfnY/fu3WEjt08NOUaOHIkjR44gIyMDPp8PJ0+exNSpU9HX18fq6O3thc/nQ3FxMfr7+3Hu3Dl4vV74/X50d3dj8uTJaGxsRFlZGTIzM+FyudDd3c1yOZMAnz4vnOtGH7pQUVEBRVHg8/nC519FBdxuN7sb+vv74fF40NbWhvT0dPT39yMrKwt1dXUYP3583EYj/Lzz56g49mYKs0SuHVFpSvcP1SmGxwbCd+yxY8c00UvcbrdpRBqiIYwUBOfrjGpra0Nubi7cbjdOnjwJr9eLjDlNeP/991FTU8OUQ0u+MgqtXQ2ayA+9vb2M7j5x4gS6urrYPX/RLRlo7rTG10T7nJCZmYnMzEwcPXoUl31lFEaNH46XXnopqv5bra+kpAQ+nw979+5l/Jf4Pq1pRVGYMU1aWhoqKirYO6qqYsuWLQgEAti5cycKF4dpGz6Kjcg7yMALZGWK/4KCAvRfqY/oES3sXpc+n0/H7wID+5/oW0AuIBefjbx6gL+j/UfRpIABQ176rqOjQ2fonZOTg4KCAmRmZiI1NZUp5uyQDUXzfl5eHv7yl7/giiuuQEdHBzZu3IhVq1bh72lvY/nyy3Do0CH4fD6Ul5fjV7/6Vdz1EdxuN8aMGcMM3iatzUJbT5PmPKN9zUcuNIs0Qu/xRiMOhwMVV6oaBX9+fj42btyIqqoqbN++HUuWLEFdXR0OHjyI8vJybNu2DStWrDC9q+Ppu4jc3FxTQ5p4yo4EmXyptLQUJ0+eRGVlJQ4dOqSpz8yrWhRe8wYf9D/vrCamhFNVFTU1NTonI5fLhb6+PhYdJjk5GYFAAL29veju7mZGI/QuRcFc+tVy7Dmww3RsR48ejQ8++AArV67EM888o6HR7r33Xvx35/9i/PjxmDhxIvbv34/q6moEAgF0dHTA5/PB6/Xi5MmTyM3NRW9vL9LT0zF79mwWtai1tRXnzp3D+PHj4XQ6sXPnTlRVVTEed9KkSUw+3NjYiMLCQuzfvx8lJSWoqKhAU1MTnE4nXn75Zdxwww3weDwIBAKYP38+zpw5g+TkZDQ3N2PChAk4cuQIxo8fj4aGBhY55+jRo/B4POjt7cX777+PGTNmoL29HePGjcMTTzwBAKipqUFjYyOysrIQCASwb98+DQ87e/ZsdHd3o6enhymzampqUFJSgg8//JDdNRUVFWhra0N2djZSU8MRjj0eD3NUpOiF3d3dSElJQUdHB1JSUtDe3o7U1FQmN6D5JMPiykeq4C/KxNmzZ5GTk8Mi1XR3dzNP6f7+fowbNw7d3d0Y/sho+DI9KCsr0zlximv2fNBA/0x80FDhsay2QzSesVJGrLRiPBjKcgk7yxnM+qItg09p6nQ6MW/ePPR3vsmiapWXl+vkHCKtKKvT6A6tvErfBpLZ8EYAxP9UXKmioqICycnJ7DtZVO+2tjbWjv7+fkyZMgXNzc3Iy8vDO++8o2l3eno6pt7h1RgjRtIDVlwZdiLp7OzUyLxluqLk6WdQXx/WCfD6kMoV4faR3Jx4k0AggLVr12LLli0oKChAb28vFn9xJFJSUtDV1YV77rkHP//5z1k5WVlZOqMRo3kfda0b/YLRiPju6dOnGX/e19eHkSNHoq6uDmPHjsVHH33EaEtZXzMzMzFv3jwcPnwY8+fPh8PhwLJly9Db24vXXnsN3/jGN/DQQw9pjGJjxeIvjURbTxN6e3tx0UUXISMjA8nJyTh8+DBqamqwYcMG9Pb2oqenB01NTUhNTcWkSZOkRtHl5eUap/J/xL1uhkiOGHbDzv44nU74/X7muDX7nlwpDZ+Tk8PoN6uI5IAqRpZfsGABPB4PTpw4gbKyMuzatQsLFy5ET08P+vv7EQwG4XK5kJ+fj6amJqSlpYWj5TychfaeJjQ2NmLkyJE4ceIECgsLUVdXB7/fj8OHDyMYDGLs2LFwuVzYsGEDiouL8cknn2DevHno6urC9u3bMXz4cOZ4VFxcjFOnTqGgoACBQACFhYXo6urCxo0bTY1E452buIxGysrKMG7cOBbiHgDObFZQukhFQ0MDiouL2SFJz63A6F3Zcyvl8of62bNn0dXVpQlVRx7eMovFaNqXKAx2fWawoy2ySCOqquLYG72oXOrWvc8bmvBoampCKBRilpkAsG9dK2quz5IajVjtC9UjO2jFsJ/RjgcdKFY8IRM572Zl8wSBqqo4s1nBjLXFqKur0xmN8JFGaD99/FIbMmfE3g5FUdDwvhuplwWY4HjMmDH44IMPsPPpc/DUhOdm1apVePnll1H/vgtFF2sJoUsvvRSjRo3CgQMHoCgKJkyYgOTkZHR2diI1NVXXjoe/+DqQetRaoyP0Z/bFedJ88NGUcT7mPd4yUlNTpUYjpGzr6elh62bfulakhSMh6oTXMi868cxwuVzMC5nez8zMRH19PU6+FULObL3XajThSc9sVlBxWfjMIcMJPg3N5MmTsWXLFk16Gl5YSM95UNtPbAyidJHKrOepn+PHj8eePXtY9JLRo0fj448/ZqFFyTMeAPa/3I7SRQPKscOHDxtaWsvmizcyyc/Px9mzZzFr1ixs3rwZB17pgHNcWClAfT+9CZg2x8NST2RkZOCyyy7Dj+74K+56/Do89dRTzBgrGAzizGYFYz+byxg6p9OJ5l0pyLpUZcY11F4xSgv9f+ptoORSS9MFAIypscKQJPpOTU5OtmQ0knauCqvumwan04k//OEPCWsPj2j6vmzZMmzfvh1jx45FbW0tjh49iqqqKhbS9L777kNLSwvq6+vx/TVPJOxsiRU+nw99fX0YPXo0dj35nqbsMWPGoKamBk888QRuvvlm5ObmIjU1lYW05L1bgHD0qi996UvhKAGfYuHChWwtk9fDN7/5TTY+Q5FWzM7OxrRp0/DKK68gJSUFd911F3784x8jKysLs2bNwrp16/ClL30Jbrcbzc3N8Pv9Uq/YWMMnnj
hxAqmpqSxdigzz58/XfJ43b57uncsuu0z3bPLkyVG1paCgQPdMbJeZV/TDX3wda3+w0HJ9YtnDhg0DEN5HZP0fzT1F4OddjIxH9whgHskhkWtVNGak/kWqMysrS6Oc9ng82LFjhy7c9rx585gwyOFwoKysTBpJJZH0TySQYK++vj7sffJpOXV1dWhsbITP58Oev7Zh1IpkZpTkcrnYnQaEPbnKy8tx8OBBZGRkoGt7JlCpNQ6wwj/fcMMNeOyxx7Bs2TK8+ctPcO9Dy5mnzY4dO3DTTTfhnXfewdVXX43u7m709/fjpZdeQufuDExeM1mzz3hvLyswGz8KjwyElfunTp3SvU/re/369SgpKYHL5UJTUxMLTQ2EDTwaGhqQlpbG6sy5xqUrI971QPzR9Q/MNMyPbQV2r8vS0lLp+uf7nXedlu7mo4WIZ8TpTQrSV2iNRuh+9Hq9qKysxL59+1j5b7/9NjufSbh+++2349FHH8X48eM1hvXxyIbKy8uRlpbGlBp03i1btgx/+tOfcGhdHx549m68+eabOHnyJMrKynDy5EnceuutKCsrg6qq7G557/enUFZWhrKyMlb+smXL8OKLLwIApkyZolljkdpXW1uLHTt2YMaMGUhJScHOnTvZegSAT9b3wD8n/Hd7ezsLDw4MGIPI0uvKIo0QTQ2E748zmxWMqQnzJKmpqVi+fDkyMjLgcDjw85//HJMmTUJ7ezuuvvpqdHR0YOLEiaiqqoKiKNizZ09Mnulm80h8/Fe+8hX88Ic/ZOns7CjbCiLRKiLf1LEzDVkzo3M84CMfiQ5RvAyL+Fl6Rg4JM2bMYOuL5j0YDCIUCiEpKQn9/f0oKCjAli1bkJyczCKPUDmpqal4/7ET+Lf/vpF5TBNyc3Nx991348EHH2Qphmm/X3/99SyFiaIoqFRnYeXKME3DG+LxkKWhKy4uRk1Nje75lVdeGXHsZs2apXs2ZcoU3TN+bwJhAxQAGDVqIOY+//eSJUs0799///0R2yLimmuu0Xy2SmPytF5mZiaAAVrSiPal8+Hh74fpSqIP09PTAYCF9Ofh9XrxzLeN6VBxbZ8PGuh88EFrfxBW5ojRPlNTU1kkdLvPucGE1XaYKQyj0bVEU2csSFTZdpWbqPYRfZSI+qIpY+HChXj99deZkaPL5YLD4dCUQakceIiyNlmdRkYjMhkfb5wspkk8s1mBc7GTfb7uuus0+kdCIBBg7ejv70dKSgpycnI0shtyVGhra0PrexkYsSyXGVpHUrKf2azAvagTXq8XgUAg4h6bvsaPU6dOobu7m81363Yfsmd2YM6cOWhtbcWpU6ewdOlSbNmyBenp6czosKWlBW/+/iDWfn8h2tvbkZOToymflz3zdcrm/diGfhRdPPBZURSc2QypDqa2thZ79+5ltDrRUF27M+Cb243a2lpNhMXq6mq0tLRg/Pjx2Lt3Ly666CK8/fbbKC4uRmVlJcrLy+F0OvGVr3wFXq8Xzz33HHbu3Ik77rgDBw4cwOzZs3UpQIi+OXXqFIvkRnfgw198Hff+YLWuj2PGhNOA3HzzzdL54GVLzzzzDMaPH4/GxkaN0chQlNv9I8HO/pBBc15eHtxuNz56oRkTrsvUvTdixAjMmTMHf//735GWlobTp09HnQZzxIgRyMnJweLFi5kMWJaGd/r06QCAadOmGZZFNHVxcbFOTlhZWQkALBLwhAkTNL8dOXKkrjyqMxLGjh1r+v3DX3wda+/X0optbW3SiNIyxGU0QhgzZgz27duHsrIyNAQ7cNttV+F3v/sdQqEQSkpKkJKSgoOPHYxc0KfoMaDjZM+N3uXBRx8gZT0x9EA4L2tubq7OYOSLX/wifvCDH8RUp50Y7PrMEEtbRIKINwDhmd7Oc1pGmxhnWSgintjnlY7tp/s1kUbMPGdkfaGQ2IDc+9FKGXxZ1E/yVpblvYul7HhhVjaFywTCBElPQ1jowxuNEEhJzisj2s/0IzOOdoRCIXTXa3PR0Zi1nOxF3qdGIyRsKfBVYNGiSkyZMgWbNm3CsGHDWE77GTO01isygxEAqNvfAFhMT5ORkcE8eUSCv6ch3H6jnOJWQGOiKAomT56MrVu3xlSOWdlW4HK5UFtbi61bt+LKK69k6X2MylBVFZMmTdKF6u7r60NaWhqamga86trP9CMNWoMQguihyAtLeYaiv7+fGWhkZWWxcjrPBpGDsKDszJkzbO+5XC4oioKMjIyIwgJHtw+q2sWMP3iDELFtdFbxOT95TzACeSz1NoXTN9HaFo1jqC6eWGpoaNBYxdIcUG4+AMy7SoTRfOXmhi12s7KykJ+fD1VVsXDhQjz/5j74MOB1QGeAy+UKRx2pqMDJkydRVFQEV3cqMjMzccstt6C5uRkbN25EdnY2/vDYOzqP8I4zASiKS2MswkccIdDn7gZIQ7gtWLAAGzZsYJ9HjhyJgwcPMuYmNzc34tlt99lKgtj+/n6NknPx4sXYvn07zp07hylTpuDQoUNobm7GqlWrUFpaip985iVGQIowil4RL/i+33fffWwf7dixA16vF83NzZgzZw7bdxMnTmTvjx49kICWIodlZWUhKysLeb5yAEdta1u8cLlccDqd+MIXvoC33noLaY48fOELN+Hw4cMoKChgRP3Xv/51Xc5T0WCEwAsd+PcJaWlpcDgcuP/++/HAAw+cN1qR6uf/vvPOO1FQUMDaS/mL6R0CCeKB8F4SEauxCIE38v1HR91+eyY43jHl1xl/lhKdVFlZiVOnTmHUqFEIBoNMyWpURrywg8bljUnpLnG5XJg4caImtZHT6cT8+fNRX1+PvXv3sn0dbX1WEU0Z/Lz29vZi/fr18Pv9KC0thdJ1DH5/ChPO7tq1C4d3OJE1MxONjY0sItfZs2dZbmkgPC633347nn76aZyu60aKcH1Y4Z9HjhzJ9vy6Mwfg8XiwZMkStLW1IS8vD5WVlexeojNv7dq1eHD5E9Y7bwC+HZMnT0ZbWxvz8uvv72dG3jR2Yn/4qE+nTp3C8OHD0dTUpOH1iGYmeqinQZ5mNNJcWlnHVDbvqR4t7FiXoXYP7r//K3jzzTcN80jL+i0K/Xk6m9DdoE9jo6oqhg0bhr6+PlRVVWHfvn1wOBy48cYbMWzYMCiKgpKSEhQVFeG3v/0tkpKScO2112oMJ8S2RIKz24d/+7fPMHpr3bp18Pl8AMAU66qqIjk5GXfeeSceXPcEnE4nLr74Yml5/JqQneUkoHa5XMxxwQw9DWG6benSpUhJSdEozHfu3KmhiTvPBeFH+IzOyMhg7QbkBvN8RD4xOiNFpgAGeHZVVXHLLbfgxRdfRHZ2Nqt33LhxcLlcWLBggdSpZtq0aVFFEqKIbD0NwH/+539ix44d2LlzJ66//nrs2bMHR44cwY033ojW1lZ4vV58/etfx3PPPcciL1pBvPtDtvdFQ1jNWd0kv4+NzgP6Pd1VopKMeBo+0gjP19I9Tc/cbje6u7uRnJyMbdu2Ye7cuXC73ZgzZw42bNiAkSNHoqmpiZWdl5eH/Px8NB4/juLiYixat
AiPPvooawOdq9nZ2Vi4cCEOHz6M2tpaTJgwQWeIYBdNcwGxI9o5kL3Pp0riMdg0ULx1kixNURSkpaVh3LhxyM/Px7PPPotVq1bh8OHDyMvLY+kPvvSlL+EHHz6PmTNnAghHK6QUkc3NzXj99ddRU1ODvLw8Jg/8/ve/z4wCRa9zO/tiJxI5j9E+twOJKtuuchPRvqKiIlxxxRUsHeT999+Pc+fO4Re/+EXC9+mqVaswfPhweDwe5kz5+uuvAwjTJB6PJ6wPidAO0WCCl18TjIxGug3k/yIYPdQ9wC/R8+LiYjQ3N+uiwVI7PvzwQyxcuJBFDJ02bRp2797NInsAwLlP2pBRP9CuSBHSqey0tDSoqqpzluTT4vY0ACkpKZg6dSrOnDmD9PR07N69G2lKLkaOLGSy0K6uLowdOxZbtmxBSkoKOjs7WaSDpP5MDB8+HJdffrlOrupwOHRGI0Zz1nlWO7YOhwM9DVpjcKJVKAo1rQOat7bTfSh0udi8T5gwAYcOHUJ5eTnOnTsHt9uNvr4+eDwe+P1+ZjBZVFTExgYIO3rs3LkTRUVF7DsRtIZk39tBp6xcuRKAPjLpv7KO1w7Y2R+KsKOqYZ1J2+lOzVpPS0vDqlWrcPDgQeTk5OBzn/scFEXB97//fUb3mvHxy5YtCzu1Njdj7ty5jL8iGbCo84wFQ4mmjrctxnF+owBZhF966aWYMHMUSkpKAIQvBwrt4s02/LkORu/KnlstV/Rs4C+67du3s795S3aZgiCaOu3CYNdnhljawis7eY8L0SMjxT9ACIjeTqqqMk89WchtKi81f8DIIBQKaebWSl8ihSuyUgb7zuuFx+NBfn6+RshjNX9UIuedyr7uuuvCn71eVFdXAxgIB0V/e7O1Hk2ANqUI5dYlgYkv1zi6i1E7ePT39yMl16GJ7EBrIaPQwz6TsKW4KgczZsyAy+XC/PnzmcFINCgcEXmwFyxYgDVr1uDuu+/GZz/7WXzuc5/DF77wBSQnJ2P16tUYNWoUvNnA0aNHTUOoRwKNCUXKkXmXxFt2JNx333343Oc+h8WLF+P+++9HbW0tG1ejMoyiO4RCIeZhQ+OSVhDuEy+gFgWY4kXPe9NROcFgkIVspTXpy3Nqfs8L6YgIjoTUPCeztievX76NokELbwRBhizieBQWFgIAikaGLcWJGOFDTPOf+VDqADB8+HAAYSGKbA6oPOpfWlpa2KtGeJcX2tM5u2DBAgQCARQUFMBflsa+I2/nrJJw+ofCwkKMHz8e8+bNg8PhQNXE8Jpwu93wer0YNmwYkpKS4M0eGJsRI0ZAURT4cp0aRlIWaYQfU2+WqlHejx8/HoA+2pPb7cbnPvc5Rofcfvvt+sERYPfZGgwG8fnPfx6pqam48847mcfdmDFjMH78eNx+++24/PLLsWDBAowaNQrV1dVIS0uTnjv3338/Zs2aJY2uEC9WrVrF+v7Nb34TiqLA4/HA5XJh8uTJGD9+PCOczeB0OnHvvfdqnhVUZsbdvkjzYmT0Bwx4GJKnARm7+Hw+DB8+HMPHF8Hn82HChAkaxbIdTAFh3Lhx7O9vfOMb54VWXLRoEYDwOiLF8P3334/CwsK4jRMuQAsrdEO0iDY9I6DdN11dXWwd0h02cuRIlrbHyCAqEWs1Pz/f9HuzOnmjEfq7q6uLCejI4+rSS8OuaqtWrYLf70cgEEAoFMJ//Md/6KLE2NHHaMrg99tXvvIVfPnLXwYQNmaeuiB8RzgcDkyZMgWLFi1Cbnkau789Hg8Lew+E8+bed999AMKGpIqiwF+uVcCbtc/oOa1hr9eL3NxcnaG17F0rKCws1JVVWFiI2lmjUVVVhfvvvx9Lly7F6tWrmQEbT7MxOkCokryHysrKEAqFmNGADMTPebPl6WUizWWkaGGKosCbHV6jRl75VhDvuhw/fjxmLAzfd/PmzTOMVMRSy2RDx4PTXqF0Fzx8uQP8OH0XDAaxcuVKZlRNNCGlkFIUhQl7165dCyDsMc/PbaS+815T9957LybPH4+CggKkpqYiJycHKSkpyM/PR01NDVJTU3V0d1TrVfIurcdZs2Zh2LBhzOCRIi3wERIyMzNRO2s0Vq1aJY2kIfY75VM+ub+/Hy6XC16vl/HjBJ7nkRmNEFwuF3uPePZQKMQE9jyIrjSKwsobBgNgcgFCeno6vvrVr2Lt2rW49957cffdd+O2227DlAUTkJqaitmzZ+Ouu+5CTk4O5s2bh1tvvRUul0vjIUsyHquId39Eqks0Gkkv8EiNc2Tg5U+ySCO8YTxvQELl8qlPFUXB2LFj4XA4UFhYyM42n8+HQCDAvr/yyitx5swZFgVu2rRpYf7s0zVMvOKcOXNw//33M5kZpXXt7OyEy+WS8sKJoGkuIDpEOwey90eNGoUpU6boFKCDTQPFW2dFRQWmT5+O1atX495778Xs2bMxZswY/Pu//zsqKysxYcIEjBkzBl/96lfxjW98A8nJybrx8Hq9cLvdyMvLw+rVqzF27Fjk5uYiOTkZycnJuOWWW3D55Zdj5MiR0gg3dvXFTtg5j2L0nGhpSDsQbdl33nlnQsq1oxze8YJA0YKA8DlcU1ODO+64A4WFhUhJSWGGrXl5eaitrbV9n/IRmO68805UV1eziDvEY/h8PnZ/kRFjeqHX1BBfpDUjtZu/S5Mk74qyVfp70qRJSM1zaZwISD5ZVlaGa6+9FgDwuc99TtcOMsB/9913oaqqxmDESptF8DJ6mUyFp6282WF+LxgMwuv1YsGCBQCAguFZLAWMx+NBT08PkpKSmMHGjBkzkJ6ejtGjR6NwZDYURWGRp77+9a8DADM2Fukbw/2bpZWfE61IfeHLdLvdUFWVpVujeUvOUZhTpNPpRG9vLzOKoXL6+vrgdrsxbdo0jcEyj9TUVNxxxx3GgxwBdtIpogzwX1nHawfs7E9lZSU++9nPss/JOVojkPb2duYYwMs4ad3JaP9rr70W5eXlKCgogMPhQHl5OaO7EyEjHUo0dbxtscVoZNiwYcjNzUVJSQmu/8Yc9pxyVjY1NaFornVBqNG7sudWyxUvCTGqCF1Uc+bMwTXXXIPMzEzDxRNNX+zAYNdnhmjbYsRsizleVVVFxUIPY6r5se/q6mJWopTLk4fb7Wbvj1meockja2dfoi1DVVX09fUhOTlZI/ChNB133XVXwttnhC/++gbce++9GDVqFPx+P7Kzs5lSlj9kXS4XiuaqOoU2Hw2ChBn0/cjLjYW4gDY05+gr9ELv/v5++Kf3sfqBgXVUuyoPkydP1kQauY47c2LF9RbKmDt3LsrKyuD1epGTk8MupS9+8YuoqqrC9ddfj6K5Kurr66MOi8WD5p32h8/n0wnu4i3bDHfddVeYWUhP1xBT9LdRGeJ+I0YkFAqxv8lgasLKsACRJ0Z563H6ncwog3+HFGC8oG7EZWGBrRjaUMaM8N/zoHVJxDzNp1FZYi5rOsN4oS39PfHasCItktEICVl5L2t6zs8BGXi5XC74/X6kp6cjNzcXXq8XycnJGHdVlsZTOSkpCd/85jcBhA0s
iPgnRvHiO0dh7ty5GDNmDGprazFlyhRMuaEYFRUVTCFNinl+32RkZDCBS9FcVTN3iqKgammq1GjEyCOBLwMYsDa/+eabmQHOvHnzsGjRImRnZzOm3CwlGV+2naBchp///OeRnZ2NWbNm4f7770dGRgZmz57NlAzjx4/H9ddfz35ndO4sXLjQULkbD6qrqzXnS6wYM2aMbpyX/+ckjWAiFmM3o3nhc4gbgU+dVlVVhcsuu4ytn6qqKtzyUBS5jmJARUWFJlyh0+kcdNpt/h0jMWLEiEGt818ZVuiGaBGL0Qi/zurr65kHd0pKimafdnR0GNLFdq5VEiRGivgUqU6ejlcUBaWlpUhNTcX999+PVatW4f7779eFCL3kkkvQ2NioE2RZqc8KoinD7/dj2bJluP/+++H1epGUlITly5cDCK+doqIiBAIBLFq0CDNmzMDX/7AGy5cvx+LFi+H1enHjjTciKSlJ048bb7wRDocDS5cuxcov6cOjRsM/UzusIpp3FyxYAJ/Ph//4j/8AEDYguvPOO/FvP1yG1au1YYXJQaOhoYEZ6dL9JLbb4XBgxowZyM7ORk5ODhRFMYwkQRDLIHou3vXgcDgw4eochEIh+P1+XfoEq4i3HStXrtTMjVG4bH4fGNHUshRWIy5LYe/x/5OBCdG7slC+fNk8KASvWd+vueYazJ07F5/5zGfg9/t164/oLp6O5M9Pu9Y2zestt9wCALjxxhvh9/sZjfalL30J99xzD/7th8sMyxCF+yOWDDglkbEIndVkLCAzlJelc+SfE90hi3phBTSOtCf5SHMulwvXXXcdPB4PSktLmdFCSUkJ1v7XYvZeJBqcjEas8rXx7g8jWV5ycjJTkvDv1Fytj7AGQGf0IdbBp5/hf8OnpOHL4RVgxDeSkob22bRp05CSksJ+S+lSli9fjquvvpopznJzc/F/fzqQSsXv9xuei7feeqv0OZAYmuYCokO0c8C/Tzz4/PnzWfhzHoNNA8XyfmVlJWbPno2CggKoqorFixdj5MiR8Hg8SE5OhtvtRnp6OpKSklBeXo709HR4PB527piNn+wsIL7yuuuui3gmDRV5fDztIB6RyhBTfUZLQ9qBaMu2qtyzq83RlLN4cfgudLvduPrqqzF58mTmTO33+3HTTTexd4Bw9M05cwbW7JVXXolxVxmnb42lzampqbjrrrtw1113MfmZiC984QsABugqAKi+MsPUMFvkJ63wdITiT/2iItGMihKOPjhmWbruzgyFQggGgygqKoLD4UB2djZuu+02FM1VsWzZMtbGbdu2obe3V+O0y9PD0cjDqI+qqmoMh0nmxdM/RXNVRlOTrBUArvnydASDQQwbNgwjRozAihUrNLRFTU0NsrOzMWLECN15RmXU1tZKI/FH2r/8+InPhg8frpl/PuI/AFRc6mGR7YyiW5Ms2Qxut9swwogV2EmniBF1/5l1vHxq0ETBrv5QpEbe4LxycRI7cxYtWsSihomYPXs2AO05kpqaioqKCqSlpSErKwvV1dVMTyOLqmwXhhJNHW9bbDEa4fGTW59nf6uqisrKShQXF+PIc9atd4zelT2PplzCl7/8ZQQCAVx11VXsGXkUBYNBJCcns9B29D8AXHTRRTHXGQ8Guz4z2NEWVVU1KR/o2d4nehiDTZcgRaohNDc368rjCY4PfhMOx0i56cxgpS9mFrZGZZAgqbe3F+np6ZpLnYxGqqqqkJubiy9/+cu45pprpAdWIuadvFL+8vWtTOBDQjcZ0eR0OnHkuQEjHpnRCK8EdTgc2P4HvUefWCYQVmjs/KPck4/6TnNL62Tj/xxHbW2txmjkf9a+aLn/RuDPLRnE3Lg8NErvHWFLYKPw0FZAfSdBk9PpxKpVq2IuT1a2GYy8g2kOxDLochZBRGYwGMSIESPg9XrZGfrOL8Kpe2RCUdFQhPegE73uyHBHURTmgbzzkfCaohQ05G0qMje8sYWIt356UvMdlSUKpnnBO783ZEJF8vZ666d1rD98v8X+JyUlAQB27drF6po2bRoURWFzcPPNN8PpdGLJkiUaAxoeu/8UTp/EnzG80Q2/lxwOB5782geatnk8Hmz9XTOcTqeO2TTaN/yZQWO07fctUm9J2blDfRSNTIDwnNDYkNCXyikuLg57YHNGMkbtsxPEZFkxWOFhdu7YGWEIGIh00/D3DHzmM5+Jq6yrr75a9+y3nwunDIrHwI2fF1LUuN1uzR4tLy+XnlFOpxMTJ07UeN3yiHTGR0KkfOjXXHONLvQ+9cfojLQbH/y2KaHMzwVoEe+a4kHeQ5GMnWXg901mZibmzJmDqqoqpKWlsXtBVVU0NTWhr68P06ZN052Rdp6JR44cibrdIsRII2LaN9kZcMMNN2D06NG47bbbWK5nALjpppsi1mcV0ZRRXV2tUboCA+mZfnLr87jqqqs0BhI/u+NlFBQUYPr06Vi1ahVSU1OxZs0aTJkyhUXQovksLy/Hn7/0LosqQ/9Hwz9TO6xC9i5/Ll566aUsOsTIkSMxa9YspKWlac5kWRmk6PL7/cwIge5Ssd0UppairdDniRMnMhrB6Bzmy5A952ElApXD4cChp8Jp/LKzs7FmzZqIv5EhnnVJvAk/rpGMRqg+MZonII80sufPXYyn4aP88bwjORBYNXYl3tqo71/72tcAhI2PSMklrh2qj+cL+LbHu7aJBuEjbwLhMcrMzGSG4KTIjFSfoijsPNj1pw72PDc3FzNnzgwbfBQVMcGuLD2NzGiE598dDgda38rRGLBHA3qfDP3T09Px5S9/Gd/85jfx9a9/3VDQH81Y09nMO5CYId5z28gwnfgQ0Rjk7f89qfkdP86yss0ijfBl03zS33w0VdFZgupasmSJNOIonbNJSUnsO34O+D6TkwCVaaQ0FMu4gPODaOeAf//yyy8HoHWm4zHYNBC97/V6UVFRIZVl0V1QXl6O3Nxc3Hjjjbjkkktwxx134JJLLom6fbGuYYquQMpmGYaKPD6adqSlpWHs2LFISkrCnDlzsGjRIsydOxdHnlNwxx136GQw0dKQdsBq2StWrMD8+fMt0xl2tTmacuhMdzqdyM/Px9KlS9m6z83NRXl5uSaSPS9DJ5x6RR7pPhpQm8eNG4elS5ciPz8/YvRH0s9QH0ieb7YnZHWalU84/MyAbFeEzLlvy28bdUYjXV1dzFj5nnvuAQCUlJTgyHMK47X4aBe8bL6goIC12SyKrYj6N9KYDK22tpbJXEi5TfpEl8sF50cjmXyV2nnTTTfhl3evx6RJk5Cbm4vs7GxGV8nObPE8UxQF99xzD+bPnw9VVVmfqP5I+5d4aJfLxZ55vV78+7//O4LBIDIzMzVGIykpKYyn2v1oJxwOBwKBAJKSkjB9+nTL42Yn7KRTMjMzNfvin1nHS3tHFoXQLkTqTyQDLdLN8zofIOw4+tFj3cx4v7a2FitWrDDk1WfOnAmHw8HW7k033aSLAhiP86RVDCWaOt622DZa8+fP13wmJpsxvI7zS2hRKLrc3Fy43W5MnjwZEyZMwOWXX44JEyagr68
0n+YfRfE/2Uex1gkKT8pRz6HXebXNp1ucClvLn3PS42OZUT1RiJMVar2FPFRB6Ace2q2pw1bM3FTNyLcrZnQtuETRvHCRNwiUWCHkmKc1g2udjj0j9M5EU2liunud247XHv61y0OT3j0D1X1Sb3WB6EjbGIiz1qklQ3GHDyWdcX9EUjgiAIgiAIgiAIgiAIgiAIgiDwI8ynaQRBEARBEITyAn3RyLGnTFY6rquhootF3DZNXGMQlL5gaGO0Myp7tmqr+kHV14OOc6pHTr5QaXPRoNROkn8Y/ddEP+VeBxgkKX8ph37H3SaXdl0ucClv7n2Pi01O5UQ1RmKM1Sr2VDGRB2Bcu6o2Zw1bczET96Kc7ZnQNmHTxnHCBFxikaCHiTjnOA4Efa0+fzEJp/Ejbntc+oeJvMjGcuU0txu3Pe75OhdtTs84dM9Vtck9lgdhYyziYo+aJNUNBpx81vUFfdHIuBkdSsd1NVR0sYjbpolrDILSFwxtjHZGZc9WbVU/qPp60HFO9cjJFyptLhqU2knyD6P/muin3OsAgyTlL+XQ77jb5NKuywUu5c2973GxyamcqMZIjLFaxZ4qJvIAjGtX1easYWsuZuJelLM9E9ombNo4TpiASywS9EhSnMOyycUel/5hIi+ysVw5ze3GbY97X+eizekZh+65qja5x/IgbIxFXOxRk6S6wYCTz7q+oC8a+e519ykd19VQ0cUibpsmrjEISl8wtDHaGZU9W7VV/aDq60HHOdUjJ1+otLloUGonyT+M/muin3KvAwySlL+UQ7/jbpNLuy4XuJQ3977HxSancqIaIzHGahV7qpjIAzCuXVWbs4atuZiJe1HO9kxom7Bp4zhhAi6xSNAjSXEOyyYXe1z6h4m8yMZy5TS3G7c97n2dizanZxy656ra5B7Lg7AxFnGxR02S6gYDTj7r+oK+aEQQBEEQBEEQBEEQBEEQBEEQBP4Efa5GEARBEARBKCPcCAwMDLgA4A4MDBT87pn/esX3b4KOq5zrd1xFF4u4bZq4xiAofcHQxmhnVPZs1Vb1g6qvBx3nVI+cfKHS5qJBqZ0k/zD6r4l+yr0OMEhS/lIO/Y67TS7tulzgUt7c+x4Xm5zKiWqMxBirVeypYiIPwLh2VW3OGrbmYibuRTnbM6FtwqaN44QJuMQiQY8449z111/v9vf3u7/61a/cRx991L3hhhvc/v5+9/rrr3e/8Y1vuL///e/dO++8Mzb/MEhSLArCRF5kY7lymtuN2x73fJ2LNqdnHLrnqtrkHsuDsDEWcbFHTZLqBgNOPvv5UmxNRz7oO408/PO/Kh3X1VDRxSJumyauMQhKXzC0MdoZlT1btVX9oOrrQcc51SMnX6i0uWhQaifJP4z+a6Kfcq8DDJKUv5RDv+Nuk0u7Lhe4lDf3vsfFJqdyohojMcZqFXuqmMgDMK5dVZuzhq25mIl7Uc72TGibsGnjOGECLrFI0CNJcQ7LJhd7XPqHibzIxnLlNLcbtz3ufZ2LNqdnHLrnqtrkHsuDsDEWcbFHTZLqBgNOPuv6gr5o5M/3PKd0XFdDRReLuG2auMYgKH3B0MZoZ1T2bNVW9YOqrwcd51SPnHyh0uaiQamdJP8w+q+Jfsq9DjBIUv5SDv2Ou00u7bpc4FLe3PseF5ucyolqjMQYq1XsqWIiD8C4dlVtzhq25mIm7kU52zOhbcKmjeOECbjEIkEPE3HOcRwSe1HO1yVJsSgIE3mRjeXKaW43bnvc83Uu2pyeceieq2qTeywPwsZYxMUeNUmqGww4+azrC/qikfphNUrHdTVUdLGI26aJawyC0hcMbYx2RmXPVm1VP6j6etBxTvXIyRcqbS4alNpJ8g+j/5rop9zrAIMk5S/l0O+42+TSrssFLuXNve9xscmpnKjGSIyxWsWeKibyAIxrV9XmrGFrLmbiXpSzPRPaJmzaOE6YgEssEvSIM86dc845gec6jgMdHR0wfvz42PzDIEmxKAgTeZGN5cppbjdue9zzdS7anJ5x6J6rapN7LA/CxljExR41SaobDDj5rOuL47quq/pH+/btg1QqBQMDA9Dc3KzlgCAIgiAIgiAIgiAIgiAIgiAIdGzfvh2mTZsGbW1tcP/998PRo0fBdV1oamqC0047Teb5BUEQBEEQEobKmg70nUau3fA9peO6Giq6WMRt08Q1BkHpC4Y2RjujsmertqofVH096DineuTkC5U2Fw1K7ST5h9F/TfRT7nWAQZLyl3Lod9xtcmnX5QKX8ube97jY5FROVGMkxlitYk8VE3kAxrWranPWsDUXM3EvytmeCW0TNm0cJ0zAJRYJenCKc36freE0fsRtj0v/MJEX2ViunOZ247bHPV/nos3pGYfuuao2ucfyIGyMRVzsUZOkusGAk8+6vqAvGjl8cFDpuK6Gii4Wcds0cY1BUPqCoY3Rzqjs2aqt6gdVXw86zqkeOflCpc1Fg1I7Sf5h9F8T/ZR7HWCQpPylHPodd5tc2nW5wKW8ufc9LjY5lRPVGIkxVqvYU8VEHoBx7aranDVszcVM3ItytmdC24RNG8cJE3CJRYIepuJc2I3HOY0fcdvj0j9M5EU2liunud247XHP17loc3rGoXuuqk3usTwIG2MRF3vUJKluMODks64v6ItG5qwcr3RcV0NFF4u4bZq4xiAofcHQxmhnVPZs1Vb1g6qvBx3nVI+cfKHS5qJBqZ0k/zD6r4l+yr0OMEhS/lIO/Y67TS7tulzgUt7c+x4Xm5zKiWqMxBirVeypYiIPwLh2VW3OGrbmYibuRTnbM6FtwqaN44QJuMQiQY+449wll1ziu6MIhr0o5+uSpFgUhIm8yMZy5TS3G7c97vk6F21Ozzh0z1W1yT2WB2FjLOJij5ok1Q0GnHzW9QV90cjUxWOUjutqqOhiEbdNE9cYBKUvGNoY7YzKnq3aqn5Q9fWg45zqkZMvVNpcNCi1k+QfRv810U+51wEGScpfyqHfcbfJpV2XC1zKm3vf42KTUzlRjZEYY7WKPVVM5AEY166qzVnD1lzMxL0oZ3smtE3YtHGcMAGXWCToEXecq6urI7MX5XxdkhSLgjCRF9lYrpzmduO2xz1f56LN6RmH7rmqNrnH8iBsjEVc7FGTpLrBgJPPur6gLxr59yvvUTquq6Gii0XcNk1cYxCUvmBoY7QzKnu2aqv6QdXXg45zqkdOvlBpc9Gg1E6Sfxj910Q/5V4HGCQpfymHfsfdJpd2XS5wKW/ufY+LTU7lRDVGYozVKvZUMZEHYFy7qjZnDVtzMRP3opztmdA2YdPGccIEXGKRoEeS4hyWTS72uPQPE3mRjeXKaW43bnvc+zoXbU7POHTPVbXJPZYHYWMs4mKPmiTVDQacfNb1BX3RiCAIgiAIgiAIgiAIgiAIgiAIgiAIgiAIgmABbgQGBgZcAHAHBgYKfvfE71/y/Zug4yrn+h1X0cUibpsmrjEISl8wtDHaGZU9W7VV/aDq60HHOdUjJ1+otLloUGonyT+M/muin3KvAwySlL+UQ7/jbpNLuy4XuJQ3977HxSancqIa
IzHGahV7qpjIAzCuXVWbs4atuZiJe1HO9kxom7Bp4zhhAi6xSNDDRJy7++673T/+8Y/uDTfc4Pb397vXX3+9+41vfMPdt29frP5hkKRYFISJvMjGcuU0txu3Pe75OhdtTs84dM9Vtck9lgdhYyziYo+aJNUNBpx89vOl2JqOfNB3Gnny9y8pHdfVUNHFIm6bJq4xCEpfMLQx2hmVPVu1Vf2g6utBxznVIydfqLS5aFBqJ8k/jP5rop9yrwMMkpS/lEO/426TS7suF7iUN/e+x8Ump3KiGiMxxmoVe6qYyAMwrl1Vm7OGrbmYiXtRzvZMaJuwaeM4YQIusUjQw0SccxyHxF6U83VJUiwKwkReZGO5cprbjdse93ydizanZxy656ra5B7Lg7AxFnGxR02S6gYDTj7r+oK+aOThn+1UOq6roaKLRdw2TVxjEJS+YGhjtDMqe7Zqq/pB1deDjnOqR06+UGlz0aDUTpJ/GP3XRD/lXgcYJCl/KYd+x90ml3ZdLnApb+59j4tNTuVENUZijNUq9lQxkQdgXLuqNmcNW3MxE/einO2Z0DZh08ZxwgRcYpGgR5LiHJZNLva49A8TeZGN5cppbjdue9z7OhdtTs84dM9Vtck9lgdhYyziYo+aJNUNBpx81vUFfdFIZZW/ZNBxXQ0VXSzitmniGoOg9AVDG6OdUdmzVVvVD6q+HnScUz1y8oVKm4sGpXaS/MPovyb6Kfc6wCBJ+Us59DvuNrm063KBS3lz73tcbHIqJ6oxEmOsVrGniok8ANxWtbYAACbASURBVOPaVbU5a9iai5m4F+Vsz4S2CZs2jhMm4BKLBD24xLmmpiaoqOB9Lx+3PS79w0ReZGO5cprbjdse93ydizanZxy656ra5B7Lg7AxFnGxR02S6gYDTj7r+uK4ruuq/tG+ffsglUrBwMAANDc3azkgCIIgCIIgCIIgCIIgCIIgCAItv/71r6GlpQVGjRoF3/3ud8F1Xbj88suVPlsjCIIgCIIg2IHKmg705S9fOO+HSsd1NVR0sYjbpolrDILSFwxtjHZGZc9WbVU/qPp60HFO9cjJFyptLhqU2knyD6P/muin3OsAgyTlL+XQ77jb5NKuywUu5c2973GxyamcqMZIjLFaxZ4qJvIAjGtX1easYWsuZuJelLM9E9ombNo4TpiASywS9DAR52pqaqC6uho6Ojoyx4IWjHAaP+K2x6V/mMiLbCxXTnO7cdvjnq9z0eb0jEP3XFWb3GN5EDbGIi72qElS3WDAyWddX6qQ/Mjw9r6DSsd1NVR0sYjbpolrDILSFwxtjHZGZc9WbVU/qPp60HFO9cjJFyptLhqU2knyD6P/muin3OsAgyTlL+XQ77jb5NKuywUu5c2973GxyamcqMZIjLFaxZ4qJvIAjGtX1easYWsuZuJelLM9E9ombNo4TpiASywS9DAR5xYsWEBiL8r5uiQpFgVhIi+ysVw5ze3GbY97vs5Fm9MzDt1zVW1yj+VB2BiLuNijJkl1gwEnn3V9QV80Mn3pWKXjuhoquljEbdPENQZB6QuGNkY7o7Jnq7aqH1R9Peg4p3rk5AuVNhcNSu0k+YfRf030U+51gEGS8pdy6HfcbXJp1+UCl/Lm3ve42ORUTlRjJMZYrWJPFRN5AMa1q2pz1rA1FzNxL8rZngltEzZtHCdMwCUWCXqYiHP5u4pcfPHFKPainK9LkmJRECbyIhvLldPcbtz2uOfrXLQ5PePQPVfVJvdYHoSNsYiLPWqSVDcYcPJZ1xf0z9PMXztJ6biuhoouFnHbNHGNQVD6gqGN0c6o7NmqreoHVV8POs6pHjn5QqXNRYNSO0n+YfRfE/2Uex1gkKT8pRz6HXebXNp1ucClvLn3PS42OZUT1RiJMVar2FPFRB6Ace2q2pw1bM3FTNyLcrZnQtuETRvHCRNwiUWCHhziXENDA4q9KOfrkqRYFISJvMjGcuU0txu3Pe75OhdtTs84dM9Vtck9lgdhYyziYo+aJNUNBpx81vUFfdHIzZ/4pdJxXQ0VXSzitmniGoOg9AVDG6OdUdmzVVvVD6q+HnScUz1y8oVKm4sGpXaS/MPovyb6Kfc6wCBJ+Us59DvuNrm063KBS3lz73tcbHIqJ6oxEmOsVrGniok8AOPaVbU5a9iai5m4F+Vsz4S2CZs2jhMm4BKLBD2SFOewbHKxx6V/mMiLbCxXTnO7cdvj3te5aHN6xqF7rqpN7rE8CBtjERd71CSpbjDg5LOuL+iLRgRBEARBEARBEARBEARBEARBEARBEARBEAQLcCMwMDDgAoA7MDBQ8Ls//+Y5378JOq5yrt9xFV0s4rZp4hqDoPQFQxujnVHZs1Vb1Q+qvh50nFM9cvKFSpuLBqV2kvzD6L8m+in3OsAgSflLOfQ77ja5tOtygUt5c+97XGxyKieqMRJjrFaxp4qJPADj2lW1OWvYmouZuBflbM+EtgmbNo4TJuASiwQ9TMe5/v5+NHtRztclSbEoCBN5kY3lymluN2573PN1LtqcnnHonqtqk3ssD8LGWMTFHjVJqhsMOPns50uxNR35oO808tITrysd19VQ0cUibpsmrjEISl8wtDHaGZU9W7VV/aDq60HHOdUjJ1+otLloUGonyT+M/muin3KvAwySlL+UQ7/jbpNLuy4XuJQ3977HxSancqIaIzHGahV7qpjIAzCuXVWbs4atuZiJe1HO9kxom7Bp4zhhAi6xSNAjSXEOyyYXe1z6h4m8yMZy5TS3G7c97n2dizanZxy656ra5B7Lg7AxFnGxR02S6gYDTj7r+oK+aOS3P35C6biuhoouFnHbNHGNQVD6gqGN0c6o7NmqreoHVV8POs6pHjn5QqXNRYNSO0n+YfRfE/2Uex1gkKT8pRz6HXebXNp1ucClvLn3PS42OZUT1RiJMVar2FPFRB6Ace2q2pw1bM3FTNyLcrZnQtuETRvHCRNwiUWCHkmKc1g2udjj0j9M5EU2liunud247XHv61y0OT3j0D1X1Sb3WB6EjbGIiz1qklQ3GHDyWdcX9EUjgiAIgiAIgiAIgiAIgiAIgiAIgiAIgiAIAn8c13Vd1T8aGBiAlpYWeOGFF6C5uTnnd0eOHIXKysK1KEHH/VDRUNHFIm6bJq4xCEpfMLQx2hmVPVu1Vf2g6utBxznVIydfqLS5aFBqJ8k/jP5rop9yrwNb/aCyWQ79jrtNLu26XOBS3tz7HhebnMqJaozEGKtV7KliIg/AuHZVbc4atuZiJu5FVYjbngltEzZtHCdMwCUWCXqYjnNf/epXYcuWLUb8wyBJsYjSD8pcEcsXLro2tmHu+ToXbU7POCj94z4vq4KNsYiLPWqSVDcYcPLZz5d9+/bBmDFjYO/evZBKpYr+faRFIy+++CKMGTNG9c8EQRAEQRAEQRAEQRAEQRAEQRAEQRAEQRCEGHjhhRdg9OjRRc+JtGjk6NGj8Le//Q2amprgzTffhDFjxhTsOuKtXPHbjSTM74POCfN3YcDS4WrPtH1se1H1dPzgcg1x6aroYJ8bdA5GDMAoHy4a3PU
4+lQOeth9QqU/Rjmue73YGjbYjNs2l7Ydl55pO6bsmabcrjcb29tw0mJv3OORTm5c6pyoY7LqNdj0d1T3OFw1yuF647pGE3MQmBpc7CTpWlQx7ZNp+3GQ1GsshzhHqWfaTtz2MNoLl7ZAqW2Tr9jatufdXPwIcz72fRflM1Tu7ZajPe55B0f/uPjExY8gqP1zXRfefPNN6OzshIqK4juiVEUxUFFRkVmN4jgOAAA0Nzf7XkzQ8bC/DzonzN+FAUuHqz3T9rHtRdXT8YPLNcSlq6KDfa5KHFG9Xozy4aLBXY+jT+Wgh90nVMd1yjygFCbGVpPjeVy2ubTtuPRM2zFlzzTldr3Z2N6GkxZ74x6PdHLjUudEHZPDnmPj31Hd43DVKIfrjesaTcxBYGpwsZOka1HFtE+m7cdBUq+xHOIcpZ5pO3Hbw2gvXNoCpbZNvmJr2553c/EjzPnY913Y94VR/VBF4q0ZOPrHxScufgRB6V+pz9J48PjIjiAIgiAIgiAIgiAIgiAIgiAIgiAIgiAIghArsmhEEARBEARBEARBEARBEARBEARBEARBEAShDNFeNFJbWwvbtm2D2traUMfD/j7onDB/p+M3FXHbM20f215UPR0/uFxDXLoqOtjnqsQR1evFKB8uGtz1OPpUDnrYfUJ1XKfMA3SvhQKT43lctrm07bj0TNsxZc805Xa92djehpMWe+Mej3Ry41LnRB2TVa/Bpr+jusfhqlEO1xvXNZqYg8DU4GInSdeiimmfTNuPg6ReYznEOUo903bitofRXri0BUptm3zF1rY97+biR5jzse+7sO8Lo/qhisRbM3D0j4tPXPwIgpN/juu6rmknBEEQBEEQBEEQBEEQBEEQBEEQBEEQBEEQhHiRz9MIgiAIgiAIgiAIgiAIgiAIgiAIgiAIgiCUIbJoRBAEQRAEQRAEQRAEQRAEQRAEQRAEQRAEoQyRRSOCIAiCIAiCIAiCIAiCIAiCIAiCIAiCIAhlSKyLRm6++WZwHAfuvvvuOM0KCeDuu+8Gx3Hg5ptvNu0KS8q5fLq7u+GEE04w7YYA5d0Ok8YzzzwDp512GnR0dIDjOHDRRReF+jsZ5wVBEAQs4hpTTOcvurmsjL14mG4LXH0RBB2kLQ8h8xYCJjL2RyPqPEdUJP6pI21biEqS2k6SriVO4ojxEteTg+TmwZAsGnnkkUfgyiuvhGeffZZCXhCsQ/qEvZRD3ZXDNQpqXHTRRXDPPffAxz/+cbjlllvg/e9/v+950naEpCJtWxAEzkiMEpKCtGVBEIRcuMdF7v4VI+w8hyCoYHOfEIQkITFeEHCoohB95JFH4KqrroITTjgBuru7KUwIZcbxxx8PBw4cgOrqatOuRIK6T9hePpwph3iGdY3SDpPBwYMH4Te/+Q188IMfhI9+9KNFzy2H/iGUJ9K2BUHgTHaMEvDglMty8oUSGW8FQRBy4R4XufsXhMo8BxblMpaXO7b2CUFIEnHFeInrQjlAsmhEELCpqKiAuro6026wRcpH4IC0w2Tw6quvguu60NbWZtoVQQjFkSNH4ODBg9DQ0GDaFUEQBMFiOOWynHwRBEEQBNsxMc8hY7kgCEI8xBXjbYvrMl8qRAH98zRXXnklXHzxxQAAcOKJJ4LjOAXfkDp69Ch84QtfgAkTJkBtbS1MnjwZvvGNbwDA0Kqwa665BqZPnw51dXXQ0tIC69atgz/+8Y/avlFq5zM4OAiLFy+GxsZGePzxx3N+92//9m/gOA5cccUV6HY93nzzTfinf/onWLBgAaTTaaitrYWJEyfCJz7xCXj77bfR7HjfWPvlL38Jn/70p2HcuHFQX18PCxYsgN/+9rcAAHDPPffAkiVLoLGxEUaNGgX//M//rGxH5Xthnk+/+tWvAtuZH1RlFqZP6BL1e2q7d++GrVu3wpgxY6CmpgbGjBkDW7duhddff73g3KjlGkRQea9du9bXTk9PDxx77LGQSqWgubkZ1q1bBzt37lS2qwJG3WHHnXfeeQeuvPJK6O3thYaGBmhpaYGZM2fCxz72sUh6mO0zajukjiMq7Tysrxj94Ic//CE4jgNf/epXfX8/ffp0mDhxIriuq+xnPmHL4KKLLoJx48YBAMBVV12VaQ9+39H0azvez6bGeSpM+RxnLuG17TvvvBOuvPJKGDduHNTW1sKsWbPgtttuU9ajKrNsP//5n/8ZJkyYAHV1dfCd73xHSzebOMbtbLDjusCTuOIIdr7mEde9hR+Dg4MocQkb1Tp94YUX4KyzztLOZfNjlPfva6+9NqfOJ06cCOvXrydrc4cOHYLPfe5zcMwxx0BDQwOkUino6+uD66+/vujfhc1JVPPDVCqlnU/qfqMaM56r+GLynk4HlfEWM58vRZS2jeFfnHUTtf9y48orrwTHcXw/CWDqO+VYsT4qNt5rCe+ieh9S7L7bD918Lu77JCxU5jkw0c0rgrjjjjvAcRz4X//rf/n+ftGiRdDR0QGHDx/OHOM+N5mPatsuBmUOQdkn4pwniDovqxJT4poDDdN2VPwO29/eeust9PG32H1wUJ/+3e9+V9QPTvcCWMQZ46niOgZxzJeqwCUnNZ2bW4mLzKOPPupeeumlLgC4n/rUp9xbbrnFveWWW9z777/f/frXv+4CgLtgwQJ31qxZ7mc/+1n3K1/5itvb2+sCgHv33Xe7J5xwgltTU+Nu2rTJ3b59u3vttde648ePd+vr690HH3wwsl+HDh0i0w7i2WefdVtaWtzZs2e777zzjuu6rvvnP//Zra+vd5csWeIODg6i2/R47LHH3BEjRriXX365+5WvfMW9/vrr3Q0bNriO47irVq1Cs+PVaV9fnztnzhz3i1/8onvttde66XTabWpqcn/4wx+6bW1t7ic+8Ql3+/bt7gknnOACgHvLLbco2bnrrrtcAHC//vWvh/YpqJ3de++9vn9HVWbF+gQWKuXjsXfvXnfSpEmu4zjupk2b3P7+fnfz5s2u4zjulClT3H379uWcH7VcgwgqbwAosHPNNde41dXVLgC4Z5xxhtvf3++eddZZ7pgxY9x0Ou0uW7ZMyXZYdOuOIu5ccsklLgC4F1xwgXvjjTe6119/vfvhD3/YnTt3rrKW6+K2zyjt0HVp44hqOw/rK0Y/OHz4sDty5Eh34cKFBb974IEHXABwr776aiX//FApg/vvv9/98pe/7AKAe/rpp2fawyuvvFKg69d2tmzZYmycp8JE/pBNXLmE17bnzp3r9vb2utddd5173XXXZepOpV9Tlpnn5+zZs91p06a51157rdvf3486psYxbmeDHdcFfsQZR7DzNY+47i2ywYxLxYiSv6jW6Z49e9zu7m63srLS3bp1q1Yumx+jvLF3+vTpmTr/4he/6NbX17sA4J5yyinobe7gwYOZfGzVqlXu5z//efdf//Vf3UsvvdQ98cQTA/9OJSdRzQ+9dtHT0xM5n4yay3pgxvOwvpi+p9Mh7HiLnc8XI0rbxvIvrrqJ2n+jotuvirFt2zYXANxnnnmm4Hfjxo0jmycIAjPWR8H0fYugT9i4aG
reM+77JCxU5jkwoYp/g4OD7siRI9158+YV/O7JJ590AcD90Ic+lDlmw9ykB/ZYSJ1DUPaJOOcJos7Lfvaznw0dU6jnQFXajkosDNPftm7ditrHSt0Hf/WrX/W119PT41ZUVLjV1dWBfnC6F8AizhhPmdfqEsd8aVi45KSmc3NbQV804rrvNtC77rrL9/gxxxzjHjx4MHP8xRdfdGtqatw5c+a4AOD+9Kc/zfm7gYEBd8yYMVqV+KUvfYlMuxjf//73M4PH22+/7U6fPt1tbW11n3vuORJ7HgcPHnQPHTpUcPyf/umfXABwf/e736HY8ep0zpw5OXX64x//2AUAt6qqKicIHDx4MDBBKEaURSNB7eycc87x/TvKMgvqE1hEGbA+9alPuQDg9vf35xy//vrrXQBw/+mf/inneNRyDSKovNetW+cCgDt58uSMnU9+8pMuALiVlZU5dj784Q+7AEAa4HXqjiLutLa2uqtXr1b+u2JgtU/dRSMUcUS1nYf1FasfeG17x44dOcc3b97sVlZWui+99JKSnh+qZfDMM8+4AOBu27atpHZ+2zE5zlNhKn/IJo5cwqu7sWPHunv37s0c37t3rzt27Fi3tbXVffvtt0NpUZaZ5+fkyZPd/fv3R9YJa4dq3M6GIq4LvIgzjmCPUx5x3VtkgxmXihElf1GtU2+8/z//5//kHI+ay2bHKL869/yrqqrKqXOsNvfZz37WBQD3k5/8ZMHvjhw5Evh3KjmJan741a9+1QUAt6GhIXI+qTsJiBnPw/pi+p5OlzDjLXY+X4wobRvLv7jqJmr/jUo5LRrBjvWqcLhvEfQJExdNznvGeZ+Eico8BxaU8e+jH/2o71ySV5cPPfRQ5pgtc5Ouiz8WxpFDUPWJOOcJos7LHnvssUoxhXIOVKXtqMbCUv3tf/7P/4nax0rdB3svB+Tbu+aaa1wAcGfOnBnoB7d7ASziivE2LBqhni8NA5ec1HRubivon6cJw+WXXw41NTWZn7u6umDy5MnwxBNPwJQpU2DevHmwe/fuzH+HDh2ClStXwr333gsHDhyIZPNb3/oWmXYxzjjjDLjsssugv78fVqxYATt27ICvfe1rMHbsWHRb2dTU1EB1dTUADG1ltWfPHti9ezesWLECAAB+97vfodq77LLLcup06dKlAACwYMEC6Ovry/Hr2GOPhaeeegrVvh9B7SzIdtxlZpof/vCH0NHRAZdeemnO8fe///3Q0dEBP/zhD33/TrVcgwgq72nTpgEAwNy5czN2fvSjH8GIESNgypQpOXY+/vGPK9mMG4q4k0qlYMeOHfDnP/+ZyGtzUMSRqO28FFj9YMuWLeA4Dtx0002ZY/v374dvf/vbsHr1aujs7IzkXzZUZVAME+M8Fabyh2zizCUuu+wySKVSmZ9TqRR84AMfgD179oTe2jGOMrvssssS803OJMd1YQgTcQRrnPIwmSdjxCVsVOvUy2UvuOCCHB3MXDa7zj3/Jk6cCI899hh6m7v11luhtbXV9xNpFRXBUwxRcpKw+WFVVRUAADiOY+y+1EQ8N31PFwdx5rJR2ja2f9R1E7X/CqWJI9YXg8N9ixAvMu9Zvlx44YUAAPDNb34zc8x1XfjWt74FM2bMgLlz52aO2zg3iTUWmpgPw8JEXqk6L7tz506lmBLHHGiYtqMaC0v1t7vvvptk/A26Dz5w4ACMGTOmwN5tt92WaTcvvPCCrx+HDh0KXU6CnXCYL+WSk5rOzW2lyoTR8ePHFxxrb2+HHTt2wOOPPw4dHR2Bf7t7924YM2aMss3HHnsMDhw4QKJdii996Uvw85//HO6//37YsmULnHHGGeg2/Ni+fTvceOONsGPHDjh69GjO7/bs2YNqK79OW1tbAQCgp6en4NzW1laSbw+X8glgqJ0999xzgX8TZ5mZ5plnnoG+vr7MBKtHVVUVTJ48GR5++GHfv4tSrkEUK+/a2trMv//617/C/PnzobKyMsfOqFGjoKWlRdluXFDEna985Stw/vnnw8yZM2H8+PFw4oknwrp162DdunXWT/JRxJGo7VzVV4Bo/aCnpwdWrFgBt9xyC1x33XVQXV0N3/nOd+DNN9+EzZs3R/ItH6oyKIaJcZ4Kk/lDNnHlElOnTi045i3m++tf/xpKI44ymzx5cuS/5UaS47owhIk4gpmveZjKkzHiEjaqdZqdy2aDmctm17nnn4efnzpt7qmnnoJjjjkG6urqlP4uSk6ikh8CDE08hz0f+77URDzncE9HTZy5bJS2je0fdd1E7b9CaeKI9cXgct8ixIfMe5Yv3sKQW2+9Fa655hqoqKiAX//61/Dss8/C5z73uZxzbZybxBoLTcyHYWEir4wyL6sSU+KYAw3bdlT8LtXftm3bRjL+FrsPfuGFF4raC3rJ7K233gIAu+4FBDU4zJdyyUlN5+a2YmTRSH4lZTNz5kz40pe+FPj7Yg2tGK7rkmmX4tFHH4Xnn38eAAD+/Oc/w+DgYEGygs2XvvQl+MhHPgKrVq2CD33oQ9DZ2Qk1NTXw0ksvwUUXXVQwGOoSVKfF6pqaINuu6/oej7vMbEW1XIMIKu/bbrstZ8Wxrh2TUMSdU089FZ599ln4z//8T7jnnnvgzjvvhJtuugmWLl0Kd955Z84qYdvgGEeCwOoHAACXXnopbNiwAX7yk5/AmWeeCTfddBOMHDkS1q5dq+umMUyM81SYzB+yMZFLRCWOMjO9ah6TJMd1YQgTcQRznAKQPDkfLmNDNtl17vnnOA688sorcOuttxacz228DQIrP4wjn7QpnmPHCAEPqZvwOI4T+LvBwcEYPeEBx7FJoEXmPcubCy64AP77f//v8Ktf/QpWrFgB3/zmN6GyshLOO++8nPNsnJuUsdBMXhkl7966datSTKGeAw3TdqLEwmL97Yorroh9/O3s7IRvfOMbOcfWrl0LXV1d8IEPfABmzJjh20aeeeYZAJA+lmQ4zJdKTmo3JE8bit24FaOqqgp27doFy5cvR18xOWnSJDLtYuzbtw/OPfdcSKfT8MEPfhD+8R//EbZt2wZXX301qd1bbrkFuru74Y477si53p/+9Kekdm2Gssyi9glKxo8fD0888UTBg8fBwUF48sknfVecYhJU3nfccYevr0899RT09vbmHH/55Zdh7969pH7q1B1V3Glra4PzzjsPzjvvPHBdFz7xiU/A5z73Ofjxj38MGzZsUNbj2D6xMN3Ow3DqqafC8OHD4aabboIZM2bAfffdBx//+MfRFgRQloFq26Ec56kwlT9kE2cu8dhjj8Gpp56ac+wvf/kLAPi/ieAHhzLTJe64iB3XBV4koU+YvLfAiEvYqNapl8seOXIkZ4Iuai5bKkZ5/k2ePBkGBgYyWy1jMXnyZHj88cfh4MGDObsDlsKGvEyXuOO57WUaZryN8xqjtG3b6iBq/+VIW1sbAAC88cYb0N3dnTn+zjvvwMsvvwwTJ06M1R/sWK9KEvINgfY+BCOfS/L8kW1s3LgRPvaxj8E3v/lNWLx4MXzve9+DlStXwqhRo3LOs2VukoI4xmjKP
mFDGavGFOo50DBEiYXF+htVHyt2H3zgwIECe729vbBr1y746Ec/GujHzTffjOafIATBJSc1nZvbCkmNDRs2DACGbtxUaGxshFdeeSVwBdKrr74a2acLLriATLsYl156KTz33HPwrW99Cz71qU/B+vXr4brrroO77rqLxJ5HZWUlOI6TszpwcHAQrrvuOlK7NkNZZlH7BCWnnXYa7Nq1C772ta/lHP/qV78Ku3btgtNPP53UflB533777QXnnnrqqfDqq68W9NPPfvazpD4C6NUddtw5cuRIwYDmOA7MmTMnso8APNsnFqbbeRiqq6vhoosugp/97Gdw1VVXAQDApk2b0PQpy0C17VCO81SYyh+yiTOXuOGGG2BgYCDz88DAANx4443Q0tICy5YtC6XBocx0iSsuUsV1gRdJ6BMm7y0w4hI2qnXq5bLZ38EGiJ7LlopRnn8vvvhiKP9Ued/73gd79uyBz3zmMwW/K/Z2mg15WVRMxXPbyzTMeBvnNUZp27bVQdT+yxFv++0777wz5/iXv/xlIzsmYMd6VZKQbwi09yEY+VyS549so6OjA1avXg0/+MEP4NZbb4V9+/bBhRdeWHCeLXOTFMQxRlP0CZvKWDWmUM+BhiFKLCzW36jG36D74Lq6OtizZ0+BPc8Pr1yx/BAEVbjkpKZzcxUOHz4Mjz/+eGaXcZOQLOGbP38+VFRUwNVXXw179uyBxsbGwG8OZ9Pc3AzHHnssfOxjH4Nf/epXsHz5cmhubobnn38efvnLX0JdXV3kByQf/vCH4Re/+AWJdhA33XQTfPvb34ZPfepTsHz5cgAYSkoefPBBOO+88+C//uu/oL29HdWmx/r16+GTn/wkrF69Gs444wzYt28f/Pu//ztUV1eT2EsClGUW1CcWLFiA4Hk0/uEf/gG++93vwtatW+Hhhx+GOXPmwB//+Ee46aaboLe3F/7hH/6B1H5QeR88eNDX13//93+HJ554ApqamuCGG26Au+++Gx544AFIp9OkfurUHXbcefPNN2HUqFHw3ve+F+bMmQPDhw+HZ555Bm644QZobW2FdevWxX6N3DHdzsOyZcsW+PznPw//9//+X1i2bBlMmjQJTZuyDPLbzh/+8Iei51OO81SYyB+yiTuXSKfTsGDBArj44osBAODrX/86PP/88/C1r30t9BaHpssMg7jiIlVcF3iRhD5h8t4CIy5ho1qnXi67ZcsWeOihh2D69OlauWx2jJo9ezYADL11dsIJJ+T49/Of/xzq6urgC1/4Amqb+/CHPwz/8R//AZ/5zGfgwQcfhFWrVkFdXR3s2LEDnnjiiYIHuNnlYENeFgVT8dz2Mg0z3sZ5jVHatm11ELX/cmTFihXQ29sLV1xxBbz++uvQ09MD9957L/z2t78lnyfwAzvWq5KEfEOgvQ/ByOeSPH9kIxdeeCH85Cc/gY985COQSqXgtNNOKzjHlrlJCuIYoyn6hE1l/MwzzyjHFMo50DBEjYVB/Y1q/A26D77xxhvhBz/4QYG9119/HVKpFHz605+GBx980NcPv4VlgoANl5zUdG6uwksvvQRTp06FZcuWwd13323WGZeIm2++2Z06dapbXV3tAoB74YUXul//+tddAHDvuuuugvOXLVvmjhs3zj18+LD7L//yL25fX5/b0NDgNjQ0uBMnTnQ3btzo/uxnP9PyiVI7n8cee8xtaGhwjzvuOPfw4cM5v7v//vvdqqoqd926dag2sxkcHHSvueYad8KECW5NTY07duxY92Mf+5j7l7/8xQUAd9u2bSh2itWpV+/5XHjhha5q07vrrrtcAHC//vWva/nktTM/qMvMr09goVI+2bz22mvuZZdd5nZ1dblVVVVuV1eXe/nll7u7du0qODdquQYRVN5XX321b/k899xzbjqddh3HcZuamtxTTjnFffrpp91x48a5y5YtU7Ktik7dYcadgwcPup/4xCfc+fPnu21tbW5NTY07btw49+KLL3affPJJxavKBaN9Rm2H1HFEpZ3r+BqlH2SzfPlyFwDcb37zm5E1glApg2eeeUYp5uW3HZPjPBWmfI4zl/Da9i9+8Qv3iiuucMeMGePW1NS4M2bMcG+99VZlPaoyK9YHsaEctz0o47rAi7jiCNU4Fde9RTbYcSmIqPmLap0+99xz7plnnuk2NTWh5LJejKqsrHQBwD355JML/PPqi6LNHThwwP3MZz7jTps2za2trXVTqZTb19fn9vf3F/27sDmJan7one+XH4bNJ6O2BdfFj+cqvpi8p8MgzHiLmc+XIkrbxvAvzrqJ2n+joNOvwvDEE0+4J598sltfX++mUil3w4YN7osvvhjLPIEf2LFeFRvvtYRCSsVF0/OecdwnYaM6z4EBdfxz3aH8o62tzQUAd/PmzYHn2TI3STEWxpFDYPeJuOcJdOZlo8YU7DlQlbYTNRYW62+YfSzMfXCQvXPOOce97LLLAv3geC+AQVwxPo64HpU450vDwCUnNZ2bh8Vrwxx8clzXsv0nBUEQBCGBrFmzBh544AH429/+BvX19abdEcqMm2++GS6++GK46667Mm+qC4IgCIIgCIIgCIIgCAImMgcqCILAkwrTDgiCIAhCufP000/Dz372MzjvvPPkZkkQBEEQBEEQBEEQBEEQhMQhc6CCIAh8Cb1o5A9/+AM4jhPpvyjEbU8QBD1M91nT9gUhCr/73e/g5ptvhrPOOgtqamrgIx/5SOC5cbVxjn2Jo0+CIIRD+q8gBCP32IItcG873P0TBCE60r+Tg9SleaQOkg33+i01B8rdf1WSdj1+lMM1CkK5ITuNCIIgCIIhbrjhBrjkkktg3759cOutt0J3d7dplwRBEARBEARBEARBEARBENCQOVBBEAT+OK7rumFOfOedd+DZZ5+NZGTKlCnKfxO3PUEQ9DDdZ03bFwRq4mrjHPsSR58EQQiH9F9BCEbusQVb4N52uPsnCEJ0pH8nB6lL80gdJBvb69d2//NJ2vX4UQ7XKAjlRuhFI4IgCIIgCIIgCIIgCIIgCIIgCIIgCIIgCEJykM/TCIIgCIIgCIIgCIIgCIIgCIIgCIIgCIIglCGyaEQQBEEQBEEQBEEQBEEQBEEQBEEQBEEQBKEMkUUjgiAIgiAIgiAIgiAIgiAIgiAIgiAIgiAIZYgsGhEEQRAEQRAEQRAEQRAEQRAEQRAEQRAEQShDZNGIIAiCIAiCIAiCIAiCIAiCIAiCIAiCIAhCGSKLRgRBEARBEARBEARBEARBEARBEARBEARBEMoQWTQiCIIgCIIgCIIgCIIgCIIgCIIgCIIgCIJQhsiiEUEQBEEQBEEQBEEQBEEQBEEQBEEQBEEQhDLk/wfSqHpwaeiyRAAAAABJRU5ErkJggg==\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## HF transformers:" + ], + "metadata": { + "id": "eQh3GBo_hb5g" + } + }, + { + 
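 + { + "cell_type": "code", + "source": [ + "# Editor's sketch, not part of the original notebook: a minimal example of the\n", + "# Transformers usage that the README linked in the next cell documents.\n", + "# Assumes transformers>=4.37 (which ships SeamlessM4Tv2Model) and follows the\n", + "# facebook/seamless-m4t-v2-large model card; double-check against the README.\n", + "from transformers import AutoProcessor, SeamlessM4Tv2Model\n", + "\n", + "processor = AutoProcessor.from_pretrained(\"facebook/seamless-m4t-v2-large\")\n", + "model = SeamlessM4Tv2Model.from_pretrained(\"facebook/seamless-m4t-v2-large\")\n", + "\n", + "# English text in, Spanish speech out (a waveform array at the model's sampling rate).\n", + "text_inputs = processor(text=\"Hello, my dog is cute\", src_lang=\"eng\", return_tensors=\"pt\")\n", + "audio_array = model.generate(**text_inputs, tgt_lang=\"spa\")[0].cpu().numpy().squeeze()" + ], + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { +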
"cell_type": "code", + "source": [ + "# Refer to README: https://github.com/facebookresearch/seamless_communication/tree/main/docs/m4t#transformers-usage\n", + "# HF space: https://huggingface.co/spaces/facebook/seamless-m4t-v2-large" + ], + "metadata": { + "id": "6jSyZHFihel5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## m4t_evaluate" + ], + "metadata": { + "id": "10CG60YSw4QB" + } + }, + { + "cell_type": "code", + "source": [ + "# Refer to README: https://github.com/facebookresearch/seamless_communication/tree/main/src/seamless_communication/cli/m4t/evaluate" + ], + "metadata": { + "id": "oQ5GuaQ7w7K8" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GIPdJ3x9tstZ" + }, + "source": [ + "# SeamlessExpressive Inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "l76dn3mRtwxK" + }, + "outputs": [], + "source": [ + "# Please follow instructions to download SeamlessExpressive here: https://ai.meta.com/resources/models-and-libraries/seamless-downloads/\n", + "\n", + "!wget \"\" -O /content/SeamlessExpressive.tar.gz\n", + "\n", + "!tar -xzvf /content/SeamlessExpressive.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wqkH-Js91cLX", + "outputId": "09919807-5a69-4639-cd7d-5110f6f5f023" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-12-14 21:55:03-- https://dl.fbaipublicfiles.com/seamless/data/samples/expressivity_data.tar.gz\n", + "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.162.163.51, 3.162.163.11, 3.162.163.34, ...\n", + "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.162.163.51|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 571233 (558K) [application/x-tar]\n", + "Saving to: ‘/content/expressivity_data.tar.gz’\n", + "\n", + "/content/expressivi 100%[===================>] 557.84K 2.53MB/s in 0.2s \n", + "\n", + "2023-12-14 21:55:03 (2.53 MB/s) - ‘/content/expressivity_data.tar.gz’ saved [571233/571233]\n", + "\n", + "./\n", + "./ex01_whisper_00367.wav\n", + "./ex01_confused_00367.wav\n", + "./ex01_enunciated_00367.wav\n", + "./ex01_happy_00367.wav\n", + "./ex01_sad_00367.wav\n", + "./ex01_laughing_00367.wav\n", + "./ex01_default_00367.wav\n" + ] + } + ], + "source": [ + "!wget https://dl.fbaipublicfiles.com/seamless/data/samples/expressivity_data.tar.gz -O /content/expressivity_data.tar.gz\n", + "!tar -xzvf /content/expressivity_data.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L1o7JgU2xiHV", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "3b51ce48-e7ca-4f6c-f8db-5d19dfa1e10f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "English default audio:\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-12-14 21:55:07,810 INFO -- seamless_communication.cli.expressivity.predict.predict: Running inference on device=device(type='cuda', index=0) with dtype=torch.float16.\n", + "Downloading the tokenizer of seamless_expressivity...\n", + "100% 360k/360k [00:00<00:00, 10.3MB/s]\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n", + "2023-12-14 21:55:20,569 INFO -- seamless_communication.cli.expressivity.predict.predict: text_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(1, 200), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:55:20,569 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(25, 50), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:55:20,569 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_ngram_filtering=False\n", + "2023-12-14 21:55:23,534 INFO -- seamless_communication.cli.expressivity.predict.predict: Saving expressive translated audio in spa\n", + "2023-12-14 21:55:23,538 INFO -- seamless_communication.cli.expressivity.predict.predict: Translated text in spa: Entonces, ¿qué estaba realmente haciendo?\n", + "\n", + "Translated default audio in spa:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "English whisper audio:\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + 
"output_type": "stream", + "name": "stdout", + "text": [ + "2023-12-14 21:55:28,309 INFO -- seamless_communication.cli.expressivity.predict.predict: Running inference on device=device(type='cuda', index=0) with dtype=torch.float16.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n", + "2023-12-14 21:55:54,236 INFO -- seamless_communication.cli.expressivity.predict.predict: text_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(1, 200), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:55:54,236 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(25, 50), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:55:54,236 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_ngram_filtering=False\n", + "2023-12-14 21:55:57,161 INFO -- seamless_communication.cli.expressivity.predict.predict: Saving expressive translated audio in spa\n", + "2023-12-14 21:55:57,166 INFO -- seamless_communication.cli.expressivity.predict.predict: Translated text in spa: Entonces, ¿qué estaba haciendo realmente?\n", + "\n", + "Translated whisper audio in spa:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "English confused audio:\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-12-14 21:56:01,903 INFO -- seamless_communication.cli.expressivity.predict.predict: Running inference on device=device(type='cuda', index=0) with dtype=torch.float16.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. 
Set `force` to `True` to download again.\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n", + "2023-12-14 21:56:10,642 INFO -- seamless_communication.cli.expressivity.predict.predict: text_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(1, 200), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:56:10,642 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(25, 50), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:56:10,643 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_ngram_filtering=False\n", + "2023-12-14 21:56:12,360 INFO -- seamless_communication.cli.expressivity.predict.predict: Saving expressive translated audio in spa\n", + "2023-12-14 21:56:12,365 INFO -- seamless_communication.cli.expressivity.predict.predict: Translated text in spa: Entonces, ¿qué estaba haciendo en realidad?\n", + "\n", + "Translated confused audio in spa:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "English enunciated audio:\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-12-14 21:56:15,985 INFO -- seamless_communication.cli.expressivity.predict.predict: Running inference on device=device(type='cuda', index=0) with dtype=torch.float16.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. 
Set `force` to `True` to download again.\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n", + "2023-12-14 21:56:23,186 INFO -- seamless_communication.cli.expressivity.predict.predict: text_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(1, 200), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:56:23,186 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(25, 50), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:56:23,186 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_ngram_filtering=False\n", + "2023-12-14 21:56:24,911 INFO -- seamless_communication.cli.expressivity.predict.predict: Saving expressive translated audio in spa\n", + "2023-12-14 21:56:24,916 INFO -- seamless_communication.cli.expressivity.predict.predict: Translated text in spa: Entonces, ¿qué estaba haciendo en realidad?\n", + "\n", + "Translated enunciated audio in spa:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "English happy audio:\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-12-14 21:56:27,815 INFO -- seamless_communication.cli.expressivity.predict.predict: Running inference on device=device(type='cuda', index=0) with dtype=torch.float16.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. 
Set `force` to `True` to download again.\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n", + "2023-12-14 21:56:34,259 INFO -- seamless_communication.cli.expressivity.predict.predict: text_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(1, 200), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:56:34,259 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(25, 50), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:56:34,259 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_ngram_filtering=False\n", + "2023-12-14 21:56:36,072 INFO -- seamless_communication.cli.expressivity.predict.predict: Saving expressive translated audio in spa\n", + "2023-12-14 21:56:36,077 INFO -- seamless_communication.cli.expressivity.predict.predict: Translated text in spa: Entonces, ¿qué estaba haciendo en realidad?\n", + "\n", + "Translated happy audio in spa:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "English sad audio:\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-12-14 21:56:38,964 INFO -- seamless_communication.cli.expressivity.predict.predict: Running inference on device=device(type='cuda', index=0) with dtype=torch.float16.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. 
Set `force` to `True` to download again.\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n", + "2023-12-14 21:56:45,496 INFO -- seamless_communication.cli.expressivity.predict.predict: text_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(1, 200), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:56:45,496 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(25, 50), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:56:45,496 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_ngram_filtering=False\n", + "2023-12-14 21:56:47,359 INFO -- seamless_communication.cli.expressivity.predict.predict: Saving expressive translated audio in spa\n", + "2023-12-14 21:56:47,362 INFO -- seamless_communication.cli.expressivity.predict.predict: Translated text in spa: Entonces, ¿qué estaba haciendo realmente?\n", + "\n", + "Translated sad audio in spa:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "English laughing audio:\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-12-14 21:56:50,183 INFO -- seamless_communication.cli.expressivity.predict.predict: Running inference on device=device(type='cuda', index=0) with dtype=torch.float16.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamless_expressivity. 
Set `force` to `True` to download again.\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n", + "2023-12-14 21:56:56,380 INFO -- seamless_communication.cli.expressivity.predict.predict: text_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(1, 200), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:56:56,381 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_opts=SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(25, 50), hard_max_seq_len=1024, step_processor=None, unk_penalty=0.0, len_penalty=1.0)\n", + "2023-12-14 21:56:56,381 INFO -- seamless_communication.cli.expressivity.predict.predict: unit_generation_ngram_filtering=False\n", + "2023-12-14 21:56:58,865 INFO -- seamless_communication.cli.expressivity.predict.predict: Saving expressive translated audio in spa\n", + "2023-12-14 21:56:58,869 INFO -- seamless_communication.cli.expressivity.predict.predict: Translated text in spa: ¿Entonces qué estaba haciendo realmente?\n", + "\n", + "Translated laughing audio in spa:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + } + ], + "source": [ + "expressions = (\"default\", \"whisper\", \"confused\", \"enunciated\", \"happy\", \"sad\", \"laughing\")\n", + "\n", + "for expression in expressions:\n", + " print(f\"English {expression} audio:\")\n", + " print()\n", + "\n", + " in_file = f\"ex01_{expression}_00367.wav\"\n", + "\n", + " audio_play = Audio(in_file, rate=16000, autoplay=False, normalize=True)\n", + " display(audio_play)\n", + "\n", + " out_file = f\"spa_{expression}.wav\"\n", + "\n", + " !expressivity_predict {in_file} --tgt_lang spa \\\n", + " --model_name seamless_expressivity --vocoder_name vocoder_pretssel \\\n", + " --gated-model-dir SeamlessExpressive --output_path {out_file}\n", + "\n", + " print()\n", + " print(f\"Translated {expression} audio in spa:\")\n", + "\n", + " audio_play = Audio(out_file, rate=16000, autoplay=False, normalize=True)\n", + " display(audio_play)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Automatic Expressive Evaluation:" + ], + "metadata": { + "id": "-qo85CRkgVSW" + } + }, + { + "cell_type": "code", + "source": [ + "# Refer to README: https://github.com/facebookresearch/seamless_communication/blob/main/docs/expressive/README.md#automatic-evaluation\n", + "\n", + "# AutoPCP: https://github.com/facebookresearch/stopes/tree/main/stopes/eval/auto_pcp\n", + "\n", + "# VSim: https://github.com/facebookresearch/stopes/tree/main/stopes/eval/vocal_style_similarity\n", + "\n", + "# expressivity_evaluate: https://github.com/facebookresearch/seamless_communication#seamlessexpressive-evaluation\n", + "\n", + "# HF space: https://huggingface.co/spaces/facebook/seamless-expressive" + ], + "metadata": { + "id": "gGg6R8zogfn1" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4PNlRLsloKWo" + }, + "source": [ + "# Streaming Standalone Inference\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Utility classes + functions" + ], + "metadata": { + "id": "dvM68NSZGK8o" + } + }, + { + "cell_type": "code", + 
"source": [ + "# Download an the LJ speech dataset sample if you didn't already run it above\n", + "# %%capture\n", + "!wget https://dl.fbaipublicfiles.com/seamlessM4T/LJ037-0171_sr16k.wav -O /content/LJ_eng.wav" + ], + "metadata": { + "id": "ihWc_q0lGcnl", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "05696d30-a68b-494f-e2c8-146b00673aa8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-12-13 06:11:00-- https://dl.fbaipublicfiles.com/seamlessM4T/LJ037-0171_sr16k.wav\n", + "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.51, 3.163.189.108, 3.163.189.96, ...\n", + "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.51|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 485430 (474K) [audio/x-wav]\n", + "Saving to: ‘/content/LJ_eng.wav’\n", + "\n", + "\r/content/LJ_eng.wav 0%[ ] 0 --.-KB/s \r/content/LJ_eng.wav 100%[===================>] 474.05K --.-KB/s in 0.04s \n", + "\n", + "2023-12-13 06:11:00 (13.0 MB/s) - ‘/content/LJ_eng.wav’ saved [485430/485430]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R5PPqT9boJ9e" + }, + "outputs": [], + "source": [ + "import math\n", + "from simuleval.data.segments import SpeechSegment, EmptySegment\n", + "from seamless_communication.streaming.agents.seamless_streaming_s2st import (\n", + " SeamlessStreamingS2STVADAgent,\n", + ")\n", + "\n", + "from simuleval.utils.arguments import cli_argument_list\n", + "from simuleval import options\n", + "\n", + "\n", + "from typing import Union, List\n", + "from simuleval.data.segments import Segment, TextSegment\n", + "from simuleval.agents.pipeline import TreeAgentPipeline\n", + "from simuleval.agents.states import AgentStates\n", + "\n", + "\n", + "SAMPLE_RATE = 16000\n", + "\n", + "\n", + "class AudioFrontEnd:\n", + " def __init__(self, wav_file, segment_size) -> None:\n", + " self.samples, self.sample_rate = soundfile.read(wav_file)\n", + " assert self.sample_rate == SAMPLE_RATE\n", + " # print(len(self.samples), self.samples[:100])\n", + " self.samples = self.samples # .tolist()\n", + " self.segment_size = segment_size\n", + " self.step = 0\n", + "\n", + " def send_segment(self):\n", + " \"\"\"\n", + " This is the front-end logic in simuleval instance.py\n", + " \"\"\"\n", + "\n", + " num_samples = math.ceil(self.segment_size / 1000 * self.sample_rate)\n", + "\n", + " if self.step < len(self.samples):\n", + " if self.step + num_samples >= len(self.samples):\n", + " samples = self.samples[self.step :]\n", + " is_finished = True\n", + " else:\n", + " samples = self.samples[self.step : self.step + num_samples]\n", + " is_finished = False\n", + " self.step = min(self.step + num_samples, len(self.samples))\n", + "\n", + " segment = SpeechSegment(\n", + " content=samples,\n", + " sample_rate=self.sample_rate,\n", + " finished=is_finished,\n", + " )\n", + " else:\n", + " # Finish reading this audio\n", + " segment = EmptySegment(\n", + " finished=True,\n", + " )\n", + " return segment\n", + "\n", + "\n", + "class OutputSegments:\n", + " def __init__(self, segments: Union[List[Segment], Segment]):\n", + " if isinstance(segments, Segment):\n", + " segments = [segments]\n", + " self.segments: List[Segment] = [s for s in segments]\n", + "\n", + " @property\n", + " def is_empty(self):\n", + " return all(segment.is_empty for segment in self.segments)\n", + "\n", + " @property\n", + " def 
finished(self):\n", + " return all(segment.finished for segment in self.segments)\n", + "\n", + "\n", + "def get_audiosegment(samples, sr):\n", + " b = io.BytesIO()\n", + " soundfile.write(b, samples, samplerate=sr, format=\"wav\")\n", + " b.seek(0)\n", + " return AudioSegment.from_file(b)\n", + "\n", + "\n", + "def reset_states(system, states):\n", + " if isinstance(system, TreeAgentPipeline):\n", + " states_iter = states.values()\n", + " else:\n", + " states_iter = states\n", + " for state in states_iter:\n", + " state.reset()\n", + "\n", + "\n", + "def get_states_root(system, states) -> AgentStates:\n", + " if isinstance(system, TreeAgentPipeline):\n", + " # self.states is a dict\n", + " return states[system.source_module]\n", + " else:\n", + " # self.states is a list\n", + " return system.states[0]\n", + "\n", + "\n", + "def plot_s2st(source_file, target_samples, target_fs, intervals, delays, prediction_lists):\n", + " mpl.rcParams[\"axes.spines.left\"] = False\n", + " mpl.rcParams[\"axes.spines.right\"] = False\n", + " mpl.rcParams[\"axes.spines.top\"] = False\n", + " mpl.rcParams[\"axes.spines.bottom\"] = False\n", + "\n", + " source_samples, source_fs = soundfile.read(source_file)\n", + "\n", + " _, axes = plt.subplots(5, sharex=True, figsize=(25, 5))\n", + " for ax in axes:\n", + " ax.set_yticks([])\n", + "\n", + " axes[0].plot(\n", + " numpy.linspace(0, len(source_samples) / source_fs, len(source_samples)),\n", + " source_samples,\n", + " )\n", + "\n", + " axes[1].plot(\n", + " numpy.linspace(0, len(target_samples) / target_fs, len(target_samples)),\n", + " target_samples,\n", + " )\n", + "\n", + " start = 0\n", + " for seg_index in range(len(intervals)):\n", + " start, duration = intervals[seg_index]\n", + " offset = delays[\"s2st\"][seg_index]\n", + "\n", + " samples = target_samples[\n", + " int((start) / 1000 * target_fs) : int(\n", + " (start + duration) / 1000 * target_fs\n", + " )\n", + " ]\n", + "\n", + " # Uncomment this if you want to see the segments without speech playback delay\n", + " axes[2].plot(\n", + " offset / 1000 + numpy.linspace(0, len(samples) / target_fs, len(samples)),\n", + " -seg_index * 0.05 + numpy.array(samples),\n", + " )\n", + " axes[4].plot(\n", + " start / 1000 + numpy.linspace(0, len(samples) / target_fs, len(samples)),\n", + " numpy.array(samples),\n", + " )\n", + "\n", + " from pydub import AudioSegment\n", + " print(\"Output translation (without input)\")\n", + " display(Audio(target_samples, rate=target_fs))\n", + " print(\"Output translation (overlay with input)\")\n", + " source_seg = get_audiosegment(source_samples, source_fs) + AudioSegment.silent(duration=3000)\n", + " target_seg = get_audiosegment(target_samples, target_fs)\n", + " output_seg = source_seg.overlay(target_seg)\n", + " display(output_seg)\n", + "\n", + " delay_token = defaultdict(list)\n", + " d = delays[\"s2tt\"][0]\n", + " for token, delay in zip(prediction_lists[\"s2tt\"], delays[\"s2tt\"]):\n", + " if delay != d:\n", + " d = delay\n", + " delay_token[d].append(token)\n", + " for key, value in delay_token.items():\n", + " axes[3].text(\n", + " key / 1000, 0.2, \" \".join(value), rotation=40\n", + " )\n", + "\n", + "def build_streaming_system(model_configs, agent_class):\n", + " parser = options.general_parser()\n", + " parser.add_argument(\"-f\", \"--f\", help=\"a dummy argument to fool ipython\", default=\"1\")\n", + "\n", + " agent_class.add_args(parser)\n", + " args, _ = parser.parse_known_args(cli_argument_list(model_configs))\n", + " system = 
agent_class.from_args(args)\n", + " return system\n", + "\n", + "\n", + "def run_streaming_inference(system, audio_frontend, system_states, tgt_lang):\n", + " # NOTE: Here for visualization, we calculate delays offset from audio\n", + " # *BEFORE* VAD segmentation.\n", + " # In contrast for SimulEval evaluation, we assume audios are pre-segmented,\n", + " # and Average Lagging, End Offset metrics are based on those pre-segmented audios.\n", + " # Thus, delays here are *NOT* comparable to SimulEval per-segment delays\n", + " delays = {\"s2st\": [], \"s2tt\": []}\n", + " prediction_lists = {\"s2st\": [], \"s2tt\": []}\n", + " speech_durations = []\n", + " curr_delay = 0\n", + " target_sample_rate = None\n", + "\n", + " while True:\n", + " input_segment = audio_frontend.send_segment()\n", + " input_segment.tgt_lang = tgt_lang\n", + " curr_delay += len(input_segment.content) / SAMPLE_RATE * 1000\n", + " if input_segment.finished:\n", + " # a hack, we expect a real stream to end with silence\n", + " get_states_root(system, system_states).source_finished = True\n", + " # Translation happens here\n", + " output_segments = OutputSegments(system.pushpop(input_segment, system_states))\n", + " if not output_segments.is_empty:\n", + " for segment in output_segments.segments:\n", + " # NOTE: another difference from SimulEval evaluation -\n", + " # delays are accumulated per-token\n", + " if isinstance(segment, SpeechSegment):\n", + " pred_duration = 1000 * len(segment.content) / segment.sample_rate\n", + " speech_durations.append(pred_duration)\n", + " delays[\"s2st\"].append(curr_delay)\n", + " prediction_lists[\"s2st\"].append(segment.content)\n", + " target_sample_rate = segment.sample_rate\n", + " elif isinstance(segment, TextSegment):\n", + " delays[\"s2tt\"].append(curr_delay)\n", + " prediction_lists[\"s2tt\"].append(segment.content)\n", + " print(curr_delay, segment.content)\n", + " if output_segments.finished:\n", + " print(\"End of VAD segment\")\n", + " reset_states(system, system_states)\n", + " if input_segment.finished:\n", + " # an assumption of SimulEval agents -\n", + " # once source_finished=True, generate until output translation is finished\n", + " assert output_segments.finished\n", + " break\n", + " return delays, prediction_lists, speech_durations, target_sample_rate\n", + "\n", + "\n", + "def get_s2st_delayed_targets(delays, target_sample_rate, prediction_lists, speech_durations):\n", + " # get calculate intervals + durations for s2st\n", + " intervals = []\n", + "\n", + " start = prev_end = prediction_offset = delays[\"s2st\"][0]\n", + " target_samples = [0.0] * int(target_sample_rate * prediction_offset / 1000)\n", + "\n", + " for i, delay in enumerate(delays[\"s2st\"]):\n", + " start = max(prev_end, delay)\n", + "\n", + " if start > prev_end:\n", + " # Wait source speech, add discontinuity with silence\n", + " target_samples += [0.0] * int(\n", + " target_sample_rate * (start - prev_end) / 1000\n", + " )\n", + "\n", + " target_samples += prediction_lists[\"s2st\"][i]\n", + " duration = speech_durations[i]\n", + " prev_end = start + duration\n", + " intervals.append([start, duration])\n", + " return target_samples, intervals" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Build SeamlessStreaming S2ST + S2TT agent" + ], + "metadata": { + "id": "wGHmMwIPGWgm" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TZPg2tm3oXGR", + "outputId": 
"6c9b3f55-e50f-46f9-8d39-4e43d3b8251a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "building system from dir\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Downloading the tokenizer of seamless_streaming_unity...\n", + "100%|██████████| 4.93M/4.93M [00:00<00:00, 18.8MB/s]\n", + "Downloading the checkpoint of seamless_streaming_unity...\n", + "100%|██████████| 3.34G/3.34G [00:29<00:00, 122MB/s]\n", + "Downloading the tokenizer of seamlessM4T_v2_large...\n", + "100%|██████████| 360k/360k [00:00<00:00, 13.8MB/s]\n", + "Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.\n", + "Downloading the checkpoint of seamless_streaming_monotonic_decoder...\n", + "100%|██████████| 3.98G/3.98G [00:35<00:00, 121MB/s]\n", + "/usr/local/lib/python3.10/dist-packages/torch/hub.py:294: UserWarning: You are about to download and run code from an untrusted repository. In a future release, this won't be allowed. To add the repository to your trusted list, change the command to {calling_fn}(..., trust_repo=False) and a command prompt will appear asking for an explicit confirmation of trust, or load(..., trust_repo=True), which will assume that the prompt is to be answered with 'yes'. You can also use load(..., trust_repo='check') which will only prompt for confirmation if the repo is not already trusted. This will eventually be the default behaviour\n", + " warnings.warn(\n", + "Downloading: \"https://github.com/snakers4/silero-vad/zipball/master\" to /root/.cache/torch/hub/master.zip\n", + "Downloading the checkpoint of vocoder_v2...\n", + "100%|██████████| 160M/160M [00:01<00:00, 119MB/s]\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "finished building system\n" + ] + } + ], + "source": [ + "from seamless_communication.streaming.agents.seamless_streaming_s2st import (\n", + " SeamlessStreamingS2STJointVADAgent,\n", + ")\n", + "\n", + "\n", + "print(\"building system from dir\")\n", + "\n", + "agent_class = SeamlessStreamingS2STJointVADAgent\n", + "tgt_lang = \"spa\"\n", + "\n", + "model_configs = dict(\n", + " source_segment_size=320,\n", + " device=\"cuda:0\",\n", + " dtype=\"fp16\",\n", + " min_starting_wait_w2vbert=192,\n", + " decision_threshold=0.5,\n", + " min_unit_chunk_size=50,\n", + " no_early_stop=True,\n", + " max_len_a=0,\n", + " max_len_b=100,\n", + " task=\"s2st\",\n", + " tgt_lang=tgt_lang,\n", + " block_ngrams=True,\n", + " detokenize_only=True,\n", + ")\n", + "system = build_streaming_system(model_configs, agent_class)\n", + "print(\"finished building system\")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Initialize states + run inference" + ], + "metadata": { + "id": "rWAgPoUlGaQ0" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "izpe5S-rom8A", + "outputId": "be5433bb-258f-4950-a599-61a73577ab15" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Using cache found in /root/.cache/torch/hub/snakers4_silero-vad_master\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + 
"3200.0 El examen y el testimonio de los expertos\n", + "4160.0 permitieron\n", + "4800.0 a la Comisión\n", + "5120.0 concluir\n", + "7040.0 que\n", + "7360.0 cinco disparos pudieron\n", + "7583.9375 haber sido disparados,\n", + "End of VAD segment\n" + ] + } + ], + "source": [ + "source_segment_size = 320 # milliseconds\n", + "audio_frontend = AudioFrontEnd(\n", + " wav_file=\"/content/LJ_eng.wav\",\n", + " segment_size=source_segment_size,\n", + ")\n", + "\n", + "system_states = system.build_states()\n", + "\n", + "# you can pass tgt_lang at inference time to change the output lang.\n", + "# SeamlessStreaming supports 36 speech output languages, see https://github.com/facebookresearch/seamless_communication/blob/main/docs/m4t/README.md#supported-languages\n", + "# in the Target column for `Sp` outputs.\n", + "delays, prediction_lists, speech_durations, target_sample_rate = run_streaming_inference(\n", + " system, audio_frontend, system_states, tgt_lang\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Visualize streaming outputs" + ], + "metadata": { + "id": "WnHddD4KGgPr" + } + }, + { + "cell_type": "markdown", + "source": [ + "The top row is the input audio, while the later rows are the output audio (in chunks), as well as output text, offset by the corresponding delays." + ], + "metadata": { + "id": "Ac3YKDJwISWJ" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 329 + }, + "id": "x08NFlRzoxdT", + "outputId": "565b921f-1797-44b8-c85a-476d9a1bcc6d" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Output translation (without input)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Output translation (overlay with input)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAB6UAAAGsCAYAAACVaHIlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd2BTZRcG8CdJ994tlEJbVpll772XC1AUF6i4t5/KBtmKAxkqgqKiggriYu8NpewNhRbaQhfdO+v7ozQ0zU5uRsvz+6tN7njbpsm973nPOSKlUqkEERERERERERERERERERGRFYjtPQAiIiIiIiIiIiIiIiIiIqq9GJQmIiIiIiIiIiIiIiIiIiKrYVCaiIiIiIiIiIiIiIiIiIishkFpIiIiIiIiIiIiIiIiIiKyGgaliYiIiIiIiIiIiIiIiIjIahiUJiIiIiIiIiIiIiIiIiIiq2FQmoiIiIiIiIiIiIiIiIiIrIZBaSIiIiIiIiIiIiIiIiIishoGpYmIiIiIiIiIiIiIiIiIyGoYlCYiIiIiIiIiIiIiIiIiIqthUJqIiIiIiIjorlUHE/HaLyeQXyq191CIiIiIiIiIag2RUqlU2nsQRERERERERPb2/YFEzPrvgur7pAXD7TgaIiIiIiIiotqDmdJEREREREREgFpAGgD2Xsm000iIiIiIiIiIahcGpcmmmJhPRERERESOKCWnWOOxZ7+Pw/EbOXYYDREREREREVHtwqA0Wd2+K5n4Iz4ZkRM3ImrSJty8oznZQ0REREREZE9vrT2l9fGPN1+y7UCIiIiIiIiIaiEnew+AajeFQolnvo9Te6zXwt0I9HTBqvEd0bqen30GRkREREREVIWujGg5qz0RERERERERWYyZ0mRVWYVlWh+/U1SOB5cetPFoiIiIiIiINH227bLO547fyIFc4ViB6Yz8UtzOK7H3MGzuTmEZ1h9PwY07RfYeChEREREREZmImdJkVY+vOKL3+bMpeWhVz9dGoyEiIiIiItK0ZFeC3ueTs4sRGeRpo9Hop1Ao0WneTtX3Xz7eBg/G1oVIJFLbZtCifUjIKMT34zqgX0yoPYYqqA/Wncbv8Smq7wc2D8XC0a3h5+Fix1ERERERERGRsZgpTVaTU1SO65n6V7A/sPQA5m26iD2XM5BTVG6jkREREREREVXQVd2pKkfKk07NVc+QfmvtKURN2oRt59NUjw1bvB8JGYUAgOd+iEepVG7TMVpD1YA0AGy/kI42s7YjMasIGQWlKCqT2WlkREREREREZAyRUskGWWQdf8Qn4/11Z4zePtjbFcemDLDiiIiIiIQhVyhxO68E9fw97D0UIiKy0BtrTuLf07f0bjO6fT18+misjUakX6uZW1FQaloA1svVCX+91g2Bnq7w96w5mcXnUvMQ4OmCxKwiPLnyqMHtL80eAjdniQ1GRrWZUqlESk4J6vm7q1UgICIiIiIiy7B8Nwliy7k0fLH9Cr58og1iwnzMOkZmQRmUSiVv+oiIyGH9eCgJucVSnEnJxc5LGfDzcMaJqQMhFvOzi4ioJpLKFQYD0gCw7niKwwSlTQ1IA0BhmQwDPt8HAFg1viP6Ng0ReliCS84uxoglB0za588TqRjbub6VRkT3A6VSiWdXHcO+K5loGe6Df17rwes8IiIiIiKBMChNgnj55+MAgDHLj+D0jEFmHydq0ibse78v3FzECPF2E2p4REREgpjxz3m173OLpdh7JRN9Yxx/cp+IiDTlFNesFkL7r2ZafIzxq47h4MR+CPdzF2BE1nP+Vr7J+0zecBbLdidg7YtdcOhaFoa0qANfD2crjI5qqz2XM7HvSsX/2bnUfERP3oQvH2+DfVeycDWjACk5JcguKoefhzP6NQ3Bcz2i0DLcV+0Y2UXlUCiVCPJytcePQERERETksBiUJkHllUhVX3+995pZx+i1cDcA4NH29fDxqNZclUxERHZXKpVj/qaLWp/LL5VqfbzSR/+ex94rmfj39R7wdOWlFxGRIxGbUKUpv1QKHzf7BTiVSiWe/i5OkGMdSsjCox0iBDmWtZhbQCs1twQ9P6m4p/zvzG2sfr6zgKOi2u5AQpbGY2+tPaXxWG6xFH+eTMWfJ1N1Hmvh6NYO/39GRERERGRLYnsPgGqv65lFFu3/x/EURE/ehNt5JTh+Ixtyxb325+uPp2Db+TRLh0hERGTQqeRcxEzbgh8P39D6vKG2E6sOJuF6ZhFazNiKXZfSrTFEIiIy01YT7ine0RKYsqW/TxkuM26s61mW3avZghBLk/df1QwwEtnK++vOYPflDJSUy+09FCIiIiIih8B0nVomv1QKF4kYbs4Sm53zghll1UzRdf4uAEC4nzs6RvqjuFyObRcqJvWTFgy36rmJyPbS80shFokQ7M1yd2R/pVI5Hl52UO82+ibN/z6lnj3z3A/x/OwiInIgUzacM3rbnZcyrDgSw97+7ZRgx/p6zzX0aBSE7o2CBDum0Awt+jKWUqkU7FhEphq/6hg6RwXgt5e62nsoRERERER2x0zpWuRcah5az9yGmGlbMH+z9hKj1pBeUKrxWHaR8L3ZUnNL8NepW6qANFARLCCi2qOkXI7O83ai49wdaDp1MyInbsSKfdftPaxa7U5hGSInbsQrPx+391Ac0qaztw1uo2+eW1u5x8iJG5GcXQylsqICSEJGATLufpaWyeTIrWH9TYmIqGZ6cuVRTN5w1t7D0EmoMPJT3x0V6EjWdSghC6eTc+09jPvedwcSBT/m0cRs3M4rEfy4REREREQ1DTOla4mEjAKMWHJA9f3yvdeRU1SOT0bHWv3cJ2/majw2+78LVj8vAEz/+5xVf8b8UinOpeShS3Qge1sT2cAvR++VRy6TKQAAczddxJCWYYgI8LDXsGq1x5YfBgBsPpeGy2kFaBrmbecROZbyu69DfURmTJtX9rrs2ThIVVrU3VmCkruLrTa92RPN6/qYfFwiEl5WYRkmrj+LHRfVy+/PeqgFnukaaZ9BkSCSakAJa2v79ehNvNK7oVWvs/48kYLL6QWYOCTGpIxloZKbDybcQXJ2sUNfS36+7TIW70oAAGx4tRva1ve384juTwWlUqsdu+v8XTg1fSD8PFysdg4iIiIiIkfHTOlaoFQqx4DP92k8/nt8CtLzNbOYhbZ451WNxxJtNMHze3yKVY6bVyxFfqkUAz/fi7Erj+LTbZcx4ad4PP3dUWZnC6ygVKrKFqT7V1GZDBdu5WPORu1VHioDeCS8a5n33q9N6atJ9yzZpfk5aKyqvS5Lqny+DFu8H6m5zKghcgQd5uzQCEgDwPS/z9u0OhEJ76XVrBICWH9B8bu/n8byvddx+Nodk/YrErAPb0ZBmWDHEtKdwjJsO5+mCkgDwCNfHcLR66b9rkgYCivflv7vj9PWPQERERERkYNjULoGS80twbAv9yNm2had28isfVdVQ8zdeAF9Fu7Gg0sPYNnuBL1B0Iz8UsTO2obWM7chPb9i8uKrPdew/UI69l/Nwi9Hb9pq2LXeudQ8
0HfRQR6HWXeMLY1aeZpYg/GFnzcl9+/fx8PDg9dffx1PT08qV67M9OnTadeuHW+++Sbnzp2jSJEivPbaa0yfPp0RI0ZQtWpVjEbTc9E8DZTFD0gDfLXmLAv3XAPg2qNwc0AaYN3Je4RExiZoHxeQBvh4xUmrx28wqszdfoX+iw+ZA9IARhXe+uWw1f2IpL275IjtO310ybqANEBMhO3Pn5UYLaT3jx+QBlCtLAGw6xtY2gXm1IOY8PSPLSfa9Q2cWAoPTmf2SIQQFsQaYxMFpAFeWfkKHVZ2MD+eeXQm0YZouq3tRkh0SLrOqQ8M5Gb/AQm2Pfzmm3T1KbIva9dnBS1dRuThjLvWXH1lNeMPjKfz6s6oqkp47LPP9SHbhqT7dS+EEEJkRRKUFkIIIYTIAaJiDXSbty/R9vP3n1BrwhYehEax8fT9RPsnrj+faFtGaDh1W6qPmb3tMpB87ennbTh9n+0XAqg+/j+MxoxLCHTkRqDF7YueBmnTS1VVjt4MYuWxO+mu++vo6Mj169eJiDAF/vR6PXZ2dowYMYJ79+4xZMgQ7ty5Y25/6nYIb/1ymKuPIgiJiEVVVUb8aTmIPHH9eWL0Rjp8l3jCwch/TvEwNIptFx4meg5rTtwF4PLDMMKjLadjB3j4JIrin69nysbzbDn/MNH+bRcCzK8TkXZ7rzy2fafhj6xva21ANrv6sUnKbVQrJ7ScWGb6f+htmFIkrSN6Oez5FgxJv78IITLH1INTk9z3IOIBxx4eS7T9o+0fpeucl5u3IHzPnnT1IXKGxwsWcr5suRRXSccJ/fdfVINtrlMuBF6g48qObLmxBYAzj8+Y9w3YPIA6S+skaD/n+BybnFcIIYTISiQoLYQQQgiRA2w6cx99MkHYA9cCeffXDFgNmYH2XXmMqqrcDEzbKsou8/Zx+Hogj2y8avpOcCSd5yaeAABw8FqQTc7x+T+neG3OXoYuP87msw/S3I+qqpQoUYKmTZsycuRIQkND0el0AOh0OqpUqcK2bdvYvn07ANvOP+SV73fz79kHtJi+g8pjN1P0s/UsP3wryXN8sfI0TywElvddeUyDqdt4c9Eh1p26l2j/kRtBtJi+gzazdlrsNyQilloTtqT4HKdtusDBa5YnCQjr2bSqk9EAh+Zb3/7MP7Y7d1Z034rMANYEpQ2xEHQ93uOYNA8px4pfp/vUH3D058wbixDCot8v/J7s/jc2vJFo24H7B9J1TjUyMl3Hi5zj4bRpqWof9MsSAn9ZYpNzf7zjY66EXGHo9qGJ9h28fzDRttCYUJucVwghhMhKJCgthBBCCJEDXAlIPo3rh8uPJ7nvyA3bBFKTEpeGO7WMqsrqpytq0+LIjSD+98M+aoz/z6YBt46zk15pExxhmyDRsoPPgsDvLDnC2pN30/QcFEXB19eXrl27cufOHYYNG4Zeryc8PJytW7dSq1Yt7t27R69evQB4c/GhVJ8jqYD14/AYYvSmQFv81N5x4lbA3wqMxGhUCQqPYe3Ju0TrDXz29ykqj91s9RiWHriR6nGLhMauPWu7zk78Dqf/sr79xk/h6nbbnT8ruW1l2s+UgtJGA/z8SvrHk5M9OAPLX0+4zZoJAUKILGfuibk260sfEGB5h9FI7P3EWYRE2lwPuc7xh8czexjJuj9hYpqOC12zJuVGKYiIjeB66HXz41hjbNKNn4o2vPhyREIIIURGk6C0EEIIIUQOoKSwP7lV1KlJj303ODLVdZObf7MjVe3jW7LPNsHGeTttVzs7uRrXh28Esf1C4jTT6TV46TGafr09UYDfYCGdYFwtaHi2+rVPnz707NmT3bt34+vrS926dRk3bhz/+9//8PHxSXTci1bs8/VUHfcvg5ceo/SojSw7eDNVx688fjfNkx+EyaI91zl9x0a1Cx+mIcB9N/HEhSwnKgSu7YTU/K381Ny6dnu/h6QmnhiNMLs23LScoUE89fhK4m1n/sn5NcuFyIFsmbb4UsNGSe670TvxqmyRNq+sfIXeG3pzJ+xOyo0zSdCStK141j9+nO4JrrWX1k7wuNqSain2+e+Nf9l83fpJmkIIIUR2IEFpIYQQQogcQKOkFJZOH1VVOXIjiHqTtyYbxP5y1Wm6zdvHzcfPggB3gjM/ZeLkDWmvnX07KII/Dt3izN0Qvt1yKcX2fRcd4ty9tKfbC4m0vHLi+uMIynyxkcdP05Hr9Xq0Wi0Ap06d4ubNm0RERKDRPLvEVxQFVVVxcHDgnXfe4fDhw0yfPp0PPviAM2fO0KxZM3MwOv5x2dGmM7LaKb3aW6gNniYZ/H6ULrFRpuAymFYfh9yGyOCUj7t1ECYXMq1WHutl6icl8VNtp2T7RLj0r+V90SHwOIn3HmMmTsaIDktdgD6jWXrdRYXAug9f/FiEEAmExoQy9dBUzj62YVYOG4i9dQtDcHCqjonUZ/51bVaiN+pZcvZZsLfNX20IisrYLEyqqqb63y099A8ecO/Tz2ze7x8X/0ixzYidI2x+3uwsJDqENVfWEBErE86EECK7yt53noQQQgghBJCxMaCAJ9HUnfQsGH3ytuXVlI/Dovl53w0OXAuk0bRt5mPTw2jDtNtPolJOk/e864/CaTBlG8P/Oon/t7uZ/u9Fq45777ejqT5XnGh98kGmr9acRVVVdDodfxy6Rb3mbenYsSMNGjSgRo0abNmyhaioZwEz5emLQ1EU3Nzc6Nu3LwMGDKBAgQLcCQxn7NpzXA0I49jNjL2BmNFsFZS+FRjBO0sO88OOK9wPsSLwmMMEhsdw4lYw4RbqhFsl4CLs/S71x9myprUlT+7D32/DhDym4PLqD2CsN8woD1MKw9YJyR+/oGXCxwdSSC1r0MOsyqkbY3AaMkN8U8a6oLqthD+Gy//B/rkwKT/Ma2QK7Ge2sIeJU3fHObHsxY5FCJHItEPTWHJ2Cd3WdsvsoSRyvVt3q9vuur2LWr/VYu5x26UXz+4GbRnE1ENTE2xrtLxRhgYN734ynIt16hJ+MHEd5owSsmrVCzuXSNp7W97j892fM+FACtdtQgghsiwJSgshhBBC5AD6VKbUft53yawA/mHHFe6HJgzODVt+HMNzKcGfzxD+/dZL1JzwX7rGlUzW8VRLS33qV75P28rRa4/C2ZbKNN5h0XqMyTxhY4xpZc7h64EoisKinRfp26MzRy/c4Pfff2fJkiWUL1+efv36sWHDhkTpuC2thK4/dTuL916n2Tc76DTH+jTuWdH6U/cJS2sgNZ53lhxh05kHTN5wnjqTtqS7vzvBkTataZ5R/HjEAO06Go1bRffZW+g2d1faOvp7QBpHkMG/o29Kw8nlzx4f/Tnh/p0Jb6ibRQTCjIqJt2+dYAqkX/4Ptk9JGFQPvArjc6d/zNYIfwinVryYcwF8XwN+7WyqAw7w4JQpsJ/ZgeltKdQJzQ7p4YXIwTKr1nD01ZTLt8TcSHlCkKqqrL6ymve2vAfAnBNzqPxLZc4Hpj0TT05w8N5B9t61fP3YYWUHm5xDVVWir1xBNRo58uAIy7rWJHTtWgBuvtGHJ9u32+Q8WZV
eTf+1bU4QHhvOHxf+4GTASQBWX1nNtpvbEuxffn45jyIfZdYQhRBCWEmC0kIIIYQQ2dzZu6F8u/Vyuvr45t+LPHxieVXo88FngH+O3WHu9uTP+fVm61YVJyU4Mpbjt4LT1Ud8fx1JXdAkWm/gSVTabwS996v1q6XvBkdS4ctNdJ23D8VChfCAfyYScWm/qW1IFB8sO8boPw6iD76PR91uVK5ancaNG7NixQoqVqzIhAkTCE4hreHdLJBW3dYirAxK3wqMYPzas9QY/y9fb7qQYN/VR2E2G8+SfdepP3krX63JWulKLfnHYTSj7H7jtOMAzjn244egNAaXo9JYlzojA/d7vrWu3YUNibf9OxpCLNQ4N8bC2VWmAO32ibB1HNzYa1oh/W1VUNMwUUijTf0xGeHOURiXG/7oY0rP/fD8szTdkYGWj/mzf9r/7W0hJoW/2x+bvJBhCCEsux56PcP6fjTvRx798IPFfVfb+VvVx/OTx/RGPaqqciv0FmuvrqXSL5UYuXtkgjZG1UiXNV1e2iCYUTXSf3P/JPc/iHjAjyd/TPd5Ar79lqv+7TlfrjwTfupDlZMJ3+9vvzvQqn5sMUHw7siRKTfKALb4PWZnqqpSZ2kdxu0fl2D7B9s+IMYQg6qqTNg/gfEHxvPmxjczaZRCCCGsJUFpIYQQQohs7uvNF1JuZIXP/jplcXtSaZG/3nyRqNhnqaYtBa/T42pAuE37O3ozmCM3rE9RvWD3tXSdL6U03PGtebqK+/CNIH4/mDgAZu9XGpfS9c2PV5+4i/5JAPqQh+g88jBs+XEiI01B5qVLl3Lx4kXWPl1FkpQBPx+2enzZRVIvQVVVmbj+HEU+Xcdbvxym4dRt/LT7Go/CYvh+m2lyxanbISzZd52o2ITBxKDwGAAiY1Jfu3fietMKqsV7r6f62IxwOyiCsWvOciswcUrNPEpwgscFlEdw80CidpcfhvHBsmNcfPDE8kmUtH7FtNH7h9FoqrN8aAGsfA9mVIB/v7Du2GXPpXCNCIRjSyy3BVjR59nPu76BRW1hnE/qxxwnKqla9CnUZ7i4CfTpK5Vg9uQBzG8Khhg4uxKWdoU5tU11tMd4JH3crf2mtOj3LX+OpMrDc8n8LiwIvmndavFDC9I+JiFEmh24l/izxFYMYWEEzJhBwMxZ6IMSXuNFHLM+Q8Ly/9Vg/P7x7Ly9k/6b+lN1SVUq/VKJdv+047NdydcSbv1n62yRESW9LgRe4Nezv2Iwmq6Hvj2a8oSv746loZzHcx7PfTbhYMKS1F+LAah6Pdc7/y/dYwn56+9095EWtvg9ZpawmDD+uvhXgjrjd8PucvrRaauOV1WV2cdnJ7m/+q/VeWPDG2y5acpuFDcBRlXVl3bCiBBCZHUSlBZCCCGEyOY0NiooveW85XTT95Kpq7v57APzjTi9MX0pxF+EuLrYyTEaVQb8fIipG9MX7DeqsGS/dTVi4/8bfhOvbrX6dLWlR+3OKDp7wk5vJfLqEQAc8pVC5+1H8J6lrDt1D7T26PV6FEWhUKFC5iB1Us7eS0XQJ5tYesDy7/urNWf5cacphee/Zx8k2r/38iNe+X43X6w6k2hf1XH/MmXjecqO3siuSwGpGo+aTKD1akAYi/ZcY93Je6nqMz36LT7Ewj3X6PLDPusOWNgq0ab3F/yHy+kl9P/BQmr+m/tNqavTwhb384/9ZgqejvWGdR/C8d8g5Fbq+rgVrz7lt1XTMIh0PJH/vkzbauPL/5pWdNvCN6US950aPzRI+FhVU7cK/uYBmFMndb/7mRbSq1uy7kOItl0mBCGEdQZsTmtZh5SpsbEWf743+ktu9OhpdT+Vz0Sw/MJyBm0ZxMH7qatTHGOModIvlYg1xqbcOBv735r/MeXQFKosqUJwVDALTls30WfmkZmExYQRpY9i7vG5nH384rPHRBw5StTZzMtaE2vI2a+N5Ew6OIkx+8bQYkULovSm75St/2pNj3U9uBGa8vekPy/9ybyT85JtczzgOBH6hBMuvz/+PU3/aMrv539P++CFEEJkCAlKCyGEEEJkc9pMvKL7YNkxVh6/A9h+pXRmUFWVTnP28N+51NWDTsoXK0+z/+rjFNslNa9Aibfq1BgbRfDOJYQeWU30fdPqXvdarxHz4ArBu35j2qYL6HQ6Hj58SEREBPnz57fJc8hOnk9jH2swUnXs5hRXKvf8KflVXHO3XwGg94KDDFp6lAE/HyY8mVThlx+GUeTTdQlWXb+x8KB5Ake03kCzb3bw1ZqzDFp6NNVp6mMNxjStyrr4wBSQe75GfLIMCZ/n+MjxTLJbwCj990zecJ778SetLGyd6jGZbRsPFzam/XiAVe+l73iABS3hwVlY+yFEBae/v9Tal/RqoGQdXmTbcaRXbBT8/TZ85QkLWlkfmP7vS9P/I6xY3XRxE/z7ZerGNSk/hMvKKSGyo+Xnlye7X4169nkU/Mcfqe6/y660rcKN88+lf9J1fGoZQkLQBwSg6jO+5vCpgIRZMKYdnmb1sQtOL6Dusro0+aMJc07Modvabkw/PN2qYyNPJ54smCaZvJL9p9M/Zer5M9PqK6sB0+SNtza/xa7bu8z7UpqgsOPWDsbuG5vqc14KumROeT7hwIRUHy+EECJjSVBaCCGEECKbs9VKaYBjN4NYfeIuG07dszroteKwqVazPpsEpVc/TZX9PL3ByKHrQZy4bdu6qN1/3M+2JFahx1Es/Buqz9Wk1dg54tv5C/RB93hydB360Ec4l6yLW5U2hB78m4nvdeHVzt2oX78+DRo0oH379jZ9HtlFh+93ExwRQ6+f9lNy5AaCImy7OmXdyXv8d+4B5b/cxLkkVpv3XpA4yL3zYgDn75tSXl+4nzD1dcfZe5JOhx1PrMHI3iuPKDlyA0U/W4/xRfzNTcgLwG8HbtD2szlU11wCoLX2MMd2rqXvotStKEvWsm6pa2+MF0DYOt5245hbFw5nUqrnHVPgmzKwbeKzbdamH0+v9E4KiLNlHOyeASefBpBuH4SwFCb6nF0FV7bCTStX8YMptfiemakf39lVpjEubg/rh6f+eCFEphh/wML7fLxr1SutWvPk2GGC161LU/9ddlv5maqqDF9h4NPlhgTnD4hMXTaV9Ig4doyLtetwqWEjzleoyI0+fVEzMGNRz/UJV53HBRpTIzz2WVmeRWesm0h1/X/pT7lta7c/GIIxhWxEz9txa0e6z5vU9zK9UU9ARADBmTGRLgXLzi9L8Ph4wHHe2/JsAqFBTX4iyJ+X/kzTeV9b/VqCxw8jbDPZWAghhG1IUFoIIYQQIpvbcNpyzee06DRnLx8sO8bA344y879LVh3z4Omqy9N3bBvMzSgfLDvGmbsJxzp722UqjNnEjosZc9PizcWHMBhVLj98YrGu9fMhaVVVURQN+pCHRFzaT2zgHYwxkdjnKYZX475EXTvCk+PrwajHvdZr+HYbh12uQjyM0TF27Fh++eUXAIzZIKW6rZ28HUKVsf+y53LKK9TTa9zahCs8nkTFMnvb5SRT3gc8MdX9jdEn/ndpNWMn5U
ZvtLgPICrWQI3x/9Fz/rOAd9Vx/xIRk94VUiqr7UcmvdsYS5FP1zHyn9OstE8YHF3uMI65gQPgaDJ1l1MjVwnr2z48B5MLm2o5A+y0ftVWlvfknik4fWUbTCoIR39J+RhDGmpK/9kPlvV8FlRJ7aSApOz6GnZMTrgtJom02QEXTLWq/3gDlnSy/hxnVqZ5eOz91jTG67vg4DwIvJb2voQQL1TFnysyfGe8ySSGhEGt2z16c++jj9Pcv2N0yoHpUnegxmWValdVpi40oDydIGZUX8w115OtWxOlJo84cIAnmzYleUxWrHk9aveoZPenZszXf5qdZP3gUwGnmHtibqrGlpwnmzcT+LMVn8vx2OK1Uf3X6ha3v7X5LZqtaEbD5Q1f+Gr9lEw8MDHZ/Sn9GyuJviGlTdu/2tqkHyGEELYhQWkhhBBCiGwsNCrjapTN2nKJIp+mvNrkSkA4wRExDPn9eIaNxdZuByVc4TBt0wWiYo3M3nYlw87Z/+dDtJi+k85z93IrMGHds7iF0nGroxVFIeLCXu4tep/A/+bxcMWXhOxdjqqPwbl0PdxqvEr46a2EndmGMTocxwLlyNVmMB7N3+Htt98GwGAwoNEkvNwPiYglKjZ96SnFM3uvPGb5oZvEGowER8RQccxmpm1Kuhb5iiOmrAJJTeCIiDFQatQGVFVFVVXO3w81/3sduRFESGTCv/eQyFjKjU76JrQ1CisPqKRJPig3WWdKgeigJA6AF9U8gNWD4fqedI0DAK0Dlx48Yea/F2j+zXaafb2d4IgYwFTrPUGq8A0jIOYJbEl9WsdsY0lHiE5F7fdpJa1PURodBqf/ggvr4Oo2U8ryjLTouRvCl/4zBaNn10pbfyv6pH0sQdcTPj70k2nVvarCS1z3Uwhr3Ay9ybRD0zJ15eGGaxvMP6sG217T/DLdgNaQ9PuoR5jK+CXPzlnkIVS5amr/48kfMRgz9hrLGBnJ7fcGWdx3Z9iHXPnlh0TbQ9au43zZcpwrU5ZzZcpyo09fwnbtsmq1r+Hp77fFMSOv7DdS+IGKxqiiJpGppd1BI/03Gaz6LFp1ZVXCcxkN7LmzhycxpswxYTusX10c+fX3vD6nCT3W9iDGYLpuiDXGsvTcUnqu78nhB4es7ssahuDgVLU/F3gu3eeMNcZyOehZmZrjD48z98RcDj84bN42eu9oLgVd4szjM4zeM5qAiBe3ej8tklspfTX4KttubbPJeWKMMcnuD4gISDFAfuvJLc48slE6eSGEeMnpMnsAQgghhBAi7e4Fp6I2bAaytPo3K7NlynNrbb/w7MZQ21m7ODyqBY52WuDZSum4GtIxj24ScXEvXs3fwrFIVcKObyDy+jGCdvyMd/O38KjdGUNoAGHH1qPoHHCt2BxFo+X8/ScEhcfg5WKPVqtNcP4DVx/T7cf9AFyf7J/xT/glMeKvU4z46xQNS/qk2HbT06wGBb2dk21Xa+IW86rqesVzsfStOoQlU8N64+l7OOi0NC3jm4qRm2hJefVOd912fjK0S77R4hT2WyE6NpbXZ6xktcMoFENTZui7MODnwwxuVoK+i0w3lH94vRptKuQDReZXJxL+EAKvQq7iKbc1xns9LekEHb7PuHEBhD0w/T/0ril99sZPUz4m+BZ4FjQ9J50juPuZtj9M/839BPZ9D15F4Mw/8OQ+DNwLdo62PYcQOcSAzQO4F36Pc4HnWNh6YYJ9gVGBjNs3jlr5atGjTA8Att20TVApSTYOSgO4R0CQm+V9la4nDlx9tsLIO4MVgtwUtt3aRovCLWw6HlVVufXOOygaLVEXkp74BhAzcRbnJs6CutUpNXM2Wg8P7n6ccOV4xIEDRBwwZV0psXULdn5+Sfan1WqJvnoVr98DuR0bSzmthmneuThfAI4X07CpmkK4k+kqVmtQ6bvFdE2xuzxcKJC65zhs+zC23dpGWe+y/PHKHxgepy7jzYz5Bt4ZfIrqv1bn/arv892x71J1fGqoMdZlJwmPDWff3VSUpUjBo6hHlKAEIdEh9N7Q22Kb+Kmr/7n8D3t77MXNPokXNKb03zqN7cMD1jzvL/Z8Qblc5XC1c8XL0QsnnZN536urXrXpeK4GX8VR54ifa8LX+5ora/h89+cArOq4imIexSwe3+5v03Xups6bEvUhhBAideSbvBBCCCFENhOtN3Dh/hNUVaX1zJ2ZPRwA+v98OOVGWcg/x24TFq3nhx1XWHnszgs/f1i0nkpfbTavgr0Z+Gy1ypNj6wnc8C3G6HCcS9VD55YLj7pdcSpWg+hbpwk5YKqv5t3yXTQOLsTcT5hmveq4f4k1JA40xgWkAa4EJJFKV6TZrkuW00bGF2Mw8tnfJ1P8e4kLSINpNfY3my/wzpIjSbZ/99ejvLn4EJeSqEttMKpsPpO+NP//OWR87d2bj8N5V7eGPEowQ3T/4EAMh28EmQPSYHqur/90gL1XAzN8PNnSqkFgSENK9xdVi3JOHesC0gAzK8CGT+HbqjC9bMI+bG39x3BjDwRegStbnm0/twYeyMooIeLcC78HmAI8z2v9Z2v+u/kfEw9MJDDK9B5ty7TJ8cXVJ9b6pDwhLLX6bzZaXOnrG6Ty/hrLE7ne2WDETq8SEmn7SZqP5/1I+M5dhG3fjv7ePesO2neEi7Xr8Ghu8r//y82aW6xFrdebPkdi79+nd7XqzHoUwKmoSBYEBtL75g2K3VLpvtPIopkGGp0yHd/+4LPfWaGHKgUfqrQ7aEx25Xmc7be2m1fFxq0qNoaHJ3OEZcXvmc6VkQFpgKCly7j3xWgMYcmP8ZMdnzBs+zCbnXf91fUATD001epj6i2rx/Qj0y3u+/rQ19T8tSbXQtJfxmLPnT2cDDgJwPLzy3n737etOq7z6s60/qs1tX6rxePIjCu98+qqV2n9V+tE2+P/bl5dmXIgvNf6XlwKsq7ElRBCCMsUNSsWFhFCCCGEEGZPomL5Zd8Nzt0LJTgilt2XUw5+iewr9NBKwk5sQlVV8r/1LA2jPiyQ0P1/En33HG7VO+BavimqPgZFZ2+xn2uT2qE8XREeEhFL5bGbX8j4Rea6OL4t9jrT3OPTd0J4f9kxrj1K+qZpMeUuWx3SXn/Tlh6qnvgqwebH3+tf5Wu95TrHK+2/oIrGlG6/vuM/7IlKRT3inM7RAz66mPxq38ggmFLk2eP2M2Ht0Iwd1zs7YV6j9PXhnh9CX8BEotL+oBrg4kbT4zGWU+4L8bKZe2Iuc47PoZhHMZb5L8PZ7lnmj4o/VzT/PKr2KLqV6ZZgmy21L9aeSQ0nAXCuTNkUWqfeHH8N2yslXMfT8qiRtzZZVxvYoWRJ8k2ciFPFCuiDgoi9eROnypXTtCo1I55ffIqTE+6tWhF19gz5Z84kdONG1JhYNMWLcWDIUJYGB/Gmdy5y63Rci45m4J3bVHZyYoZffnMfXT/T8cckyxOiFjfXsL6W5TVRx3sfR6vRMnrPaP65/Kwe8qk+pzhXsRLEpr6kwusfa4mxe5YRyT1c5advMyatune/fuQZ/kmS+zPi9V8+V3nOPE79ZKlTf
U6Zf95wbQOzjs7iTpjp81RB4WSfk2ke0/3w+7T8s6X5PGl93t1Kd2NAxQE46Zxo8HuDNI8nOQ3zN2R289nm70jNVzRPUI5gV7ddeDp6cu7xObqu7cr3zb6nccHGiZ5TJZ9K/Ob/W4aMUQghcjpZKS2EEEIIkYWFRMTS4fs9TNt0gbUn70lAOgeJqxv8PPeaHXGp2AIMsQTvfnazQ+fqjVuVtujcfQnd/yf60ADQmm5sqhZqGBb9bD1FPl1HkU/XSUA6BymgPKS8kvSKllKjNrD6xF0W7r5G++92JxuQzmriB6QBButW0VRzzGLbuIA0QKXQ7Rk4qmwoKgQm5IEbe5Nu8/x7T0YHpCH9AWl4MQFpMNXajgtICyHMHLQOAFwNuUrvDb3ZeXsnoTGhidqdenQq0TZbWnt1bYo1YNPjvXVGxv2ixyXSdI4uuwxWB6QBoi9d4nqXLhjDw7lUtx7Xu3Wn67gKVF1S1bwqVVVV7n3xBcF//ZUhz8FaamQkIatWEX3pMlf92/Pou+8J+OEH3nvjDV65fo3z0dG4ajTYKwqlHR2Zms+PnWFhTA94FsjLFZr0v0XfLUZ6bLccFN50fZNpDFg4Pg0BaYCP/k7479Rjh/X/bqkVnUI6dVSVxieNfPKnAcVGr9e0BKSfN3zncHNAGky//2mHpqW5v/hB3eB0ZF5ZfmE5Lf9smWEBaYBdd3ax5eazjCjxxw7QcHlDTgacpOvargAM3jrYYpD95KOTFr/LxW27/eQ2RjXjXntCCJGdSU1pIYQQQogsymhUaf/9Lm49Te1cwMuJfB6OvNWwGG8nk8pXZH2qqppn6EfdOk3ktaNoHFyx9ymEU/EauFVpiyEskMirR9G6+eBW2ZRuzs6nIG7VX0HRaNG55zb3p2i0Fs8jcp7dDkPNP7ePHs9pNa72nYoGFXfCGb5sP1GYAgd53R353L8stwIjmLXpNPU0p6mgXMdZiWaHoTKPSbrOYFawyN50k3SxvhW31NwY0eBEwlqOc+1nZcbQsr5FbU3/L90OPAuZAtEuucHJEwxpu9n/0lrkD2+uMwX6NXbg6mv6PUY/ATc/UBTTf0LkcHFBaYCLQRcZtGWQxRWD2hdwXTJm3xj8XPywwXQXi0rfgUUzDTxyB5/EcXerXKhew/zzV78ZGPGmlrcXvsJ3nX9B+8ZH6O/fJ3jFn7jUqUPM7Ts416iOon3x13RGVUUT7z1Moyj09vLmTmwst2JjcXq6T1VVajg782WevIx+cJ98Ojt6eHlR82LyAddO+1SWNUm8fcSuEey+s5s1V9fY7LlUvZpwLJWvZtzkhfA9eyxuj7lxg6izZ/lj8rNg/PLJBq77woHSGv5q8OLXiFX8uSI/tPiBeSfnWdz/y9lfeLvS23g4eKTrPFdCrqTcKJPdenIr2f291veyqp/vjn3H/FPzaVawGUaMROuj0at6Dt03lZ15vezrjKg1It3jFUKInEbSdwshhBBCZFEHrj421wHuXacw4zpWSLD/bnAk9SZvzYyhCRsJPbSK4J2/4FCwAsaIYGIeXsOz4et41O2KPjSA4J2/oA8NwKP2/3AqXiPR8fGD2yn5pktl1p+6x5bzD1NuLLKkaspF/nYYk2I7o6rwAG+0brnJVagsWre8EHaf8DMbcSEyQdtdhgo01J7OoBEL8ZKwdwWfUqagvzEWwgJAHwV5K0CXxZk9OiFs5o8LfzBu/zir2u7stpNGyzMqZPzM75P1aHLInc3cQz4g1zvvgKKYr+8yOn23QVXRxk2UNBpRASeNKWh6MjKSgXdu08zVlXF58yU4btrDh2x6EsrqosVw1qQcZO0+QotRY90164neJ7hQrrz5cWqudwHmtdWwpYppTAtn6HGNsvrQNMk9bBi53n4LRVGIvXuXy82aJ9v+cAmFn1toeOxmmvBw3wtQFCpeM2LQwNnCprFrjKrVvzNbeK/yewysMjDVxx1/eJzeG3pnwIgyztHeR9EpOir9UilDz/Njyx+p61c3Q88hhBDZjQSlhRBCCCGysBO3gtl89j4ftiyN1sJNiSKfrsuEUeUM1yf78zgsmisB4XSdt++Fn79J7gjWfj2M9m9/ytu9u9Nj3m7CTm0hcPMccrV5H9dKLYm+e4GQ/SswPHmEzyufYOedP8V+D3zenDzuSdeSvfk4gkbTttnyqYgM4Oqgo27xXAxpXpL23+0GwI0ITjkOSFe/91UvDhjLkptg6mnP2mKoQoik5CoB70tmE5FzrL6ympG7R2b2MBJYOkWPLodlydX55cMY+gSvHj14PH9+hp0n/grpqQ8fcik6mgC9ns6eHjR0caWIvT3/PnnCR3fv8IFPbgbkymU+NlZViVVVqwLScbp+ZkXCzlgj7UML88aP17gZE0Mhe/tUPy+Aj/prueWrJFnr2tZ0efKQd8yXBC7+mYgDB1J9/IJWGvpvNr2QR76hZeTvBpxjYEZHDfvKJv4de4aphDpj86D1mo5rKOJRxOK+u2F3cdQ5olW0CVZUH31wlD4b+9h0HBnNWedMvwr9+P749xl+rsOvH06QZQIg2hCNvcY+VZMthBAip5CgtBBCCCFENiZB6bSx0ypcmtAOgJO3g+nwveX0e7Z08PPmqICjnRZXBx0b1q/jvffe4+jRo+TKlYv3lx1j7cl7BG1fRNiJTeR/dyEaB2ciLuwlNugOHnW6pHiONYMbULFAymn3Hj6JotaELSm2ExljaIuSzPzvUpL7F71Zk6alfc2PVVXFqMLfR2/TZW2FRO1DVGc8Gg+Cyj3ALR/ERsCjixByG67tMKVtdsuHoWQbSsy5j4oGUPlMt5R3dNnsPcSrKJTxx6h1QLP7m8weTfbjV820gjd/dTi2JLNHk301Hw15KkDB2hAdClGh8PAsxISZ0nu75QOdvSltuleRzB6tEDaz+fpmPtrxUWYPI4Hfpuqxs1yyWFhBVVUG3bnNjdhYunl4cjM2hh3h4VRydGRgLh9KODjwS2AgXwc85Bu//LR0S1j24/nU38nZWknh5+YaIh0tt1eNKp+tMFLlipEP794FYLCPaQwvu/U1FJY20RBjp9D4pJFB60wBbKsC/WnUtGBTmhVqxu47u9l5eyeR+mfZdl4v+zrejt4ceXiEPXcy/ntUdqdRNIlqTHs5eJHfNT/lfcrjbOdMbqfcBEYFEhEbQaQ+kkFVBpHHJU8mjVgIITKO1JQWQgghhBCJKAocGtmCsCg9Tb7enmL7xW/W5ItVp831r7M6X7dnK4kfh8Vk6LlUVeXTtmXwfW71sq+vL/fv3+fOnTv4+PgwpVM5rj0K50RVf8LPbCfq9hmci9fEuXS9eH0ZUZSkV6RYE5AGcLaXrwGZ5fpkf1RVTTYoXS6fe4LHiqKgVaBLjYLU/3MWvXX/MkXf/WlwGUb5l2VAw2LPDrB3Bhcf08+Vupo3awGVuCC0wiR9T7Ybq7DMfoItnlqGeFKoGW6vTIbcpRNs1wBIUDp5TT6HM/9AgRpQcwB4FHj2ugAJSqeGa154/7Ap4Gz3XCYKJ0/T//MmnjAiRE6jVV58veOUyEqb1IlbmxS3QvNI
ZCRXY2KYV6AghZ+uSq75JJTfg4P5PTiID3P70s3Tk+uxMQy9e4cdxUvgo3t2HWltQBqg2UmVog8MjOhn+TpU0SgUuBhD25s3KWhnx/s+ufGMV2M7Lo13atN55wTtDqu0O2xgdzmFBmefveqrXzJypGQaa1SrqulLn6qiqKA+t+ra/Y8tXIz4jxNVNUR6J9z367lf03bOl9TzAWmAoOgggqKDOP3YchmdbmW6SVBaCJEjyd0oIYQQQghhtnJQfVYcvsUHzUvi4+qAj2vKKxM+aV2aJqV9zfXosoN5vaubf65TLFcyLdPn2U0zhatXrxIcHEyFChWwt7enSJEiNG/enDFjxjBv3jx8fX15rVoBjp04CRotWhcvC32l8abTcxx1tulHJDa3VzWK+LjQdtauJNukdCNVl0wqxuHdWzLk99zmx/s/a05ej6TTtSdP4YAxY2tVpscDn3rk6fdP0g0+uQrTiiW9/2VWuAE0GWH6T6RPmfamutBau8weiRCZLkIfkdlDSMReVkmnStw1yNmoKHx1OsKMRmJUlfhXHq3d3LkVE8ufIcEM9lHx1Gp5P5cP7dzcEwSk06LoA+jzn4GfW2hBVWl4RuX9NUaitCo6PUx5HEg5R0dm+D0rWRNqMKBVwEWjNT+HXeFh6FCo6+KSrvFkN/ED0gAj/jQClvPXR+tAayTd6e0fl89H8apdqJW3FmW8y7Dx+kb23t1LaHQohdwLsen6JgKjAtN3kpeEi50LpbxK0b5Ye5ztnImIjeBG6A0i9BE8inyEn4sfLnYuOOmc8HH0SblDIYTIhiQoLYQQQgghANPqzCoFPalS0DNVxw1qWgJI3UqJzHRxfFvs4wVlney1nPmqNY2nbeORjVdNx934izyxgWodR+Hu7k6uXLn4/vvvqV+/Pn379uX7779nwIABTJ8+nQc3b/DkyFo0Ds6JgtK2XBGi00pQ2la+aF+OXrULodUo3AuOolAuZ8C0Ivr59PrHvmhpVZ/J/fvEfx1cn+yf6vFO71qZD/84YX5sxPrXwuVqoyhxdHyqz5lWed5dlXwDl1zwZTD80gGu7XwhY8o2Wqdy9budC8SGZ8xY4ti7QcyT1B/32W1weJouduPnsH+2bcdlyeAj4FMi488jRDbj6+ybciORJRlU1TyBdE1oCJ/eu8figoWwUxQCDQYC9HoK2dujV1V0isL/PD359lEAp6IiaejiipdOR42nAen0rlT2P6Tif0ifoB9HgwIKPDboMT5dzf1XSDDnoqL490kYbloNk/Lmo6KTE4/0epYEBhGtGqnq5IRjKupav0wcbFRO+6tXZuFYrpz5cccSHelYoqP58ee1P+e3c78x+eBk25wwh+lcsjNdSnXBUedIcc/imT0cIYTIdPKpLYQQQgiRjV0Y38ZmfaU3/WF2iElXK+SZICAdx8VBx/w3auDtYm+T86hG07IdVVWZ17Egf/2xjPnz57Ny5UqcnZ0ZNmwYu3btolu3bnzwwQfcu3ePypUr8+2nbxF95yy+//sSnVvqVnAX80ndShFLvweResVzu+Bop8VOqzEHpOOUyuNq/rl9pXx4Wfn6stMm/cfk5Zy+1Zr5PJzSfKxGm/FpW6PLdYW6g+HjS6aavClRFKj1ToaPK1v5/C74VbG+vU9p+PxOhg0HgHrvw+e3YUyI6b8Pz1l3XJ81zwLSAG0mZsz4nudoXSkEIV42tfLWokruKpk9DLPpTaZn9hCyDW28FdKP9QbG581LTWdn6ru40MTFlS/u3ydAr0f3tN21mGiK2NtT0C7xZ7EtJkrqnwtsP/QwXTfXdHLmfHQ09S9f4o/gYBQUvsqbFy+tlq8DHgLgo9PR3t0dA0hA+gWIH5BOSq+yvVj1agqTCbOIsfXG2rQ/N3s3i9s7l+zMqT6nGFNvDOV9yktAWgghnpKV0kIIIYQQ2ZiDznZBorgac6nRt14R88/aZFIOZxWNSyW9wqdqIS+OjGqBoiiJVrimhmo0oDxNL2gIfUjYEy/q1atHly5dANi4cSNNmzZl0qRJeHl50aVLF7p06cK+fftYf+oeS646JOrHGuuHNEzVOFcPrk+bmUmnmBYpm/RaRRqXyp3k/h971zDXZP+oVekk2z0vub/rBiV86Fe/KGXyWb4BlhJ7Xdr/TjWaZL4+1nwLDs1Pc99xHNpNANdUrsQr1iTd581RUpvmv9X4jJ9V1Oq5FfbuftDkM9g+KeljynWEoo0Sb++8AP7qb9PhJeDkDc4ZV9ZBiOxMURSWtFtCxZ8rmretenUVBdwKUP3X6gna/tH+D7qu7ZphY9FpdLQs3JKLuX0wBDzKsPPkJJufhDLs7l28tVpmxkuPPdzXl/fv3ObNWzdp5eZGYTt7fnj8iNIOjhSxt82EzThdP9NR6J6RbntUjApMMz7BL1cZzunO4VLGhdb3cjH5LwfORUXTws0VF40GF42WU1GRPNLrzSu5O3h4cCQykhhVxQ7bZhTKrnoM12J4bmKjYlQpFACBbhDhAKXuQOnbKpf84EyRZ9cLWoPKgd6Hidm9j6Dffwe9gVwD+uNcp47V5y/mWYwlbZfQe0Nvmz2njNCpZCc6lewEkOC9LE19lejE2PpjuRN2hz4b+vAg4gEAv7f/nfK5yqd7rEIIkRNJUFoIIYQQQgCYU+VZa/nbdagdrx5zdkjfndIQ03tDS1VVFI0WQ1gQD5aPQjXq6f7TA1q1amVu4+bmxuLFi3n11VeZNWsWQ4cOpXz58tStW5dLXIerZ1IdkLbXanC0S90EheTqFotnNgxpaLE+tKJAj1qFkj22iI8LOz5pwqOwaIpauZJ9xydNkp3goSgKo19JecVKUuzSkbo9Vk1iXM2/hPpD0x+UHrg39QFpAAdXaDEG/huTvvPnFErGr2i3iSafgmdhWPmu5f3VkripndqguzVKtYGLGwHFtIpbVt4JkayNnTcSHBVMIfdCiVYJ+jr5sqXrlgwfw/pO6wGw98tPpASlLXo+zXZNJ2fe8PLi16AgcxViVVXxs7Pj10KFGffgAQfCI9hLOE1cXRnhmwcwfUewxXV+909M760382mY1ErP1YlX8fH0QRf6kICLATza/Ig13fMx0MmZqk6m7DMxqsqu8DD+CA7mvVw+6BTFHJj+Km9e83PIaXp/pGXJNwkLpk/trOFCAYUFs55tHzRQS4Bn0v82qkbhRp5njwNK++D/Wl/eLNAQbwdvTgScoH7++mgVLYqi4NC0KW5Nm6Z53A5ahzQfm9383v53SnqWBCC/a35+bPUj98LuUc+vnkySEEKIZMg3LSGEEEIIAUBq7+fED0hD+oJdL4q1twfWfdAgVf3GpetWFAVDRAjBe3/HPk8xWr41ijZt2nLu3Dnmz38WsKtQoQLffvstv//+O7/88gtRUVHm44FUBaQBFvStkar2kD0mEWQFZfO5J9r2QbMSnBrT2qrjC+dyoXph70TbV7xbN8n2GSnNadv7bcbB3kLq8OLNTem2bRHEs0t7anHsXVNu87KwNmhbvpMpdXexxhk7nuRU6QGjA00pvV3iZR3wKAjFmlk+JpXvj1bp8D3
4fwNDT4Kdo+37FyKHye+an/I+5ZNMWxtnaqOpGTaGfK75ADA8SUOt+myg1IH9lNy1M03Hxk00VRSFYIOBR3pTnWYvnY53cvlQ3cmZMQ/uE200oigKsaqKk0bDuLx5+bVQIebkL2AOSBvSGZA2qir7w8NNfWkUVFVFNaoErA0gf5H8nNh/gk9++ISC7xXEqYgTAb8F8KhSJQB2hIUx9eEDPrl7lwHeuejh5QVgTjEeJycGAGPsTKvKu36qpcdwLV0/03G4lIYnzgpvDtVyxxsWtNIkG5AG+K7Zd5zqc4r8rvlx0jmxsfNG+pTvQzGPYng6etK4YGN0Gp3NfocOutQFpSv5VDL//Gb5NyngWoAv6nxhk7FYsrXL1gSPu5Xulua+yucqj732WSaBYh7FqJ+/fo58PQohhC1l/TuHQgghhBAiyynyXO1cgCmdK1lombX0qJ38ytY45f1Srieqf/KYR+tnmldHA0TdPkPgptkYQh/i0aAXy77sz59/rqBatWosX76c1atXm4/39/fn66+/5rXXXsPR0RQESevi5YYlk04hnZTskG49K9JpFD5sVRpXh/QlnapZJHGg+kXQpSV47OYHhWpTwNtC4LflV9bVfrZGcunBUzw2m6wOfhGsDUp3WQyDDkAqbyCnWrMUbi7H/du9uRGKNITe/8Cw00lPdEjP6wSg0HMTQlpPAtfcUHMAeFr3GSGEsE79/PUzpN8RNUeYf465ejVDzpGZ8s+cidbDA13u3Nj5+aXqWDVeEPnvkGDeunWL12/eYMjdO/weHISnVstXefNiUFU+uHsHADtFMa2GxhTg9dKa3peNqmquRZ0Wqqryc1Ag/W/f4osqUShx154KcBtaVW6Fj4cPKipOhZ3wbuZN9VLVWVOwAGAKPntotczJX4A+3t7mMWV1DqVKpbsPNe73riiJUnKHOykMe0fHpuopf943KdgEgLWd1rK7++4MX8lsr0ndNWHF3M/SZ39Y40M2dN5A19IZl/Y/t3PC70wjao1IoqVlzQs1B2BAxQE2G5MQQrxsJCgthBBCCCEA0CWx0vn7nlUTbdv6UZNE28r5JV5Rmh7jOlawaX+XJ7TFx9V2N2KibpxA55k3wWz4mAdXiXl0k5gHV9G5eqPVKDg4ODB58mSMRiOLFy9m79695vbvvPMOtWvXNqcdtHYlSuUCHvzY21S78d3GxdM0/oJezjjb59xA3set0n9DcEGfxCvQDdngZmhy0jQZ4enrUqNNIRj40YU0jCie9AQbbZmyurS/7fpKj65L4O3tUKCm9cf4VUtdfeiMXM2j0ZlSsjf62Lr2PiWg71oonsQKabN0jLlCZ3j9L6jSy/TYqwjUfS/t/QkhEsnv9qxWsTaDygm8Xu71DOnX1txfecX0g6Lg897AFNu7NGxI/pkzcG8TLxtLKt+n465LfwkMZPLDh3T39GSmX37y6HRMffiQHWFhFLa3Z3zefJyMjGTiA1MNXI2iPMvYE/e5n87PCEVRaOLqSnt3d1b/e4fYwFgURWFinYm8UusVYmJiCAsLw93e9B3CuZgzfvn8OHvBdD1R38WFd3P5UM3Z2bTCOhWrtr99JRNveacze0z/Ibb5u/m68dfmn3UaXYJVvRkltX/zBqMh5UYZyE6TMAtQy8Itk20/o8kMVnVcxftV38/IYQkhRI4mQWkhhBBCCAHAO42KWdzevpIfC/rUoFbRZys7NS9glW2Hyn4Wg4JplVTQPSl22uSfo0u5xnjW6w5AxOUDALhXfwW3Km1A0RC8ZykOT1MllyhRgq+++orbt28zdepU7ty5k6CvZzf/rBvbpNcq0ap8Xk6OacWnbcuk5mmZaTQKp8e0tmmgPit5u1FxtnzUGHdH6wKdw9uUpnvNgubHTUvnpnnZPInapWVVelJ+7lfLZn1ZK121xC0FjeMH6d3ypr1vSF+tYFuulHZL/O/+Qnx4Hqr3NaWR/uQqlOsAflWh1Xjr+xiwJWMDzanx/hHIU972/fqUTNtxgw5C5wVg72JK1917Jby1zaZDE+JltqTtEloWbsnkhpPN2zIqKJ1d5J82lbLnz1H23Flyf/ABuT/8MMm2Wh8fCs3/Efc2bRLuSCHAuS40lKVBQQm2hRuN7I4IZ6RvHjp7euKl1bI1LIxmrq5UdTKVyqjh7MzIPHn4LTiIHWFhaXuCwMFSydQzVlVCSzkS1C83joUcufHtDV4r+RodynSgbNmybNq0iW3bttGqUCvaFW3H6LqjyZ8/P+7u7uifXl/YxQuUpyYt8u4KGk4XejGfh/lnTKfMubPPNqgqilPqSpL8W8U01pu54Ymzbcbduoh1pWYyk7fTi8kcVMS9CPNazEux3fQm0/FwSJgxK36ZAkVRKOZRDE16rlmFEOIlJ++gQgghhBDZ3H8fNkp3HzWLeNGxav4k9zcvm4eyeZOvG2hrWo1CtUJeNunrqw6pD4zs+6x5gsdqvOCbqhrNKbufHFvPozXfEHbGFNxwrdwa5zL1KRp7kwXzfzQf07BhQ959910aNmxI/vyWf9euDhZq9lrg42pa6eDuaF37pGg0Cr8OSHtgtHjujK1/nFal8rhir9NQPLcrJ8e0ZmKniike816TEkzuXIkPW5bC1UHHSP+y5n1/DaxHn7qF+ahlKWZ2q2KzcTYulZvyNs4wkJK0TSiJSyFp6eujDVeOa9LxerblTc3MutHong9emWVKI+2S69n25H4vA/cmfGyL2t5J6bPG+rZNR5lWIWcEn5KpC9QD9FwBuUs/C9hrNFC8KThnThp9IXKiKr5VmN5kOn6uz9JNv8xB6dLHjyXa5vP2W5Q5d5bCS39LMIGozOlTlNq9y3JHSQRiVVUlxGDgcEQEZR0TTjCMfbqvqpMT28PC6HD9Gi3d3Jiazw93rZZTkZEEGwz4u7nzU4GCNHa1UJ7DSg884cteif+de36oodvndoztCBeuhKJz1xF1I4q/x/wNwPDhw6levTojRoxg/g/z6enRE6eLTixevJj69esnqhudFlO6ZPznef4Z03Fr3RpFUbArYEo77taiBXb58qWqnz8aapjRUcNXPZP+m/mhxQ/pGuuLoKbiurBRgUa8U+kdupXuxrj64zJwVLCm0xrq5a9ncd97lU0ZUz6sbpo0MrDys6wGlXwqManBpAwdmxBCvGzSWYxJCCGEEEJkthK+6Q8WF86VcnDxRSct1mkUtHa2uZnZp16RVB/z/ApiRVGIeXgNjaMLOndfou9fRtXH4lSiNjEPrxJ64C907rk5OvsD7t+pyHdfT+a3334jf/78vPI0fWO/fv3M/amqmmjFR+vyKa/QXNK/Fr7ujql+Pmkxs1sVzt4LZUDDorg72tFi+g5uB0UCcHBkc24FRtJ57t4Uekkd/0r5WHfyXrr6UJ5L79uzdiF61CpI0c/Wp3jsB81LMqhpiQRprqsX9qJ6YdtMkHiet0vGp1KML10rpR0svNd4Fk57f/HVejthIDa1SrVJuY21strql7xJTKoYcsIU+C3ZGi5tSv95ui+F33smvS93Wcv7LGn8SfrHk5wUU3zH8+ocKNUq48
YihEiS1pZZLJ7qULyDzfu0taL//I3G0fK1mqIoOFerRtlzZwnftw9dnjwoutTfnn1kMJBbp+MTX1+cNRpux8QQYjRS3tERBQg2GJjw8AEnIyMZ7ONDby/TJJwbMTGsDg2lrZsb1Zydqeti+g5gSGP9aJ0BzhVSGPCBlp++NaViPl5UQe+gIfp+NFfHXcW1oitOhZ1wq+zGhd0XGDp0KDNnzmT16tX07duXH3/8kSlTpqAoCh999BGDBg3i3Hffp3os8b1e9nV+Pfcrt3NBgcfp6ipZ7m3bmn8u8vsywg8cwL1lSzSurjycMiXF47dVVLjnrRDiqrCvbPK//9TUaP+x5Y8pN8oAqVlBPLv5bABG1RmVaN+QakOYdXSWTcb0avFXk93/buV3ebXEq+RzMU0k6FmmJ/X96lPQraD5PWxh64UUcS9ik/EIIcTLLot92xZCCCGEEJnBmjK5blamQbYVjaLgZK/F1SFt5y2T141p/6vEubHpD1apqoohKox7iz4gZP9fPDm2nvs/DyPm3gV0brlwrdgSOy8/grYtIib4AdXKl2bQoEHkzZuXcePGceTIkUT9WUpBqNNqeDuJNOpx6hRLR+DOgoJezom2/TWwHj+8Xo2OVfPzebuy+Lo54minZfeIZizpX4uNQxvi6+aYtvrEyZj0WkVm96zGor6pqKFrwfvNSyTapigKP71hXTp4Wz+v5EzsVJHqhb2oWcSLvwbWzfDzWarF2D0m8c3AhJ6+QTwfCBy4D5w8E24bcjJtA2s3LW3HxdFo4L396evDLIukv46jS2LiQtxKZP9voFA96PpL+s5TJpla2l5FwDW3aQW0JQ0/Mo3DNQ80yuCANIB3cevaVe0NlXtk7FiEEEmyZYrbSj6V+POVP/mq3lc26zMppwor9B+iZVaH1I8/90cf4ljWukk8LnXr4lAs+eu+2Js3Ez5WVcY+uM97t28TaTTirNEQoNcz7uEDvg54yI2YGDy0Wt7JlYvd4eH08fY2B6RVVWVpcBDHIyPJ9VwgPC4grfHwwG+a9Z/JWqPp/6Euzz471ac/hhwMwbGIIwXeLoBPWx8KvFOAhm82ZOHChcydOxeAH374gf/++49Vq1axdetWPvvsMwCM1nw5ScYnNT8hr0te81gyQpEVfyR4rPPxwcPfH8XeHu83elv8Pa6roTD8TS3/1FV440Mtc9trWVnP+teZky5xWvDd3XfTIH8DAD6q/hHHeh+jrl/GX1Naksc5D00KNkl3PwMqDkj/YAAfJx++rPdlsm0URcHP1S9BTfUiHkUSTKqpmbcmuZ1tV8JHCCFeZrJSWgghhBBCWOXdxsU5eTuEVyr5JdmmVbk8bD77wCbniwsMvlm/CN9tvZzq439/uw6ezrZbhap1dCVPr6k8+P0zUFVytRuCa8UWADj4lca1cmtCDvzFe++9x99//02NGjXo0aMHq1evxs8v4e8suZp4Kd07S8sqluS4OOg4OLI59loNqgp6o0put6TrTMevqVzez53iuV24EhCerjFM7FSRHrUKmn8vTcv4prkvRTHVQbekRbk8rP+gIe2+TSJFZiYo6O3MXwMtpxPMCJZWSu83lkv+oLgbwxotdPoR/nnb9Njbwo10r8JQ5XU4/ms6R5oGvmVNNZmPLH6x59U5QdefTfWTD86HPTMz/pxFGj772bMg9NuQMefptxlC7zyrDd34E4gOgb3fgXsBCL0Ntd+F5qNN+2v0fzE1re0cYcQN2D8HdiSzEq3Dd1mnxrYQIl00iobS3qVfyLnG9dCAorCnvMLgNUa0qYiP2idRosVW7BSFCo6OnIuKYsLDB4zPm4/cOh2t3NxYGxrKvMeP+czXl9c8PDkcEcm8x48JNhjw0Gg5HhXJhehoFhYoSGF7y9fIpfbtRdFoiLl2lUdz5qY4ng01EgdU435dMQExGCONKE+vPbROWj4c8CGLHy5m+PDhVKpUifr165M3b17y5s377HhVtTiJzlru/v5oFA39KvTjbq5xFHyUMfmenComXR5G0WrxeKU9EYcOsfb0CjZX03AlH8TqTM/ret60ZRH4u8PftP27bYJtHg4efN/se4Kig/Bx8klTv7aiKAqTG06mztI66e5rZtOZDN02NF19VPCpgF16ysMIIYSwOVkpLYQQQgiRA6Q3BXDZfCmnAHdztGNJ/9p0rVkwyTYzulVhlH8qUrwmIy529k5jK1fEPVWpgAeXJ7S1WUBaNRpQFAVVNWJ4EmC606aqaBycUfWx5nZOxarjWqklgYGBDBo0CIDXXnuNBQsWkC9fvgQ1qZOTXMAa0loTOHm+bo54Otvj5WKfbED6eXZaDf8Oa5zOczvQs3ahRM977fsN0tTfuvcbJru/nJ87P/dLex3t7C5Nrx+3eGnl46+MTmoF3CszU38OW8lv3Wr4ZKX2RvjHF6FUa/AoAE0+hQYfglvSk3fSNqbnftev/23b/pNSqDZUeC3htlbjYUwIfHgGRgdB23hB4RcZAHbyhKafg0ehpNtIQFqIHCMjUoEnKd57x3uDMrcutu+IEeaf464lX/Pw5BV3Dy5GRzMzIACAzh6eNHZx5WpMNPMDTfmqJ+bLx1veubgZE8PpqCjy6nSsL1qMYg4OGCxclxZdvQpFY/q88Xn//RTHNq67hjs+CqrR1NeyxhrCHWBJc1MfzsWdUTQKEVciAKiRpwb+Vf2pXr064eHhNGzYkHv3EpdsSelaODm6PHnwmzQRgE4lOnHo9app7is5ro2tu/7NN/Yrvn1Vy/mCijkgnR4F3ArgoE18ra7VaDM9IB3n+TI6adW8UPN091E7b20bjEQIIYQtSVBaCCGEECIH+OOd9KVoS0vNZUtcHHQMaJh8GkJrNCjhY74hlZr03VcntmP14AbotLa5zFWNBhSNFlUfQ8y9S7iUbUThT1biVrUtjzd8R9TtM6hGg7m9c6l6dOvWjdWrV7Nu3ToAtFptkum6LcluMZT0Bsk/aF7S4vYK+T24MrEdxXKnXO88vnJ+7im2aVwqN+s/aGhejf9J6xez8iorSNM/V+cFz36OH/TVJrHyRGsH5Tul4UQ2ULn7iz1f+dfAMd5rzs4JWnwJH52DJp/b7jyf3X72c+4ySaf0Tq8y7VPXXpMFbikMSiJte+13X+w4hBAZ6rNan1ncbl+4sE3Ps+e5ur5Bbqn74NT5pj3biyXeb/Q2/2yMt91OUfDT2bE8OIjVISEA9PT0pFRuZ7brI1j0NDD9no8P3+UvwKz8+fkqbz6cNRr0SdSPdixVyvyzoii4NEh+gmCwqykgrWgUjDFGfnENp0enGK7r9Kb+CjliiDIQsj+E6PvRFPc0TTS1s7Pjww8/NE/cTK8DpZ89F7/Jk1CergJ31Dkyu8tvFP3H9hO5/L5OZ9mRdGhRuIX55+Xtl2faOJJiy7T9fcv3tbi9ZeGWADhqLdduB/i01qd0L/OCrwuFEEKkKAt8gxRCCCGEEOlVwtc1zaulqxf2ws5GQVxbaVgy4Uz/dxqnHOj+X/UCNl1FrKoqfeoXI5/hIfcWD+HJ0bVE3zkHgHfLgdjnKUrgv
z8Q+/iW+Rhj1BOGDh3KqlWr8Pd/Vp81NSs+knoKRXI5v5Cawy/Slo8a06t20qsctRqFaf+rbHV/hXMlro+dlHJ+7lyZ2I4L49swqGniGtQ5Vfx62Ts/acrCvglXFv9jqE/tqO+fbSjaCHLFy1bgkgve3AhvbUt+BoX/dFsNOXW0dtB6Yvr68Eg6G0QiXRYlva/+B9b3U6Nf8vvtXaDBMNPPrSdY329qdZqXcX1nFHsXyyvT2yaT1lsI8cK0K9ou3X28U+mdJFN3e/ftk+7+45vVMfHK6FG9rVst7dm9G841bJCxIx5Fq8WpShXAVMLlanQ0DS9fYm9EOIpi2jbn8SN2h4dhr9HQ+5+F6GrlZ1tYGKueBqu1ikLcMzCqKjorr0v9pk1NtC3KDua10bCigcKt3AqKRiHqdhRXxlzh/u/3uTnvNje+uUFJTUmciznj08aHsLNh3Ft6jyebnzBlyhSmTZtGvXr1ePPNN01jMhoTncda772n5ZvXtHQboaXEju241E18rexYtiz5Jk1K8zme59KwIVq3lLNMZZQv6nzBF3W+YFvXbZTLlUIJlkzgqHOkTznb/F1+WP1D/nzlT2Y0mUGjAo1oWbglS9st5ZvG3/Bjyx/Z9L9NFo+rkacGPcv0RKeRyqVCCJHVZK27j0IIIYQQIs0Wv1mTwrmcaVshL3s/bUYRKwN0napmbO07S9qUz8uu4U2T3P/8mD5tUybFursdq9j2eSiKQnPPx5ybP4xq9Rrj1bQ/9nmfBS99u40Ho57A/34k8toxgvf+zoMF73H//n3q1TONNS032SylvHuzfhG2f9KU6oW90/6EMlDfNK60L57bNcWAfVEf61dK/zYg9Sn6HHSZm5bzRYtfo1GjgYr5PRPsP2EszgO8ibJ/+lorlbBuIQCF60L+asmfyNkb6gyyblCDDlnXzlo130r7sW9tTTlA3OgT+OAYfHQh+XZ2TtDWypVU7Wek3KbFGPjsDpRokWLTNHNwTVivOrsY8F/CVfyOHpk3FiFEAlMapX+CiFZJ+rPas3t3iv7zN169eyfZJr0uFrAuiJt7yJAMOX/BH0y1naONRuY8fkQtZ2em5fNjhl9+Zvrlp6SDAzMDArhfvRodqr/Kihkr8SpfntWhIQQbTBl94q63kqrVXHjpb4m26by8KLF1C1ElTNfYyxppeOsDLVuqaljR0PRvEnE5gutfX8e1oivFRhej8JDCxAbHcn3GdcbXHE/uRrlp+25bOtfqzMrfVrJs2TK++eYbXnvtWWkITRqzblzNC488TM9H1SjY5cmTZFvPTh3TdA5LPF591WZ9Waug27MJcy52LnQt3TXLpOu25OOaH7O4zeJ096MoCqW9S9OicAtmN5/N9CbTqZi7IoqiUNevLt6O3qx6dVWCY8bVH8fC1gvTlQZeCCFExpHpQkIIIYQQOUSlAp7s+ORZoHfV4AZU/mpzisclt1I1o3zTtTIuDjqOftGSTWfu89nfp8z7BjUtjq97wlRsiqJQvbAXp8a04tLDMEr4umKv1XDmbigLd1+jRTlfGpS07Y0ZVVX5+eef6dGjB8PHfU2TqVswxkSif3gNjEYc8pchT/eJPPhjNEHbF2KMCmPX9q0J0hCm5SZbPs9nz12jwJWJ7bL8TZUxHcqzeO/1VB3TtHRuq9p5u9iz4t26dPlhX7LtJnSqQAEv61dKv6wc7bR0r1mQsGg9+T2diNabJk60iJ5KXc1ZlhqaU79ELuy77oc7By0Hpa2lWjkpI3eplNukhs7eFORdOyz1xyoasHeGgrXh1gHLbeq9b33Q0z39aUkTcHC1bX+WZPH3G4s88sNbWyAiEB5dAh/LZQGEEDmPoig4li1L7sF+BC1Zkq6+YpK5SzriTS1TFhmS3J9/3g/ovLzSdf7n6fV6dDodGg/TZ46donAtJobazi7m1c41nJ0JNxqZEhTItMBAajx5QoUKFfi4Vy80332Pp9a6yXfO1SxPNrPz86Pyms1U/sWUuUY1qgmmT4afD8ergRd5/mcKCN/+/TZ5cuXh4cOHzBo8i2Nbj5nbBgYGotVq8fDwMNfHTs817sg3nj23Mt5l0txPahRc8BMu9ZKfKJsR/urw1ws/Z3pVz1M9yX2VfCrZ7DzFPItxqNchTj06RVXfqrI6WgghsjhZKS2EEEIIkUN5ONlxfHTLZNu83ahYhgQ8k6txXb2wFy5P60R7u9jTo1YhtnzUmBZlfRnRpgwftky6vq+box3VCnnh7miHo52W6oW9kRKpbAAAx9RJREFUmN2rGp2qFkj3mJ9f1awoCnq9nhs3bhATeBflyHL0W78n+K8vefDHF4Sf3Y7Ow5c83Sfg4z8Mv/5zqVunNgZD0jcsrdG1RkHeqFuYtxsV48DnLbJ8QDrO0rdSt0p5RrcqVretWcSbEr5JB+OuTWpHr9q2rSmZk03uXInve1Yz3ci307JyUH0uqwVYYmiFAS0ftSqNxj0PlH0FtOm4sVdnoO0GnVrV+kDLseBZGMp1hK6/WHfc05vkWMhYAMCrc1K3Cre0PxSoaX37rKDdN+CcC1qNz+yRpJ6zNxSqbfq/ECLL+Pd//6aqfX2/+gkeW3MtpPXwoMypk6k6T3wGBQYPTDqAey2vwtTOiW+j/t5Iw7C3tLg1apTmcydFpzN9Bj969Ajf4cOJVFWK2zsQaTQSbnx2vdn588+o0LQpGzdtont3Uw3d5gMGkN/ODqP5cy1pRVb8kex+jaKhqm9VVNVUPxrg3rJ7RFyNwL2mO+413TFEGrg+5Tq1ctfiwK4DzJo1i+3btzNixAhzP97e3uaAtKIoVv27OtesycGfEl9PLGqhwaA1HT+23liWtluaYl+24FS+fKZcmzvpnF74OW1hepPptCzckrcqPstiM67+OJa0S98Ekuc56hypmbemBKSFECIbkKC0EEIIIUQO5ulsz7VJ7fj97TpsHtaIJf1rJdg/sHHxJI5Mn1pFvdn+cROL9ZHn9U48a754bld+6lOTgU2KJ6h5+6IYDAY0Gg1Go5HHjx8TGxsLQLNmzQgJCaFsmdJUcgljRL/OfLt0Dc4l6xB6eBWqPhadWy7sfYuhsXfEYDCgtXJFSlLstBrGvlqBz9uVJbebgy2e3gtRr7gPVye2Y06vFNI6P+XpnLoa6B+3SryatmJ+D/55r162CdxnVVUKepp/nturGtUK2Will1dheO0n2/SVWhot1B8CQ09C15+h3KuQu6wVBz69ea9Y+Kr8zi6o2iuV49BA8y9Td0xmy10KPrliWhEuhBA2kNclLx2Kd7C6/dwWcxM8tlTaxBLFzg7vPmmrZXtxxZcEuyZ/nsOlNPQYbqpf/ENbDUPf1vJ3fQ13fKwLsKbFmDFj8PPzw7lHd0r/8APlHB35L+wJ+8IjKLxpI2XPnyP3e+9RsFAhevToQYMGDQDQurpSfOMGc7rufJMnUfb8OUqfOE6eUaPM/Xv17o1TxYopj6POGBRFQVVV7v9xn7AzYWjsNTjkccCpsBPVn1SnvEd55syeQ4ECBXB2dsbHx4dp06axfv36BH2l5nelGo30rDuQ
P+snPCa45bPvE51KdsJOa2d1n1ldh+IdmNN8TmYPwyZaFm7J9CbTeb/q+6zrtI6Tb5ykY4mOaCxdZwkhhHgpyPQhIYQQQogcTlEU6hTLBUCpPG70rlMYjQKftSuLo13G1dMt4uPC1Un+/Lz3OrndHCibz50CXk7YabPWTQhVVdFqtVy6dImBAwcSFhaGm5sb7du3Z8iQIbRv355Lly5Rt25dc/vPxjmjyV8ORZfwBlh6A9LZnUaj0K6ijdMVP9WmQj4OjWxBzQn/xduWl6q2CqC+5P77sBEhkbG2r1uuyUJ/Ez2WwbdVrGv7/M3SFl9BvjSmmsyVMZN/MpRM9BBC2NiEBhPI45yH+afmp9hWURS+rPslX+37CiB1Aaw0vn+9Uq4zZ8Mu42bvRovCLXh9/evojfpE7eJW526tYvv3ybgVxHH/B+jYsSMrV66kRYsW7Nmzh0nXr3HR35/vbt8mcvlymjRpwvnz59m7dy+zZs2icePG5r7sixQh95AP0Li44NmxIwAaBwe8X++FZ5f/EXnsWJJpu59XzKsY+Yz5OPXvKWIexpC/b34cC5hKzvQu15snl5+w/fZ2SpUyTSK8du0a7777Ll27dqVChQpp/p04limDndaOJp/M4OK1oewtq2FDTYWdrb+l94betCvWzuq+iq1by1X/9mkeCwBpuNYv6VWSS0GXrG4/ocEEAF4v+zq/nvuVen4vPl24rSmKQiH3F18ySgghRNaTte4ICiGEEEKIDDeuYwW+erVChgak4+tTrwjtKuajqI9LlgtIg+kmyaFDh6hduzYlS5Zk8uTJtGvXjmHDhjFlyhR8fHyoW7cuer2eBw8eMGHCBMLP7cKpSNXMHnqWNTOF1NyL+qYtnXFuNwfejbe6v0+9ImnqRyRWwtfN9gFpAGMK6exL+9v+nEnxLpr8/vw1IF8V08/PBzXSs2rY3Q/6rEn78UIIkUN8UO2DFNuMrTcWgP+V+p95W2pW1ro2aZLqcbm3a4edxo5RdUYxpNoQyucqz5HXj1h9/Oe1P0/1OS2Je54xMTHmbZUrV+bbb7/l8uXL9OzZE42zM/9s20bLVq1YtmwZnTp14sMPP+T9999PEJCO68tn4EC833gj0bk0Dg641KmDYm995pqat2sSsDqAiIsR2OUyTcxUDSqf1PiEV199lbCwMNq0aUPfvn0ZPnw4tWrVMgekU1vepujKf8j1zjvkHjYMAJ27O6P66FhfS4OqKHg6erKm0xoGVra+VIhD8fRNEvPq2QOtm1uqj/u26bdWty3s/qwczYfVP2R289lMbzI91ecUQgghsqqsd1dQCCGEEEKIDBZXP9poNGI0Glm5ciU9evRg7ty5NGnShB07dlCqVCmaNm1qbrdw4UIGDRrETz/9RO8vZ+NUvEaCPtcMbvDCn0dW1bFqfioVSFh319ley/vNSnB9sj9Ny/imue9P25bh3Ng2XJ/sj6uDJH7K8nQp3Ozu9uuLGUdK+v8Lb215trI7fgCk98r0r/guUCvpfeVeTV/fQgiRA3g6eLKo9SI6leyUaJ+bnfWBQJc6tXFvn7rVsPnGjU20TaNoWNLWurq33Up3S9X5krNx40YqV65sLiWjKAr16tVj9uzZ/Pnnn4wfPx6AmTNn8u+//7JmzRqOHDlC//79gYQB6fSIu1aOb8IXE/Bq4oVqUAneE8yntT7ldL/TKIpChQoV+PXXX7G3tyckJITNmzfTPt6/Q2qzCTmWKYPvsKFoXV1MxyvPju9YomPanlQ65R09Ok3HFXArwKk+p9jdfXeKtaFXvbrK/LOd1o5GBRrhYueSpvMKIYQQWZEEpYUQQgghRI6nqqr5Z6PRiEZjugzWaDRoNBrOnTtHyZIlCQ4OpnLlygQEBPDvv/9Sq1Ytrl69SlBQEE2aNKFdu3YcPXqUjm2ao6qmm3Wj/MuyenB9Kj4XhH3ZLXurToLHJ79sxUetStukbyf7LJQSWiSvdDso0tDyvhr9TDWXM1P516DPWij4XNBYG6+ee/GmNjiRmvSu/y22Qf9CCJF91cxbk13dd1Ejb8IJf6Nqj6JZwWYWA9XJsRRkTopDqVJoXCwH/ar4VuGfDv8ke/zs5rPTVR83/jUqgL29PUajkc6dO5u36XQ62rRpw5AhQxg9ejQbNmzAzs4OHx8fqlWrRuHChc392CIgrdfrzdfKd+/e5dGjR+Z9P0z+Ac+KnuS/kx+XC89+b46Ojvj7+/P333/zxx9/UK9ePYuB7bSqlqcaFX0q0qF4B8bVH5fmfgrO+yFNx+Wfkf7Vyh4OHsxvlXT6+sLuhdFmpbInQgghRAaQoLQQQgghhMjxFEUhMjKSnj17EhoaCkD79u2ZP38+qqri7u7O/v37qVy5MuXKlWPTpk0ULFiQ8PBwli5dyurVqylVqhT9+vXD29ubVmVzM6NbVf77sBEDGhajUgHPzH2CWVD8e6L96hdFlwVTt4sXQGsHvS3c0PcpBe2+efHjaT3p2c/9NkGXRVDUQtC87RTwKATtvrbNeTVJrOrvuSLzA/NCCPECbeq8idnNZzOt8TTztuaFmlts261MN2Y1m4W91voU0wAaJyfyT7fuM8a+WLFk95fwKsGajmtoUagFkxpOIq9L3gT76/rVTdXY4jMYDImCyA0aNODrr7/m8OHDvP/+s9IRrq6u5nrN/v7+3Lx5M8FxtghGg2nypk5n+sx67bXX6Ny5M+XKlWPy5MmcOXOGzhU6s++XfXg5e7Fw4UJ2795tPlZVVXQ6HXZ2dqiqag5s24JOo2Op/1JzveW0cm3cmEKLFibYps2VC98RI/Du29fiMRpnZ9zbtk3XeeNUzl05ydXSKU2AEEIIIXICRX1+Sp4QQgghhBA5UFBQEFWrVqVAgQJcu3aNMmXKsGrVKlxdXdmxYwdNmzaldevWrF69Gjs7U5285cuXM3r0aKZNm0aHDh0y+RlkL9F6A6VHbQTgyKgW5HJ1SOEIkWMZYmGcT8Jtn1wBFx/L7TNaTARo7UH7gtO/X9kGhhhw8oZjS6D5l+CS68WOQQghspDAqEBOBpykYf6GNl8hqhoMXKpXH0NISLLtSu7dg87bO1V9nw88z7nH5+hYomOag8FxabZv377N1KlTURSFxo0b07RpU7y8vPjxxx8ZPnw448aNMwenv//+e8LDwylfvnyC1Ni29vDhQ1q3bo2npycTJkzg1KlT/PDDD1SoUIFJkyZRoEABduzYwciRI8mfPz9ffPGFuXZ0dmGMiSF07TocSpbAqWJF07bwcG682Y+okycTtNX55aPk1q02O3ft32oToY9ItP1Un1M2O4cQQgiRVUkRNiGEEEII8VLw8vJizJgx9OvXj3LlyrF582a0Wi0Gg4HGjRszYcIExowZw/Dhw8mTJw9BQUHMmjWL7777TgLSaeCg0zKjW2Vi9EYJSL/snl8l/OqczAtIA9g7Z85546cBL1gzc8YghBBZiLejN00KNsmQvhWtlnwTxnN78PtJtvHs0T3VAWmAMt5lKONdJk3jiisjoygKR48epWXLllSoUAGdTsfixYtp164dn3/+OQM
GDODBgwcMGTKEffv24eLiwt9//83SpUtp3bp1gr5syWAwMHfuXIoUKcI//5hW7u7Zs4czZ84QGxvLxIkTmT59Oo0bN6Z///7MnDmTqKgom47hRdDY2+P5WsK08BoXF/KO/Jzr3bon2F5w7lybnttWq9qFEEKI7EjyhAkhhBBCiBzJUh07nU5Hnz59uHTpEpMmTUJVVbRa08qczz77jClTpnDt2jVWrlzJhQsX2LhxI2+99RaQuOafSFmnqgXoVrNQZg9DZDZFgc/umFJhV+0Nlbpl9oiEEEK8BHR58ia7PzOCg3FB5NOnT3Po0CHeffddduzYwZYtW1iyZAkBAQFMmzaNwMBARo0axaJFiwgKCuL27dusWLHCHJCO31d6PH99q9Vq6dSpE4MHDwagf//+zJ49m3///Zd27dqxYsUKZsyYAcCbb77Jhg0bqFGjRqJ+syuHEiUSPC7ww1wcS5e26Tm+a/Zdom0bO2+06TmEEEKIrErSdwshhBBCiBzl3LlzlC1bFkh6Bcn8+fN59913+eWXX+jVqxd6vd5cPw8gIiICnU6Hvb09RqMRRVFkVYMQQgghRDZzZ/hwQlevsbiv2Lq1OBQv/oJHBGfPnqVChQo4OTkxZcoUcwAYTNeoX3/9Nd999x2tWrUCIDo6GkVRzNeltlod/fz17/PbV69ezbhx45gzZw41a9Zkz549dOjQAXd3dyZMmEDPnj1tMo6sxvDkCRFHjhBz5Sre/d7MkO8AeqOeIduGsPP2TkBSdwshhHh5yEppIYQQQgiRY1y5coXatWszYsQIIOkVJG+99RYff/wx7777Lnv37kWn03Hv3j3Gjh0LgLOzM/b29qiqak6xKIQQQgghshfvXr0sbvf9dESmBKQBypUrx+zZs4mMjCQoKAhVVc0Zft566y1UVWXjxmcrZx0cHBJcl9qCqqrmgPQHH3zAO++8w5AhQ7h165a5zblz53jy5Amln64Uvnr1Kv7+/owcOZIePXrYZBxZkdbNDbcmTcjVv1+GfQfQaXTMbDKTL+p8wbpO6zLkHEIIIURWJEFpIYQQQgiRY/j4+DBq1Ch++uknfv31V8ByGm+AKVOm4O/vT6tWrRg2bBilS5fmwYMHCdpIMFoIIYQQIvuyL17C4nbPzp1f8EgSGjhwIAMHDmT69OkcPHjQHGxWVZXChQvj6+ub6BhbXZfGZQGKiIigcuXK7Ny5k8jISFatWsUrr7zC8uXLAfD19cXd3Z2JEyfy66+/8sknn9CyZUsGDBiAoihJXmML69hp7ehauiuF3KXUjRBCiJeHpO8WQgghhBA5yoMHD5g6dSoLFy5kw4YN1KlTB4PBYK4d/byhQ4fy6NEj6tevz8CBA1/waIUQQgghREYyhIai2Nlxb/SXxNy8QZElS1Ds7TP0nKqqphhEjo2NpV27dly5coUvvviC8uXLc/DgQT766CNWrVpFmzZtMmx8Z8+eJSQkhIULFzJnzhzs7OwwGo106tSJJ0+eMHXqVMqWLcvw4cPZu3cvISEhDBo0iI8++sjq5yeEEEII8TwJSgshhBBCiGzv+Rtjly9fZtSoUezdu5eDBw+SN2/eRHXz4geqo6KicHR0BJKuQy2EEEIIIURK4l9jBgYG4u3tnWTbgIAAWrduzfHjx+nevTu3b99mxIgR+Pv723RM8a+VIyMjadiwIUePHsXf35+///4brVaLRqPh5s2btGzZknbt2jFjxgzCwsJQFIWgoCAKFCgAyLWyEEIIIdJOriCEEEIIIUS2ptfrE63UKFGiBJ999hkFCxakQ4cOAOh0OgwGg7lN/JXTcQFpSLoOtRBCCCGEEMmJH5AeP348/fr148KFCyS1Jih37tz8+uuv+Pr6kitXLtatW4e/v3+COtPpFf9aOTw8HCcnJ2bPnk2VKlV49OgRiqKgKAoGg4FChQrRt29f/vrrLyIjI3F2dsbFxcUckLZlXWshhBBCvHzkKiILiI6OZsyYMURHR2f2UIQQIkuQ90UhhLUMBoN59fPo0aP5+OOPGT58OOfPn6dy5cpMmTKF4OBgunTpApgC0dmx/p28LwohRGLy3iiEyGriAtL/+9//WLBgAe3bt8fBwSHZVNflypVjwYIFzJ07l/nz5wOm+tFpCf4+/74Y/1p56NChzJs3j/v371O7dm3Gjx/PiRMnGDt2LIqimMceGRlJuXLlcHBwSDQGSdkthMhu5HpRiKxF0ndnAaGhoXh4eBASEoK7u3tmD0cIITKdvC8KIVIjLCyMxo0bYzQaadOmDbt37yYyMpKPP/6Y7t2788cff/DBBx/Qr18/Jk6cmNnDTRN5XxRCiMTkvVEIkRXNmzePb7/9ln///Rc/Pz/z9vjlYizVZJ49ezbvv/8+K1asoHPnzmk6d9z7YnBwMB4eHuZtTZo0QVVVxo0bR+3atcmdOzexsbHMnz+fwYMH8+mnn1K1alWcnZ3p06cP7777LuPHj0/jb0AIIbIOuV4UImvRpdxECCGEEEKIrCdubuW4cePIkycP69evB2DMmDF888035pt+7dq1IyAggPfff59ChQrx7rvvZtqYhRAZI366VCGEECIznT9/nuLFi+Pn58eOHTvYv38/S5cupWDBgnTp0oU+ffpYXHE8aNAg7t+/T6lSpdJ87k8++QR4tqI5KiqKvn37UqBAAVavXm1uF1cX+r333uPmzZtMnjyZ8uXL07x5c4YNG8bIkSPTPAYhhBBCiKRIUFoIIYQQQmRLcTfbbty4wauvvgpA//79Wbt2LcuWLaN9+/ZER0ej0+no2bMn9+7do0SJEpk5ZCFEBlBVFa1Wy/3799m2bRvFihWjfPnyuLq6ZvbQhBBC5GBJTYiqVasWs2bNolGjRjx48IDq1avTuXNn9u3bx9KlS3n11Vfx8PCwGJgeN25cmsdz6tQprl+/nmCb0WgkPDycV155BYDNmzdz+vRpVq1aRZUqVRg3bhxfffUVAQEBrFmzhk8++YT8+fMn+/yEEEIIIdJK0ndnAZJCQgghEpL3RSGEtfR6PS1btsTf3599+/Zx9uxZli9fTqVKlYiIiGDx4sWUKlWKFi1aoNfr0el0FtMlZnXyvihEYvH/lnfv3k27du0oVKgQFy9eZNCgQbzzzjuUKVMmk0cpMpK8NwohMkv8gO3y5cu5d+8e7u7u+Pv7kydPHv7880+OHTtGhw4d8PPzo2DBgixevJiFCxeyefNmc0YfW3j06BE+Pj7As/fF//77j+bNmxMWFkbr1q3x8PDg4cOHuLm54eXlhZeXF4cOHaJdu3ZMnjyZGzdu0K1bN1RVZd++fWmqZy2EEFmRXC8KkbXIFUYW4ODgwJdffomDg0NmD0UIIbIEeV8UQlhDVVV0Oh1vvPEGw4cP59atW5w7d45KlSoBcOXKFZYsWcL9+/cB0OlMSYKyW0Aa5H1RCDBNQokv7m/50qVL/Pzzz4wZM4ZDhw4xa9Ys9u/fz/Tp081//yJnkvdGIURmiQtId+/enQ8//JClS5cyYcIEqlevzpEjR/jf//7HhAkTqF27NgULFuTmzZt8//33VKxYEX
t7e5uN4/z58/j5+bFmzRrA9L7Yp08fWrZsya+//oqrqytz5syhZs2a1KhRgylTpvDjjz+yYMECChYsaK47XbhwYX788UeuXLlC+/btbTY+IYTIbHK9KETWIiulhRBCCCFElhZX8y7O86kEHzx4wPDhw1mzZg3r1q1Do9EQHBxMv379aNu2LT/99FNmDFsIYUOXLl3i/Pnz5vSjcRYsWMDy5cvR6/UsWbLEnHL0u+++47fffqNJkyZ89dVXchNKCCGEzcRl6vj++++ZMWMG27dvJ1euXERFRdGjRw+uXbvGpk2bKFq0KAcPHuSff/5hxYoVVKlShT///NPm4+nZsyc7d+5kw4YNVKxYkcjISEaPHs2cOXPYunUrtWvXJjo6OsFn4aFDh+jRowejR4/mjTfeMG/fvHkzQUFBdOvWzebjFEIIIYSQldJCCCGEECJL02g03Lp1i08++QQgUW27PHny8OWXX9KqVSs6depE9+7dGTZsGP379zcHpI1G4wsftxDCdn777Te2b98OmIIBcfLkycPt27c5e/YsYWFh5u3vv/8+zZo1Y/fu3Xz33XcverhCCCFyoLjPn5CQEADOnDlD5cqVKViwIA4ODnh7e7Nhwwb0er25NrSXlxeBgYEMGzbMHJA2GAw2GU/c9e1vv/1GqVKl6NWrF3fv3sXJyYnPP/+cdu3a0bFjRx4+fGgOSP/1119MmTKFFi1a0KNHjwQBaYBWrVpJQFoIIYQQGUaC0kIIIYQQIktTVZX//vuPlStXcv36dYttihUrxu+//86mTZvYsGEDK1euZOzYsYDpxp/UxRMie8ufPz958uQBEgal27dvz8iRI/H09OSHH37g0aNH5n0jR46kePHi/PLLLxw7duyFj1kIIUTOEPe5oygKq1atonLlygQHB+Pg4MDly5cB06TJ6OhoNBoNI0aM4MCBAwQEBFCyZElmzJjBoEGDgMQZf9IqfiahM2fO8Omnn3L69Gk+//xzwsLC8PLy4ptvvqFQoUL4+/ubn8PNmzfZtm0bixYtMgfOZfKmEEIIIV4UuTsnhBBCCCGylOdvjCmKQrVq1QgKCuLq1asW28Q9rly5MmXKlKFUqVKA6SaiLW78CSEyV1BQEC1btuTo0aN8/PHHjBo1ikWLFgHQq1cvc+rShQsXEhsbC4CLiwtfffUVM2fOpGrVqpk5fCGEENmYoigAnDt3jsWLFzNs2DDc3Nxo1aoVRqORqVOnAphXI4eEhODp6UmuXLkAcHZ2Bmx7XRoXkJ42bRoNGzZk/fr1lCtXjiVLljBq1ChUVaVQoUL8+OOPPHz4kO7duwMwbNgwli1bxmuvvQYkLpMjhBBCCJGRpKa0EEIIIYTIFnr27EloaChr167N7KEIIV6QuLqdAOvXr6dTp0507NiRa9eucf36dRo0aMDff/8NwDvvvMPp06d566236Nu3byaOWgghRE4zf/58li1bhpOTE8uWLcPd3Z2goCDGjx/Ptm3b6NOnD6+//jrBwcG8/vrrVKhQgR9++AGNRmP+HLO106dP06JFC2bNmkW3bt0ICgrin3/+4a233mL69OkMGTIEMNWJbtOmDbNnz2bgwIFAws9XIYQQQogXRabCCSGEEEKILMff359hw4axc+dO87bmzZsTGBjInTt3MnFkIqeRlJVZz549e3jw4AHwbHXanTt3+PTTT5k2bRrLly9n+/bt/Pnnn+zYsYMPPvgAgClTppAvXz5mzJjBtm3bMm38Qgghcp7bt29z/fp1bt26hbu7O2CqFz148GDat2/P559/TvXq1WncuDE+Pj7Mnz8frVaboYHfu3fvoqoqTZo0MY+nX79+jBw5khEjRrBhwwYAWrRowbZt28wBaUAC0kIIIYTIFBKUFkIIIYQQWc6rr77K2bNn6d27N2PHjuXBgwd0796d8+fPs2PHDiBhXVkh0iIuZWVQUBCXLl0iMjIys4f00gsPD6dXr14cOHAgwfZ79+5x48YNWrRoAZhSoTZq1Ii5c+cye/Zs9uzZg6enJ+PGjaN9+/Y0btw4M4YvhBAim1NV1eI15meffUavXr0ICAjg66+/Nm8vWrQoY8eO5dSpUyxevJiff/6ZNWvWAKb60RmpVKlShIWFceTIkQTn69KlCwBvvvkmhw4dQqPRmD8X9Xp9ho5JCCGEECI5uswegBBCCCGEEM97++23ad++Pbt27eLTTz9l27ZtvPrqq7zxxhusWLGCjh07muvzCZFacSkrNRoNGzduZPDgwURFRVGtWjXeeOMN/ve//2X2EF9aLi4unDhxAg8PD6KiorC3t0ej0VCiRAnc3NzYv38/5cqVM7dv1KgRpUqV4sqVK9SvX5+yZcsyYcKETHwGQgghsqv49ZXPnDnD1atXyZcvH4X+z95dh0WVvg0c/w4NIqiIid0u9tqda2Cs7trd3a7+1NW1u1bX7u5YW7Bb18LGThBFQhpm5v1jXkaQhhkG9P5cl5cz5zznnHuY4czh3M9zP7lzkyVLFvr164eHhwf79+8nf/782nmZlUol+fPnJ3/+/Np9KZVKnc0fHZv06dPz66+/smDBAvLmzav9fjQxMaFJkyZ4enri6ekZZRsTE7kVLIQQQgjDkZHSQgghhBAiVcqRIwdt2rThyJEjdOjQgSlTprB69Wru3r2rHdEqo6VFYkWeQ/HJkydMmjSJAQMGMG/ePIyNjZk7dy779u3TthUpz8bGBl9fXwoXLhxlNFqNGjU4cOAAly9f1i6ztLQEkE4qQgghki0iIf33339Tp04dxo0bR7t27Rg2bBgvX74kR44c9O/fn2zZsrFs2TLt91HEdpHpOyENYGdnR9u2bTE2NqZXr15cvXqV+/fvM3fuXMzMzHBxcaFJkyZ6j0MIIYQQIqEkKS2EEEIIIVJcYsoZFitWjJ49e3Lt2jVGjhzJ58+f2bRpEyDz4YnEi/jMrF+/nnHjxlGmTBmGDBlCmzZtmDx5MgUKFGDOnDlcv34dhUIhc06nkMjnBLVaja2tLV26dGHSpEkcOnSIDBky0LNnT/z8/Pjzzz85cuQIDx48YMqUKQQGBuLo6GjA6IUQQnwv5s6dy5w5c1i2bBmurq6MGzeOEydOMGTIEAICAihTpgx9+vTBxMSE0aNH8+7dO71cj8Z3rRzRcc7JyYkhQ4ZgZ2dHjRo1aNKkCZcvX2bu3LmYm5vrPC4hhBBCiORQqKX7vxBCCCGESEGRyxk6OztToEAB7O3tSZ8+fZSyiZFFLP/y5QsTJkzg8+fPrFy5ElNT0xjbCxGXgIAA+vbty/Hjx6lRowa7d+/Wrjt9+jTz5s1DrVazfPlycuXKFWV0tdC9yOeEnTt3Ym5ujpOTE8bGxvTs2ZP9+/dz6tQpSpYsyaFDh9i2bRvbtm3D0dGR4OBg9u7dK0lpIYQQyRZxfVCnTh26devGvXv3aNSoEbly5cLPz49KlSqxevVqAFauXMn79+/566+/dB7Ht9+L6dOnx8HBgRIlSkRp9+31ye3btwkNDaVChQrR9iOEEEIIkRpIUloIIYQQQqSYiJtn7u7u1K5dm5CQEIKCg
mjdujWjR48mZ86csd5Ai9h2wYIFbN68mRs3bhjgFYi0Rq1Wo1aro3VeePbsGRMnTuTChQusWLGCX375Rbtuz549zJ49m6xZs7Jv3z65oatHkW+ot2nThqtXrzJu3DgaNWqEg4MDoaGhNGzYEA8PD06dOkW2bNkAuHv3LgqFgkKFCslIMCGEEEkSU6czNzc3MmbMyLNnz2jdujVdunThr7/+ok+fPmzfvp3hw4czefLkePeTXO/fv6dWrVooFApCQkLw9PRk2rRpdOrUicyZM0dpG1OnTklICyGEECI1kmElQgghhBAixSgUCj59+sTSpUupUaMG58+fZ/Dgwbi6uvLHH38QHByMsbFxjCWTI272XblyhfTp0xMUFCRz/oo4RdwkNjIy4sGDB+zatQsXFxc+ffpEgQIFGDp0KI6Ojvz999/cvXtXu12rVq3o1KkTvXv3lhu6enD+/HntY4VCQXh4OG3atOHFixecOnWKnj174uDgAICZmRk7d+4kJCSEnj174uvrC0CJEiVwdHSUhLQQQohEi7jOVCgUeHl54e3trV1XuHBh7O3t2b59Ow0aNGDs2LEYGxuTN29e7O3t2bFjB48ePdLuI7kJaZVKxfv377XP1Wo1vr6+tGnThkqVKnH9+nWuX7/OtGnTmDJlClu3biUkJCTKPgw1p7UQQgghRGJJUloIIYQQQqSYCxcu0KVLFy5fvkz//v1xcHBg7NixtGnThqdPnzJ27FhAc3MtpsT0vXv3sLW15fDhw1haWkpJZRGniM/HypUrqVq1KhMnTqRbt25UrFiR27dv8/PPP9O3b18CAwOZMWMGnp6e2m0HDBiAk5OToUL/bm3fvp0OHTrg7++vXebt7c3z58+ZOHEi+fPn5+nTp5w+fZrp06ezf/9+MmfOzL59+zhy5AgjRoyQzihCCCGSJSKJu2rVKipWrEilSpVo1qwZhw8f1rZxc3Pjw4cPWFpaAvD69WsGDRrEqVOnKFq0qHYfSb0WVavVBAcHU69ePaZNm4a7u7t2f0FBQXz8+JE2bdpgY2ODvb09w4YNo3fv3kybNg0PDw/tPoQQQggh0hJJSgshhBBCCL359maZu7s7Hz584NatW2TKlEm7vGvXrjg5OXH+/HnmzZsHxDzqw9HRkZUrV5IuXTr9Bi6+G87OzowbN461a9dy7tw5zpw5Q6FChfj111958uQJTk5OtGvXjjdv3jBixAjCw8OBpN9kFnFr27Ytjx49wtrami9fvgDg5+eHWq3GxcWFMWPGMHz4cIYNG8bevXtp06YNhw4domTJkhw8eJAWLVrIeyOEECJJInd4PHDgAOPHj2fgwIH8+eef+Pj4MG3aNJYtWwZAgwYNePHiBU5OTlSvXp1z587RqVMncubMqZNksEKhwMLCgoYNG3LkyBF27NiBn58fAF++fOHly5ekT58egKCgIABmzpyJpaUl27dv1+5DCCGEECItkaS0EEIIIYTQC6VSGe1m2e+//87QoUOxs7Nj0qRJBAcHA2BpaUnv3r2pUqUKCxcu5MKFC4YIWXwHlEpllOdPnjyhQIECNG7cGDs7OwoUKMCxY8dIly4dw4cPB9B+9rJlyyblLlOAlZUV+/bto3Tp0rx//54CBQrQunVrHj16xKFDh2jatCnr16/H2dmZMmXK8Pz5cwCaNGkio9eFEEIkWUSHxwkTJuDq6srYsWMZOnQoHTt2ZO/evRQvXpydO3fi6upKp06dGDBgANbW1vz000/cu3cPe3t7VCqVTpLBKpUKtVrNH3/8QatWrVixYgX79+8nODiYQoUK0bBhQ0aOHElQUBCWlpaEh4fj5+dHhgwZsLOzS/bxhRBCCCEMQZLSQgghhBBC51QqFcbGxnz+/JkRI0bwxx9/MGnSJEJDQ+nYsSO9evXi/v37zJ49W7tN1qxZ6dWrF3PmzKFatWoGjF6kVWq1GmNjYz5+/EiVKlV49uwZwcHBvHr1CnNzcxQKhbYjxNy5c7lx4wYPHjwAYMqUKcyZMweFQiHlMPXg259p5cqVCQsLo1evXiiVSv744w+2b9/OvXv36N27N6VLlyYwMJCAgADt/NJCCCFEUkT+DvL19WXlypVMnDhRW7FDrVaTOXNmhg0bxsuXL7l69Sp2dnb069eP7du3s3z5ckxMTAgPD4+xkk9SKRQKQkJC+P3331Gr1axcuZIzZ84AMGjQINRqNe3atSMsLIzg4GBu3LiBp6enfC8KIYQQIs2SpLQQQgghhNA5IyMjbt++TbFixXjw4AGhoaGsW7eOBg0a8PjxY/r370/lypU5cuQIa9as0W7n6OhI27ZtAWKcU1qIuCgUCry9vRk7diw5cuQgQ4YMVK9enYwZMzJp0iQALCwsAAgICMDKygp7e3sAzMzMAHQ2Akp8FblqQsTvdbZs2Th8+DAXLlxg8ODBKJVKbG1tAXj48CGHDh2icuXKODo60rJlS4PFLoQQIu2K+M5RKBR4eHjw+vVrbG1tOXPmDDlz5uT69evaktkAP/30EwUKFMDZ2RmImsxWqVSYmJgkO6aIfRoZGXHr1i2yZMnCypUryZIlC3fv3mXSpEncvn2bunXrMn78eFxdXXFwcKB27do0b96cIUOG0LBhw2THIYQQQghhCJKUFkIIIYQQOqVWqwkMDGTcuHG0adOGo0ePsnDhQipUqMCrV69QKpXa0snFixdnwYIFHDt2LNp+dDkSRfwYLl++zMCBA3n//j0zZ87Ezs6OokWL0qxZM44cOcLixYsBCAwM5MyZM+TIkQNjY+MoN53lc6d7xsbGPH36lN69e9OvXz+uXbtGaGgoJUqUYOPGjaxYsYK///4bAG9vb5YuXcrEiRPp2rUr27ZtM3D0Qggh0iKVSqX9Tt+yZQuNGjXiwIEDeHl5UbRoUVatWsWRI0dYtGgR/v7+AAQHBxMYGEjp0qWBqHM2J+f64MOHD+zcuTPKPgMDAxk+fDi//fYba9as4ezZs1y5coVXr14xffp0Xr58SfPmzblz5w7Tp0+nf//+HD9+nNGjR2tfnxBCCCFEWpP8Ln5CCCGEEEJEolAoCAoKwt3dnaVLlxISEkLdunXx9fXl8OHDFC9eHH9/f3LlykXv3r2xtLSkVKlShg5bfAcibuiGhYVRsGBBANKnT0/fvn0BGDt2LIsWLcLKygpvb29cXFzIlCmTIUP+bqnVau2N9/v371O/fn3KlCnDx48fadiwIXPmzKFNmzY0b96cWbNm8ccff5AvXz5atGjBkCFD6N27NyVKlDDwqxBCCJGWhIaGaiufRCSRp0+fzsyZM5k5cyZOTk7a+ZgbNmzIvHnzGDp0KJcuXaJKlSrcvHmTN2/e0L59e53FpFQqGTx4MD///HOU5Wq1Gj8/P+13nVKppFixYqxbt44mTZrg4ODA4MGDyZs3Lz169IiyXeTXJ4QQQgiRlsgVjBBCCCGESBalUhltWbp06QgICODQoUOUL18eGxsbTp06RfHixXn79i1Tp07l/fv3VKpUiSVLlpA9e3YZ8SESJabP3fDhw+natSv+/v5MmTJFuzxv3rxMnjyZ
y5cvM2bMGAYPHsyTJ08oUqRIjPsRyRO5XDfAp0+f6Ny5M4cPH+batWu0adOGefPmcejQIcLDwxkxYgR9+vShZcuW3L59m4IFC6a6hPS3nxOZd1wIIVKXTZs20bx5c8LCwrTL7t+/z5YtW9i5cyf9+/cnW7ZshIWF8ejRI4KCghg8eDCjRo3i+PHjfPr0iRo1auDm5kbevHl1dl1qbGzMsmXLGDVqFADPnz8HwNzcnJCQEN6+fQtoRj6rVCp++eUX6tSpw/bt21m3bp12FHcEhUIh04wIIYQQIs2SkdJCCCGEECLJlEolxsbGBAcH8+zZMxwcHLC1tcXc3JyGDRsyfPhwmjVrxq5du7Tb3Lx5ExcXF5o2bUqOHDlQKBSo1WoZ8SESLOJzFxAQwMqVKzEzMyNPnjw4OTkxePBg3r17x5EjRyhcuDBt2rQBNDeFHR0dcXR0jLYfoTtqtVr7M50/fz7e3t68fPmSChUqaNssW7aMpk2bsnDhQuzs7Khfvz5LlizB2NiYPHnyGCr0WKlUKu1rOnr0KI0aNZKEgBBCpDJly5bF1tYWU1NT7TK1Wo1arSYsLIyHDx+ydetWXFxcePv2LQUKFODMmTPMmjWLx48f4+LiwpEjR7C0tNT59YGtrS0qlYoBAwbg7u7OjBkzKFasGMOHD6d37940btyYOnXqABAeHk6mTJnInz8/pqamWFtb6ywOIYQQQghDkzt/QgghhBAiyYyNjbl37x5FixalVatWlCpViuPHjwPQtm1bKlWqhKenJw8ePMDV1ZVVq1bRoUMHOnbsSNWqVbX7kQSPSIyIz12RIkXYvXs3R44coV27dgwaNAiAESNGkDt3blasWMH58+eBmMtcSkJat1QqlfZ3uWXLlsyaNYtz586xZcsWTp48yefPn7VtN2/eDGgS1xcvXgRg0aJFZMyYMeUDj0PkOUmbN2/O5MmTcXd3N3BUQgghIpw8eZLAwEB++uknmjVrxsOHD1m5ciWgSQbnzJmT8ePHU6FCBd6+fUuLFi1YunQp9+7dY8OGDQCsX78eU1NTunfvzqdPn3RyfRCREAfN9YaRkRElS5bk3bt3rF27Fi8vL9q3b0/nzp1p0qQJu3fv5urVq2zevJmnT5+ybds2xo8fn+w4hBBCCCFSE0lKCyGEEEKIRIkoZ6hWqwkODmbcuHG0adOGjRs3UrZsWfr378+ePXuoXLkyo0aNwsLCgjJlytChQwdmz57N6tWrGTp0qHYfQsTF398fX19f7XO1Wo2Xlxd9+vTht99+4+LFixw+fJgyZcpw8uRJPn36RKFChRgwYADp0qVj/PjxuLm5SceHFGBkZIS3tzdr167F3t4eV1dXzp49y5QpU3j58iWrV68mNDQU0CQKVq1axfXr1/n48aOBI4/q/fv32sdGRkb4+fmxYcMGLC0t2bdvH9mzZzdgdEIIISIsW7aM+vXrc+rUKe00CydPnqRv377s3r2bXLlysWLFCqZNm8bJkydZsGABo0ePpnLlyhQoUID8+fMDkCFDBnbt2sXp06dZunRpsuOK6KSlUCg4efIkc+fOBaBfv360atWKkydPsmrVKkxMTFi6dCk9evRg8ODBtGnThpEjRzJs2DBy5cql3ZcQQgghxPdCoZY7gUIIIYQQIoEilzN8+/YtNjY2TJo0iZEjR2oTNc2aNcPd3Z25c+dSs2ZNAC5fvkzGjBmxsbEhR44c2mS0JApFbNRqNT4+Pjg4OLBjxw6cnJy0616/fk2rVq1wdnbGysqKJk2a8Pr1a7Zv306ZMmW0n9ONGzdy+PBhFixYQI4cOQz4an4MAQEB9O/fnxMnTlCxYkX279+vXTdgwABu3LhBr1696NGjh3b5+/fvU817o1Kp6NOnDxkzZmTWrFmoVCq+fPlC27Ztefr0KVWqVGHjxo1S9l0IIVKR33//nevXr7N9+3YqVqyIQqFg6NChrFu3Dmdn5yjTR4SEhODv78+gQYN4+PAhR48eJVu2bNr19+/f56efftJZbFOmTGHx4sW0atWK/v37U6JECQAGDhzItWvX6N27Nz179gTg4cOHhIaGkilTJnLlyoVarZbrZCGEEEJ8dyQpLYQQQgghEiTyzbGOHTty48YNPD09yZMnD8ePH8fe3h4APz8/6tatS9asWfnzzz+pWLFilP1ELocrRHwuXryoLfUe8dl58OABv/zyC9u3b2fgwIHY2dmxZcsWsmbNyrNnzzhw4ABDhw7FyMiIsLAwTE1N5eauHkS8H5F/tgcOHGDWrFl4enpy9epV7OzsAPjy5Qvdu3fH19eXnj170rp1a4BU974cOnSIJk2aRIlpxYoVTJ8+naxZs+Li4oKNjY2cx4QQwsCCgoKwtLQENPNJW1tbs2zZMm1SuUmTJri5ueHs7EzevHl59+4dq1at4sSJE4SHh+Ps7Iytra3evofmzZvHnDlz2LFjh7aTZgR/f3969OjBx48fGTBgAK1atYqyXr5jhBBCCPG9kiscIYQQQggRr4gyhCqVimXLlvHw4UNmzJiBk5MT7969459//iE4OBgAGxsbNm7cyN27d5k1axYfPnyIsi+5ySbio1artWU4q1SpglKp5Pfff8fZ2Znw8HDy589PuXLlqF69Oj/99BMuLi5kzZoVgFu3brFv3z7u378PIAlpPQkPD9f+Lvv4+BAWFgZo5l3u168ftra2jB8/XnteSJ8+PTNnzuTDhw9cunRJ2z61vC8R5VGdnJxQKBSsWLGCP//8E4A+ffrQt29fvnz5wj///ENYWJg2GS+EECLlqVQqLC0t+fLlC0uXLqVx48ZcuHCBuXPn8vbtWwC2bNmCubk53bt358uXL+TMmZPs2bPTunVrrl27hq2tLeHh4Xr5HgoLC+PatWsMHjyYmjVr8ubNG86fP8/YsWNZt24d1tbWzJ07F5VKxcKFC3ny5EmU7eVaWQghhBDfKxNDByCEEEIIIVI/IyMjPD09mTJlCq9evWLatGk0bNiQFi1aMGDAAE6cOIGDg4O2BGGxYsX4+++/effunTZZKERChYWFYWZmBmhGQllZWfHo0SNGjRrFtm3b+Omnn2jVqhUvX77E0tIStVrN+/fvOX36NIMHD2bChAnaEpmQehKf3wuVSoWJieZPyS5duuDm5oaZmRkVK1Zk9uzZdOrUCXd3d/bs2cOMGTOYNGkSAAUKFGD79u0ULVo01b0nkRPM/v7+XLx4kUePHpE/f366devG//73P16+fMm///5L1qxZ6d69OwqFQjo8CCGEARgZGfHu3TuqVatGiRIlqFevHr169WLNmjXY2dkxbtw4MmbMyN69e6lZsya9evVi+/bt9O7dW3vOViqV2u8yXTM1NcXU1JT169djZ2fH3r17CQwMJDg4mDlz5vDp0ydGjRrFxIkTefv2LYUKFdJLHEIIIYQQqY2U7xZCCCGEEAly7Ngxxo4dy7Nnz7RzxsLXsryfP39myJAhNGvWLNq2krgRCXXq1CkWLVrEgQMHOHLkCAsXLmT37t1YWlpSuHBhChYsyJYtW8iYMSNLlixh6dKlfPr0ifz58+P
u7s706dPp2rUrIJ87ffry5Qt16tTBysqK9u3b8+bNGxYuXEjjxo1Zv349CoWCP//8kytXrtCmTRsGDRpk6JDj5e3tzbt373B0dOTx48dMmzaNV69eMXr0aBo3boyPjw/du3cnICCAvn378uuvvxo6ZCGE+GGtWrWKhQsXcvXqVaytrQFYv3493bt3Z+HChfTo0YN06dJx5MgRfv31V+7cuUPRokUB3V0ffFtmO/J+b9y4wfTp07lz5w59+vShXLly1KlTh169evHu3TsOHjyIsbFxjNsKIYQQQnyvJCkthBBCCCGiUSqVUW6URdi2bRt//fUXZcqUYc2aNaRLlw6A58+f07dvX4KDg5k3bx7ly5dP6ZBFGubu7k727NlRqVRs3LiRBQsWYGtry4ULF1i6dCl9+/YFNJ+zEiVK0K5dOxYvXoylpSU+Pj4cP36cTJkykS9fPgoWLAjIfIy6ENcN8hMnTjBs2DCOHz+Og4MDoLkBX716dUaPHs3EiRN5/fo1f/zxB8+fP2fXrl3kyZMnJcNPlODgYAYOHMihQ4e4desW2bNn5/z588yePRulUsm0adMoU6YMjx8/plOnTtjb27Nu3TqyZMli6NCFEOKHNGbMGE6ePMn169dRqVSo1WqMjY0ZOHAgO3bsYNWqVTRq1Ahzc3O8vLyws7PT6fEjX2c8ffpUe/3xrbCwMExNTbXbNGvWjHLlyjFx4kS5ThFCCCHED0eS0kIIIYQQIorw8HBtOcMbN25gYWFB1qxZyZw5MwCzZ89m37591K9fn8mTJ2u3O3nyJNu2bWPRokXaZLUQ8Xn8+DEVK1bE2dmZ8uXLo1QqqVWrFhcvXqRHjx6sWrUKgNDQUMzMzDhx4gQNGzZk6tSpDBgwAFtb2yj7k2S0bkROSB88eJD3798TEhLC4MGDAZg/fz7z58/Xzt0Z8f4sX76c0aNHc+/ePXLlysXdu3dRKBQ4Ojoa7LXEJOJzEvl1uri4MGPGDEBzPgPYu3cvS5YsIWvWrMybN48cOXJw9uxZrK2tKVeunMHiF0KIH93hw4dp1aoV58+fp3z58trk7+bNm+ncuTPp0qXjwoULlCpVCtDtSOTI1xq9e/fG29ubv//+m+zZs2vbRD7emzdvuH//PjNmzMDHx4fTp0+TKVMmncQihBBCCJGWyN0aIYQQQgihFTFXrEql4pdffqF79+5UrlyZQYMGceDAAQD69+9P1apVOXHiBMuXL9duW7duXVavXk26dOmQfo8ioTJmzKhNSKtUKgBq165N3759uXjxIlu2bAHAxMQEpVJJgwYNWLx4MePHj2fHjh0olcoo+5OEdPJFvpHeu3dvxo8fz8OHD/ny5Yv2PapZsyZBQUHa80JERxZHR0fSp0+Pl5cXACVKlEg1CenQ0FBAUwki4nPy6dMn7fq6desycOBAPD096dy5MwAtW7akbdu2vH37llGjRhEaGkrNmjUlIS2EEAZWpkwZmjVrxsCBA/nw4YN2NLJarWbBggUMHz5cm5AGdFoaO+I7xNnZmQcPHjBp0qQoCenIxwsMDOTs2bPMmjWLXLlycefOHTJlyhTt+kUIIYQQ4kdgYugAhBBCCCFE6mFkZMTHjx9p2rQp1tbW7N+/Hy8vL0aPHs348ePJnDkzVatWZdCgQfj5+bFq1SoyZcpE69attfuQOfFEYmTJkoUsWbLw+fNn2rZty9q1a5k8eTJPnz4lPDycGTNmkCdPHqpVq0ZoaCjGxsYMGDCAO3fu8Pnz5xjLzIvkUSgUqFQqWrVqxePHj9m9ezf58+fHwsJC2yZLliw0atSIf/75h9y5c1OmTBkAgoKCsLGxwcbGxlDhx8jZ2ZmBAwfi6uqKubk5AAsXLmTNmjWcPn2azJkzo1AoaNiwIV5eXkyZMoXJkyczYcIEevfuzdOnTwkLC9Mm34UQQhhWjhw56N27N1OnTqVatWr06dOHsLAwpk2bxvr16xkyZAig+woqarWa4OBgqlSpQvr06alSpQrFixeP9ThWVlY0aNCAUqVKUaJECSBqVSIhhBBCiB+JDCMQQgghhPjBfTuq+cCBA9jZ2eHs7Ey+fPk4ceIEV69excLCghEjRvDq1Svy5MlDz549yZ07t7asdwRJSIukePfuHR4eHvz666+oVCoKFixIt27dKFq0KH/88QcvX77EzMyM9evXc/z4cVauXMmYMWMMHfZ36+DBg7x48YLt27dTvHhxbUI64nyRK1cuOnfujLm5OS1btmTKlCnMmTOHDh06UL9+ffLnz2/I8KPJnj07AQEB/P7779plWbJkwdbWloEDB2qXWVpa0rJlS6pXr878+fPZunUrAJMnT2bBggUyEl8IIfQsoiLHtyJfr0Y8rlevHqtWraJq1ars2bOH7du3s2DBAn777TdtW12ctyOOF9Hx0tLSEicnJy5cuMD79+8JDw+P8zhZsmTRJqQjqhIJIYQQQvyIZE5pIYQQQggBgIeHB9myZSMoKIjTp0/TuHFj+vXrx4kTJ9iyZQvv37+nR48e1K9fnw0bNmBpaYmvr2+0OX2FiI9SqYw2wlmtVnP27FkGDBhAwYIFtWWhjx49yvz583nx4gWVKlVi69atXLt2jZ9//hmQOaT1pWfPnty7d48rV65EWxe5GsKdO3dYv349Z8+excbGBicnJ0aOHJnS4SbIhQsXaNq0KR06dGDJkiUolUo2bNjA33//Te3atVmwYIG27Z9//smSJUswNjbm9u3b5MiRQz5nQgihZ5G/07dt28bly5cJCgqiQ4cOlCtXjvTp02vbfluZJzw8nICAAO11qa6uDyJfs4SEhGirbQB06dKFAwcOsG/fPmrVqiUdM4UQQggh4iFJaSGEEEIIwcyZM3n79i1LlizRLrtx4wZ9+/Zl3rx51KhRg1evXlGnTh0+f/7MwIEDmTJligEjFmlV5JvEa9euxdzcPEp57oMHDzJw4EDatWvH/PnzAbh06RK7d+/m06dPTJ8+HQcHBykTr2etW7fGx8eHEydOxNrG3d1dO4dmSEgIYWFhWFtbp1SICfJtUmLr1q107tyZ+fPnM3jwYHx8fFixYgUbNmygV69eDBs2DJVKxahRo8ibNy+//PILhQsXNuArEEKIH8/w4cNZs2YNTk5OvH37llu3bjF06FD69u1Ljhw5orTV5/VA5O+QmTNnaqeAKFOmDIMHDwagfPnyKJVKNm/eTPHixfUShxBCCCHE90LqxQghhBBC/IAi+iVG3MR7/PgxPj4+2nUKhYLXr1/z8OFDbdLp8ePHVK5cmQ4dOtCoUSODxC3Svoibu7/++ivnz58nZ86cPHz4kPXr19O+fXsaNmzIn3/+yf/+9z8KFy5M3759qVKlClWqVNHuI6aR1kK3zM3NefbsGW/evCFXrlzR1n/58oVevXrRsWNH2rZti7m5eZTRY6lB5M+Jq6sruXPnpn379rx+/Zphw4ZRqFAhGjVqRIcOHQgICODPP//k6NGjBAYG8uHDB86cOUPOnDkN/CqEEOLHcuHCBfbv34+Liwvly5cHYN68eWzevJlMmTLRv39/zMzMtO
[... base64-encoded PNG data for the notebook's plot output omitted ...]
IiIiIiIqJ6MSlNREREREQ2cfP2fZSWlyPYUyV2KEREREREREREJCImpYmIiIiIiIiIiIiIiIiIyGbYdMoB/Prrr5g/fz5+/fVXsUMhInIIfC4SEWnjc5GIqC4+G4mItPG5SESkjc9FIsfCldIO4M6dO/Dw8MDt27fh7u4udjhERKLjc5GISBufi0REdfHZSESkjc9FIiJtfC4SORaulCYiIiIiIiIiIiIiIiIiIpthUpqIiIiIiIiIiIiIiIiIiGyGSWkiIiIiIiIiIiIiIiIiIrIZJqUdgJOTE+bNmwcnJyexQyEicgh8LhIRaeNzkYioLj4biYi08blIRKSNz0UixyKUl5eXix0EERERERERERERERERERE1TlwpTURERERERERERERERERENsOkNBERERERERERERERERER2QyT0kREREREREREREREREREZDNMShMRERERERERERERERERkc0wKU1ERERERERERERERERERDbDpLQDWLt2LcLDw6FUKpGRkYFTp06JHRIRkSiWLFmCtLQ0uLm5wc/PD3379sW1a9fEDouIyGEsXboUgiCgsLBQ7FCIiETz7bffYtiwYfDx8YFKpULLli1x5swZscMiIhJFaWkpiouLodFooFKpEBkZiQULFqC8vFzs0IiI7Obw4cPo3bs3goKCIAgC3n77ba3j5eXlmDt3LgIDA6FSqdC1a1d89tln4gRL1IQxKS2yXbt2Ydq0aZg3bx7OnTuHxMRE5Obm4tatW2KHRkRkd4cOHUJBQQFOnDiBAwcO4MGDB+jWrRvu3bsndmhERKI7ffo0Nm7ciFatWokdChGRaH788Ue0bdsWcrkcH3zwAS5fvoxnn30WXl5eYodGRCSKZcuWYf369VizZg2uXLmCZcuWYfny5XjhhRfEDo2IyG7u3buHxMRErF27Vufx5cuXY/Xq1diwYQNOnjwJFxcX5Obm4v79+3aOlKhpE8o5bU5UGRkZSEtLw5o1awAAZWVlCAkJwRNPPIGioiKRoyMiEtf3338PPz8/HDp0CB06dBA7HCIi0dy9excpKSlYt24dFi5ciKSkJKxatUrssIiI7K6oqAjHjh3DkSNHxA6FiMgh9OrVC/7+/tiyZUvVvv79+0OlUuHVV18VMTIiInEIgoC9e/eib9++ACpWSQcFBeHJJ5/E9OnTAQC3b9+Gv78/tm/fjsGDB4sYLVHTwpXSIvrtt99w9uxZdO3atWqfRCJB165dcfz4cREjIyJyDLdv3wYAeHt7ixwJEZG4CgoKkJeXp/W6kYioKXr33XeRmpqKAQMGwM/PD8nJyXjxxRfFDouISDRZWVkoKSnB9evXAQCffPIJjh49ih49eogcGRGRY7hx4wZu3ryp9X7aw8MDGRkZzMMQ2ZlM7ACash9++AGlpaXw9/fX2u/v74+rV6+KFBURkWMoKytDYWEh2rZti4SEBLHDISISzeuvv45z587h9OnTYodCRCS6L774AuvXr8e0adMwe/ZsnD59GpMnT4ZCocCIESPEDo+IyO6Kiopw584dxMbGQiqVorS0FIsWLUJ+fr7YoREROYSbN28CgM48TOUxIrIPJqWJiMghFRQU4NKlSzh69KjYoRARieZf//oXpkyZggMHDkCpVIodDhGR6MrKypCamorFixcDAJKTk3Hp0iVs2LCBSWkiapLeeOMNvPbaa9ixYwdatGiBCxcuoLCwEEFBQXwuEhERkUNh+W4RqdVqSKVSfPfdd1r7v/vuOwQEBIgUFRGR+CZNmoR9+/bh4MGDaNasmdjhEBGJ5uzZs7h16xZSUlIgk8kgk8lw6NAhrF69GjKZDKWlpWKHSERkV4GBgYiPj9faFxcXh6+//lqkiIiIxDVjxgwUFRVh8ODBaNmyJYYPH46pU6diyZIlYodGROQQKnMtzMMQiY9JaREpFAq0bt0aJSUlVfvKyspQUlKCzMxMESMjIhJHeXk5Jk2ahL179+Kjjz6CRqMROyQiIlFlZ2fj4sWLuHDhQtV/qampyM/Px4ULFyCVSsUOkYjIrtq2bYtr165p7bt+/TrCwsJEioiISFw///wzJBLtj3ilUinKyspEioiIyLFoNBoEBARo5WHu3LmDkydPMg9DZGcs3y2yadOmYcSIEUhNTUV6ejpWrVqFe/fu4bHHHhM7NCIiuysoKMCOHTvwzjvvwM3Nraqvi4eHB1QqlcjRERHZn5ubGxISErT2ubi4wMfHp85+IqKmYOrUqcjKysLixYsxcOBAnDp1Cps2bcKmTZvEDo2ISBS9e/fGokWLEBoaihYtWuD8+fNYuXIlRo0aJXZoRER2c/fuXXz++edV2zdu3MCFCxfg7e2N0NBQFBYWYuHChYiKioJGo0FxcTGCgoLQt29f8YImaoKE8vLycrGDaOrWrFmDFStW4ObNm0hKSsLq1auRkZEhdlhERHYnCILO/du2bcPIkSPtGwwRkYPq1KkTkpKSsGrVKrFDISISxb59+zBr1ix89tln0Gg0mDZtGsaMGSN2WEREovjf//6H4uJi7N27F7du3UJQUBCGDBmCuXPnQqFQiB0eEZFdfPzxx+jcuXOd/SNGjMD27dtRXl6OefPmYdOmTfjpp5/Qrl07rFu3DtHR0SJES9R0MSlNREREREREREREREREREQ2w57SRERERERERERERERERERkM0xKExERERERERERERERERGRzTApTURERERERERERERERERENsOkNBERERERERERERERERER2QyT0kREREREREREREREREREZDNMShMRERERERERERERERERkc0wKU1ERERERERERERERERERDbDpDQREREREREREREREREREdkMk9JERERERERERERERERERGQzTEoTEREREREREREREREREZHNMClNREREREREREREREREREQ28/8pS6dpnsyewAAAAABJRU5ErkJggg==\n" + }, + "metadata": {} + } + ], + "source": [ + "target_samples, intervals = get_s2st_delayed_targets(delays, target_sample_rate, prediction_lists, speech_durations)\n", + "\n", + "plot_s2st(\"/content/LJ_eng.wav\", target_samples, target_sample_rate, intervals, delays, prediction_lists)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Seamless Unified Inference" + ], + "metadata": { + "id": "Yy12VEzvJ1zo" + } + }, + { + "cell_type": "code", + "source": [ + "# If you haven't already above, please follow instructions to download\n", + "# SeamlessExpressive here: 
https://ai.meta.com/resources/models-and-libraries/seamless-downloads/\n", + "\n", + "!wget \"https://d11ywzt2xtszji.cloudfront.net/SeamlessExpressive.tar.gz?Policy=eyJTdGF0ZW1lbnQiOlt7InVuaXF1ZV9oYXNoIjoiZ2sxMzhuZnNkNDQ0dmM2dDhhazgxbWluIiwiUmVzb3VyY2UiOiJodHRwczpcL1wvZDExeXd6dDJ4dHN6amkuY2xvdWRmcm9udC5uZXRcLyoiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkFXUzpFcG9jaFRpbWUiOjE3MDI1NzIxMjl9fX1dfQ__&Signature=npTULjeiKp9U8hUng4f9Njb6QKpK52Rl9pQjRpamsQSNzWgYeshMABRUNjWQJrw5givbbdGhaa6mW2l3UYHi66x3rBLazIS7d7npHu6aTElyNRZtFgjKMlNWSRfZOXh7NsQSZOFwWy0VxJwVZ%7EKtJnBWvgh7Mov3SKeJFeJEdAESDVO%7EWCHO1Z2zIWl%7EIkfpX5OnMqz7ntU9SpzsVpEHgefcyktm5NZ2xIr%7EoOml3YUXwNEUDj5PhLUkeoSHpFXHSzI0S0GHlxp48C162gUS8qK1HtaXalk7GUDem%7ErAGpx-Bo9oPBe33PdSsvpqngT9E32eS33oJoU1am4RGKFysg__&Key-Pair-Id=K15QRJLYKIFSLZ&Download-Request-ID=1024805765443779\" -O /content/SeamlessExpressive.tar.gz\n", + "!tar -xzvf /content/SeamlessExpressive.tar.gz" + ], + "metadata": { + "id": "smeOkMUSyLRk", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "061652e9-2ba0-481a-9ebe-d3b09fd00968" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-12-13 21:45:45-- https://d11ywzt2xtszji.cloudfront.net/SeamlessExpressive.tar.gz?Policy=eyJTdGF0ZW1lbnQiOlt7InVuaXF1ZV9oYXNoIjoiZ2sxMzhuZnNkNDQ0dmM2dDhhazgxbWluIiwiUmVzb3VyY2UiOiJodHRwczpcL1wvZDExeXd6dDJ4dHN6amkuY2xvdWRmcm9udC5uZXRcLyoiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkFXUzpFcG9jaFRpbWUiOjE3MDI1NzIxMjl9fX1dfQ__&Signature=npTULjeiKp9U8hUng4f9Njb6QKpK52Rl9pQjRpamsQSNzWgYeshMABRUNjWQJrw5givbbdGhaa6mW2l3UYHi66x3rBLazIS7d7npHu6aTElyNRZtFgjKMlNWSRfZOXh7NsQSZOFwWy0VxJwVZ%7EKtJnBWvgh7Mov3SKeJFeJEdAESDVO%7EWCHO1Z2zIWl%7EIkfpX5OnMqz7ntU9SpzsVpEHgefcyktm5NZ2xIr%7EoOml3YUXwNEUDj5PhLUkeoSHpFXHSzI0S0GHlxp48C162gUS8qK1HtaXalk7GUDem%7ErAGpx-Bo9oPBe33PdSsvpqngT9E32eS33oJoU1am4RGKFysg__&Key-Pair-Id=K15QRJLYKIFSLZ&Download-Request-ID=1024805765443779\n", + "Resolving d11ywzt2xtszji.cloudfront.net (d11ywzt2xtszji.cloudfront.net)... 65.8.49.128, 65.8.49.90, 65.8.49.107, ...\n", + "Connecting to d11ywzt2xtszji.cloudfront.net (d11ywzt2xtszji.cloudfront.net)|65.8.49.128|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 3808363189 (3.5G) [application/x-tar]\n", + "Saving to: ‘/content/SeamlessExpressive.tar.gz’\n", + "\n", + "/content/SeamlessEx 100%[===================>] 3.55G 40.1MB/s in 81s \n", + "\n", + "2023-12-13 21:47:08 (44.7 MB/s) - ‘/content/SeamlessExpressive.tar.gz’ saved [3808363189/3808363189]\n", + "\n", + "SeamlessExpressive/m2m_expressive_unity.pt\n", + "SeamlessExpressive/pretssel_melhifigan_wm-16khz.pt\n", + "SeamlessExpressive/pretssel_melhifigan_wm.pt\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# You may need to delete earlier loaded model to free memory\n", + "# del system, system_states\n", + "# import gc\n", + "\n", + "# gc.collect()\n", + "# torch.cuda.empty_cache()" + ], + "metadata": { + "id": "Q_hyGuCgMy6O" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# TODO: to run Seamless unified inference, need to download gated model\n", + "# and specify gated_model_dir (here we use `SeamlessExpressive`)\n", + "from seamless_communication.streaming.agents.seamless_s2st import (\n", + " SeamlessS2STJointVADAgent,\n", + ")\n", + "\n", + "print(\"building system from dir\")\n", + "\n", + "agent_class = SeamlessS2STJointVADAgent\n", + "tgt_lang = \"spa\"\n", + "\n", + "model_configs = dict(\n", + " source_segment_size=320,\n", + " device=\"cuda:0\",\n", + " dtype=\"fp16\",\n", + " min_starting_wait_w2vbert=192,\n", + " decision_threshold=0.5,\n", + " min_unit_chunk_size=50,\n", + " no_early_stop=True,\n", + " max_len_a=0,\n", + " max_len_b=100,\n", + " task=\"s2st\",\n", + " tgt_lang=tgt_lang,\n", + " block_ngrams=True,\n", + " upstream_idx=1,\n", + " detokenize_only=True,\n", + " gated_model_dir=\"SeamlessExpressive\",\n", + ")\n", + "system = build_streaming_system(model_configs, agent_class)\n", + "print(\"finished building system\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7LATtfmGJ5hW", + "outputId": "c1cd5a69-5366-4f99-84f1-e5fa75970878" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "building system from dir\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Using the cached tokenizer of seamless_streaming_unity. Set `force` to `True` to download again.\n", + "Using the cached checkpoint of seamless_streaming_unity. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.\n", + "Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.\n", + "Using the cached checkpoint of seamless_streaming_monotonic_decoder. 
Set `force` to `True` to download again.\n", + "Using cache found in /root/.cache/torch/hub/snakers4_silero-vad_master\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", + " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "finished building system\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "source_segment_size = 320 # milliseconds\n", + "audio_frontend = AudioFrontEnd(\n", + " wav_file=\"/content/LJ_eng.wav\",\n", + " segment_size=source_segment_size,\n", + ")\n", + "\n", + "system_states = system.build_states()\n", + "# you can pass tgt_lang at inference time to change the output lang.\n", + "# Seamless unified supports 6 output languages (eng, spa, fra, cmn, deu, ita)\n", + "delays, prediction_lists, speech_durations, target_sample_rate = run_streaming_inference(\n", + " system, audio_frontend, system_states, tgt_lang\n", + ")\n" + ], + "metadata": { + "id": "1Go-cO6OKS3q", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "80323e57-3849-44e9-b125-2edb57e563e9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Using cache found in /root/.cache/torch/hub/snakers4_silero-vad_master\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "3200.0 El examen y el testimonio de los expertos\n", + "4160.0 permitieron\n", + "4800.0 a la Comisión\n", + "5120.0 concluir\n", + "7040.0 que\n", + "7360.0 cinco disparos pudieron\n", + "7583.9375 haber sido disparados,\n", + "End of VAD segment\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "target_samples, intervals = get_s2st_delayed_targets(delays, target_sample_rate, prediction_lists, speech_durations)\n", + "\n", + "plot_s2st(\"/content/LJ_eng.wav\", target_samples, target_sample_rate, intervals, delays, prediction_lists)" + ], + "metadata": { + "id": "ptr3nXlQKYed", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 341 + }, + "outputId": "e3c91ea0-9f46-4aa4-86fb-25bc1125eae8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Output translation (without input)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Output translation (overlay with input)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAB6UAAAGsCAYAAACVaHIlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdZWAUVxcG4Hcl7k5IiBFC0EBwd6e0RSq01I26F4qUYqW0pQL1FipfCxVoaYu7uzskJJAEiLuuzPcjZMkm6zubTcL7/IHszs7cyM7O3HPPORJBEAQQERERERERERERERERERHZgNTeAyAiIiIiIiIiIiIiIiIiosaLQWkiIiIiIiIiIiIiIiIiIrIZBqWJiIiIiIiIiIiIiIiIiMhmGJQmIiIiIiIiIiIiIiIiIiKbYVCaiIiIiIiIiIiIiIiIiIhshkFpIiIiIiIiIiIiIiIiIiKyGQaliYiIiIiIiIiIiIiIiIjIZhiUJiIiIiIiIiIiIiIiIiIim2FQmoiIiIiIiIiIiIiIiIiIbIZBaSIiIiIiIiIiIiIiIiIishkGpYmIiIiIiIhuWrYnCc/9chQFZQp7D4WIiIiIiIio0ZAIgiDYexBERERERERE9rZ0dxJm/3dW83XyglF2HA0RERERERFR48FMaSIiIiIiIiJAKyANADsuZtppJERERERERESNC4PSVKeYmE9ERERERPVRam5JrcceXnoQR67k2mE0RERERERERI0Lg9JkczsvZuKPwymImLIGkVPX4mp27ckeIiIiIiIie3ppxXGdj7+/7nzdDoSIiIiIiIioEZLbewDUuKnVAh5aelDrsb4fbIOfmyOWPdoF7UO97TMwIiIiIiKiavRlRKtY7YmIiIiIiIjIasyUJpvKKirX+Xh2cQXGLNlTx6MhIiIiIiKq7aONF/Q+d+RKLlTq+hWYzigow/X8UnsPo85lF5Vj5ZFUXMkutvdQiIiIiIiIyEzMlCabuu/b/QafP5Waj3ahXnU0GiIiIiIiotoWb00w+HxKTgki/N3qaDSGqdUCus7fovn60/s6YExcU0gkEq1thn6yEwkZRVj6SGcMjA2yx1BF9eafJ/D74VTN10NaB+GD8e3h7epox1ERERERERGRqZgpTTaTW1yBy5mGV7DfsWQ35q89h+0XMpBbXFFHIyMiIiIiIqqkr7pTdfUpTzotTztD+qUVxxE5dS02nrmheWzkZ7uQkFEEAHjsh8MoU6jqdIy2UD0gDQCbzqajw+xNSMoqRkZhGYrLlXYaGREREREREZlCIghskEW28cfhFLzx50mTtw/wcMKhaYNtOCIiIiJxqNQCrueXItTH1d5DISIiK72w/Bj+PXHN4DbjO4XiwwlxdTQiw9rN2oDCMvMCsO5Ocvz9XE/4uTnBx63hZBafTsuHr5sjkrKK8cB3B4xuf37OcDg7yOpgZNSYCYKA1NxShPq4aFUgICIiIiIi67B8N4li/ekb+HjTRXx6fwfENvG0aB+ZheUQBIE3fUREVG/9uDcZeSUKnEzNw5bzGfB2dcDR6UMglfKzi4ioIVKo1EYD0gDw55HUehOUNjcgDQBF5UoMXrQTALDs0S4Y0DJQ7GGJLiWnBKMX7zbrNauOpmFitzAbjYhuB4Ig4OFlh7DzYibahnjin+d68zqPiIiIiEgkDEqTKJ753xEAwL1f78eJd4ZavJ/IqWux840BcHaUItDDWazhERERieKdf85ofZ1XosCOi5kYEFv/J/eJiKi23JKG1UJo16VMq/fx6LJD2DNlIEK8XUQYke2cuVZg9mve/usUPt+WgBVPdcfexCwMbxMML1cHG4yOGqvtFzKx82Ll++x0WgGi3l6LT+/rgJ0Xs3ApoxCpuaXIKa6At6sDBrYMxGO9I9E2xEtrHznFFVALAvzdnezxLRARERER1VsMSpOo8ksVmv9/uSPRon30/WAbAGBCp1C8P649VyUTEZHdlSlUeG/tOZ3PFZQpdD5e5d1/z2DHxUz8+3xvuDnx0ouIqD6RmlGlqaBMAU9n+wU4BUHApO8PirKvvQlZmNC5mSj7shVLC2il5ZWiz8LKe8r/Tl7Hz493E3FU1NjtTsiq9dhLK47XeiyvRIFVx9Kw6lia3n19ML59vX+fERERERHVJam9B0CN1+XMYqte/8eRVES9vRbX80tx5EoOVOpb7c9XHknFxjM3rB0iERGRUcdT8hA7Yz1+3HdF5/PG2k4s25OMy5nFaPPOBmw9n26LIRIRkYU2mHFP8YqOwFRdWn3ceJlxU13Osu5erS6IsTR516XaAUaiuvLGnyex7UIGSitU9h4KEREREVG9wHSdRqagTAFHmRTODrI6O+ZZC8qqmaPHe1sBACHeLugS4YOSChU2nq2c1E9eMMqmxyaiupdeUAapRIIAD5a7I/srU6hw1+d7DG5jaNJ89XHt7JnHfjjMzy4ionpk2l+nTd52y/kMG47EuJd/Oy7avr7cnoje0f7oFe0v2j7FZmzRl6kEQRBtX0TmenTZIXSL9MVvT/ew91CIiIiIiOyOmdKNyOm0fLSftRGxM9bjvXW6S4zaQnphWa3HcorF782WlleKv49f0wSkgcpgARE1HqUVKnSbvwVd5m1Gy+nrEDFlDb7dednew2rUsovKETFlDSb/74i9h1IvrT113eg2hua5dZV7jJiyBik5JRCEygogCRmFyLj5WVquVCGvgfU3JSKihumB7w7g7b9O2XsYeokVRn7w+wMi7cm29iZk4URKnr2Hcdv7fneS6Ps8kJSD6/mlou+XiIiIiKihYaZ0I5GQUYjRi3drvv56x2XkFldg4fg4mx/72NW8Wo/N+e+szY8LADNXn7bp91hQpsDp1Hx0j/Jjb2uiOvDLgVvlkcuVagDAvLXnMLxtEzTzdbXXsBq1e77eBwBYd/oGLtwoRMsmHnYeUf1ScfPv0BCJBdPmVb0u+7Tw15QWdXGQofTmYqu1L/ZB66aeZu+XiMSXVVSOKStPYfM57fL7s+9sg4d6RNhnUCSK5AZQwtrWfj1wFZP7Nbfpddaqo6m4kF6IKcNjzcpYFiu5eU9CNlJySur1teSijRfw2dYEAMBfz/ZExzAfO4/o9lRYprDZvnu8txXHZw6Bt6ujzY5BRERERFTfMVO6EShTqDB40c5aj/9+OBXpBbWzmMX22ZZLtR5LqqMJnt8Pp9pkv/klChSUKTBk0Q5M/O4APtx4AU/+dBiTvj/A7GyRFZYpNNmCdPsqLlfi7LUCzF2ju8pDVQCPxJeYeet8bU5fTbpl8dban4Omqt7rsrTa58vIz3YhLY8ZNUT1Qee5m2sFpAFg5uozdVqdiMT39M+sEgLYfkHxq7+fwNc7LmNfYrZZrysWsQ9vRmG5aPsSU3ZROTaeuaEJSAPA3V/sxYHL5v2sSBxqG9+Wvv7HCdsegIiIiIionmNQugFLyyvFyE93IXbGer3bKG19V9VAzFtzFv0/2IYxS3bj820JBoOgGQVliJu9Ee1nbUR6QeXkxRfbE7HpbDp2XcrCLweu1tWw
w9CkfmZdyYbuyBOZXSF5AG2D0m7cfu+NLwmhUFdDEQHaesQtgbXM0uhIjnr7vmn4S56MwCjtarnOqANEChRKouvB6QBvAIgjpn4//u2XIz8frVik7Hg+HDudPlY2NAOi5dYCBKdPpSX3+5Tg+KQsEHSryANMC9T3pyuXARXu7ZE29MidHr9QwYMICiRYty9uxZHB0dCYwJxDqXNbl65yLkXAiP16WcAtohTKHLXzq8nr9672adnkV4jOkZiopP2cLMBTqqXkr5d3yxO3quVanKk7Hj+K1TNdb+uzZBG8+A+F/HVZdND0AGRaZtYpM5azjvubsn5UYpeD0gDRCjxCSbZj+1FEUhKDKIcYfHJRmQBph+cjrnn54nRp+wrNDwv4ebbTyJeRz6OF5AGuDcs3OJpjwXQghhPhmXl0MIIYQQQqSLTi8Jbd5ViqKg+q92a8S9C4TfOoXayh5L99zY5C+PQ+km6EJeEH7zFBoHdxxKNQLAwj0XDuU+QKXWoHXMZuxPUne/b8z72fCtxQqz9pceedT+5MGfQdH9UVI5h/qw9cAMGtVb6PIWqJXCzdyAOzAzzgqkp5dh/+SMHdfT5OuGm+TYT2k/9uh8w7+4tDYQGyDRWoOFDTScCJa2UKAB+F8CtwJg65r28wrxnjI1iDd472DOdz3P9pvbmXZyGq0KtqJ3yd7pSie8ev8s5phprokqmSSTfbfr8XeGi3kMv7OuBySe/jlk3z6CNycMWF8uXIQcE8bz6KuvAci/6w8sc6dtFXLpWwq/Tkp+ouL9fv2xyJmT6MeP4b+gtF6rpeD2bcbzqtVqBg8ezO3bt7l58ybWNtbUPK9HrVPYV8oWr65ePFjyAAtXC9zquSU4R85nCp326alwzfC+fXBMF2+F8oZrG+hUJOVsHlH37+N69g4ATU7oOVRUTfYXCsXvKOwtqTKuClfrFXI+g29Xvwpcl7yhI7Hb3v+br+P7tmpUChS5rzBVPwW/fH642SR8HXHle6RQ9bIeXZtQNPZ2KY49oyQWUE7JovOL6FmiZ4rtFDNeXw7dP5Rdd3al2G79tfWsv7aej4t+zLAKw+Lt23k7Y8tUJVVD3G+DH8c7H+dl1Evmnp2Li5ULn5X9LEPHIoQQ7xMJSgshhBBCZFEWmlcrpa9NbGKsPRzr6ctIKkzc/aaHJcwgNiD9kf0FftwwlrIVq3Li8kkC/W/hXOMjnKq0w7FCSwLDlxN6cS9aezds8pcHwDrXq5qocYPb4t1UPKcjQeHR3HsRbrxV6KgK57Z1JwpFLCWShOnbv/IrQr0inmjVKnK52sbbN2bzRZYeum18vsnyqwwcfdrdsv6I65p8uDs78enDD7io+BCNluj//oS1IiqTR5jFPT4H+6eAY064+Du0XwknlxpWA9ccZqg3ff3P+MdsGZRoV++8uCv2YiIM/zb1S9hu+C0JTAuRStV/qW5y29+u/sbYw2MBmH92PvPPzqeIaxGy2WajYZ6GNM/f3ORrnpNPTvLJH+bLnpFSX95P4WIew2MFwwrR11MS60NCkjw+NiANcKNhI5NSeqdH9INXS8F1ioImJoYbDRuR58RxUKuxtbWlaNGiTJgwAT8/Pz6pUZOvXxreg347dFzJac+Uoq6c3/wMl2rOqK3jT46cvjD5wOn3x75PEJSOe017+MFhHKwcsOo8wLiv0ANVvDTcVtFqtlVUUeesnr7bk/76FLmbMNA6ct2r9nezKcw+M5uWBVoCYBGtUPK2woU8KoreVei1U8+hIio+OGbo52r58micnckx6Xvsa9V649fhM07NSPUxM0/NNAalDz88nGzbsOgwbC1sk21jClMC0nEtv7ScgWUGYq21Tve5TZVcbfPXV3d3KtIJdxv3jB6SEEK8F6SmtBBCCCFEFrbh9H1sLDQ0Lp6wfmZEtI7CX2fsDPJ32aTWJehQ0bAi5ElwBJW+y9j63WM+KMqH5XNhb2UIqp07d44PPviA6dOn07p1a4asOcHyZct4sWsubo0HYl+yAZEP/yXoyDp0L5/h/sEwLFxzpuqcQxv60q1aXrRqFVZaNXv/9afH0hMA5HGz5c7zMLO/TmEefw6pSUFPB+NznxHb8FE9Yp/VF5k4KiFEqnTbDj6m13AV4m132v80f9//mz6l+mClsTKpnrSpnK2c2dN2DxYaixTbllxanLUprBhOyeBeGh7+V8c4bjA0MT/XV7OjQvwA18EOB7kVdItf//2VHsV74LL/HP4jEqbtTkyB/fuw8PQ0Pje1pnRq6RUF9X9B1Sn+/tzxzskzoGfPnpSsXhKvvF5c2HuB9m3a8Jl7Nnq6vVpNHK0oRCsKR0prmO/3KihtFaWw4seE732PQRpCbF8FcOPW9dUrekotLwVAszzN2HpnKwBTx4aR2zLh5LtYj50he2DSr6/dSC1Lp8VgG5ncuwB/9SrLb9qzdPtLj88TBc9k+owr25AhuPfulWI7c/4cpFVjn8Z8XPRjOm1PfoV6Oc9yLG28NN3nS8trbuTTiKm1pnLs0TH8w/0ZeWBkuseRmG+rfMufd/6kZ4me9Pijh8nHzak3J0NqbgshxPtGgtJCCCGEEG8pvV4h36jtZu9322fVKeblxP2AMCZuu8yOCynXjHubtCnrzY/tSiXYnpGB6WOj6uHhGH/m/9atW+nXrx+nTp3Czc2NdSfvM/y3cwTs+5mQs3+Qs88S1Fa2hP17iOiABzhVbpuqc3o5WXNoZL0E22NXoszec42pu66m63WJVyrldeXSo2BeRiR/49pU+4fVJo/bq/SQ0To9jb9axF9Ww5I5SgiRpTjkgM9OG1KCC/GOi9HHUGZFGePzH2r9wLD95v+ddajjIRwsDZO2onRR1F5bm5wOOVneeDnKmQvcHzmSy/oH+D5M33mu5IRvPjZMJEwpKA3QdYgGrxdw2xNjWum4ql/Q89kW01Zv29erh9fkSWjs7YGMC0qD4bqw/4P73ImOpl3BfOzKFsW9M0+I9tWQrYUH1jmtaTbMn6lP/fnRKycNHBziHa9XFNbV1LC+uiEoP+nnGPIl8qfDxPZqzuZ7FbifWmsqVhoraueqzT8P/qHP7j4oegWVWoWiV7g39x6VrygMcHengJVVml5bu5Falv0Yg40JyVX+zZl4LfGUpLSq/a+7fzF47+DUd5yJ4k4YSKu0BuKHlBvC9JPT033+jNIsXzPq5a6HjdaG0h6lsbPIvFTuQgjxtkpdkS4hhBBCCJFlqNUqulX1MXu/xbwM6Qa9XWyZ91E5s/efWWoUdOf2JL9EA9IAno7WdKqUthp+yVEUhWwOCW+meXh48PjxYx48eIBKpaJ5cQ/GfFAUhzJ+qLRWRNy/CIBtoarGgLSSRO2zxHSr5pPo9tg0gwU8HBLdL1JvafcKrO1dhd/7VjWpfbUCydctBIwr6mNZaNTYkMJSHxP52xY0Sz9CiBTUHCYBafFOCY4Kxj/Mn7DohJlW4gakgQwJSANUXVOVA/cPoFf0lFtZDu8bwYwbdoHbJcpyp8vH6O6nPyAN4PFfOWyPANPW8iybruP7ZTp67Ep4rabWKyYHpAFC/
vqLq+UrpNwwDRRFIe76pJPh4dyMimJuTm+6xsCqR5aMtnWi8IUYore9oO7hGNo7O/OhszODHz7gWUz8AL1apaL9gVevLbGANMDotfFf/9D9Qxm4ZyAjDozAP8wfAJVaRXRANNdGXEMXrqO7qyvOmlersGPHnZr1Vaa2TEtAGiDa3z/Jffde3nvrAtJgCCg/CnmUqmN0eh2brm+i5caWjDk0Js3nzsoBaYCtN7cyZN8Q+uzuw0fbP+KM/xn23t1LjN48k1KFEOJ9ICulhRBCCCHecmfvBdJizj9m6Wvn4BoUzu4Yb1tQWDSlxqWuLlhWY6lV8+/4xinWffN/GUHFieZbLa0oCn9+XgtfTwdu3rxJYGAgxYsXx9LSEn9/f7p27Yq1tTU//fQTHh4eABw5eZrqdRuTrdUorLIXiNdXaurW7R1am7zuSc/eVxSFvCNTXmlfo6A7DYtl51FgOHP33TD5/G+jmR1KM+iXMya3vzSuEc9DohLUbQ4IjaLNvEPcfBaa6HG3J/kxbssllvxzK9H9E1oW56PKeRJs1905gubnRsbn+SJWokfN5XGNsbHUJGiflB93/cv/9lwHoKb6LMstJ5t87BvVZQMEP4LclcEtf/x9L5/Aj76ZM663RdGW0GI2XN8NRVpAZDCotWBlWHnHjb2womVmjvDtUaS54b20dkq5rRDvsBEHRrDt5ja6FevGwDIDidRFYmdhR6QukoqrKmbKmExZxZzmvquraXcw9bWp2438b2KZolDqlpIgIGuq/WsH8cvfs5k3J32pyBNzKSICD62WCxERjHvymKW5csdLlb3o+XN+Cwrklzw+OGs0BMTEcCMqivK2idcbjn3NyX09Phmk4aVt4teyiqKAHh6teURMUAy5++c29hWs06FRgZ361bXOgdAQtKioYpf0tW67kdoM/f4AQwrxz/pqaZ6/OTVy1qBUtlI4WzujKAqVVlfK0HO/SR42HrT2bU3tXLVxtXLF1sIWewt7VCoVUbqoBDWYBTTP3xwnKyfKeZSjmHsxFEXB2doZa0387FmJ/X0Xo49Bq9Ym2C6EEO8C+XQTQgghhHjLpSJOmSKtOmEiHSfblGv3ZXWXx6UckAbwcLDm6Kh6Zkvj3aCoJ76eDsybN4+RI0fi6OiIm5sbs2fPplq1anTr1o3Zs2fTs2dPpk2bhl6v5+cF81Fb2aKxc4nXl0ql4svGhWlSPDu5XW1TTN2eXEA6tr9b3zdNMTC94pNXN9SGNy6MoigsPXSbsVsupfDq3z5V8rlxYHgdlh66zbFbL6ia3412FXIx+JcznH8QZGzXsWJuRjQpjK2lFlvXhH9SudhZsmdobXxGbIu3/efuFajg4wpADifrBMcNbejLgLpJr2LW6KONj30iVhsfpyYgDRD3J+FvfeKZAzJVznKGgHRyAUB7jzc3nrdR4Wbw4c+gVkOxVoZtNs7x29hli/+82mA4swpCn2bcuPr8A/OzeI1llRqq9AfnPFChp3l/yQrxljvjfwaApReXsvTi0kwdy5uQloA0QO/tOg4UUzFmddqOj1Wr/UzsC5jnM0inKGj++zzbEhzEiEePWJorNxYqFS90Op7GxJDb0pIYRUGrUvGhszOznj3lfEQ4NezscdFqKa81XPMkNlGy2kU9x32TH6vmtbcjbj8qlQo0EBMcg12YQs+dOtYHBXI5IoI/X4bgoFHzffYclLCx4VlMDCteBBCp6CljY4N1In+/AGR/kfHrsGJrWm++sZnNNzYbt2tjFD48rFDgocIPH6oTTen+NvEP92f+2fnMPzs/s4fy1oj9flhxaUWajj/Y4SBOVjIZTgjx7pGgtBBCCCGEMMrp/O6lGT03piEatek3gjwdEwYLU0vR61CpNcz/qBy3b99m5cqVLFy4kPz58zNw4ECGDBnCjz/+SPv27VGr1UyZMoVSpUrh7e2NhYUFHh9+i9YhYYrnT6rnxVJrvgo8KQXq87glXA2jUqnoXi0v3avl5VFQOH9d9uerjRfMNqbM4mxrYaz7/XWzovH2Le5anopxJip83zptdfLqFHoVSLVI5OZkcgFpAHJX5q42L+ciMykga+UEkUEpt0uPr5+DxoQ/U1Uq+PoZjHd/ta3XPlhQO6NG9napP9YQkE6OZzEo1w1OLv3vmDGQrTBs7JMxY+q6BbIXhxyl4NHZtPfzbSCMdTbXqKD7Tgi6Dw9PQ51Rr1aSCyESeBCSxhzHGcQuPGsmf6x3VqHeWfOMrdx18/QTG5C+FBHB8xgdE7Jnp8J/q55r29nz9ePHLMudm2z/BZ5vRUXiY2lJLgvLBH0ldv04aHPKAfiStxT+LmE4VtEpqF67FlIUBbtCdmh/ecbYK1fxtrCgpLUNY7NnZ/GL50x96s+y3Hlw12pp5ujIr0GBSQakAcatMP8K88Ss/CGGAX01lLilcN1LReV/FTruf/V+rJmi40Z2uJpTxQsHFZdyqbjm/XYHqUXGO/zoMI19Gmf2MIQQwuwkKC2EEEII8ZazsUjdKsmk9KmVP9UrLrOqjf2rUTi7Axq1CgtN6oO4+4fVptYP+9J07tiA9OpPK3H/3l2Cg4OpWrUqbdsa6kLv3LmTOnXq8P333+Pi4kLbtm1p27Ythw8fBqBKlSr4jNhm7Cde33Eq4+V2teXui4Q1HSHxYHJaLO+RfCrOHE42fFQ5zzsRlN4/tE6S+zwcrSmdy5kz9wJT1ee4FsX4ZpOhNviIJoXj7WtbPhe/HL/HlccvTe9QY4Hb0OMMGJPOdPqv3UwOVmxxVCX+vRTPsGuwohXcMU+5gESZEpA2trWAgadgZRv4YCZ4lUn5mPeGCUEMlcrwvn0w89U2dRp/B5T5CBp9Dw9OJkwJrtLAwJPgmtfw/NO9MM41beexcjLPyuV8taHTr4bHWivD/yXbpr9fId4zhV0Lk8shFy8iXnDyyck3fn7rqDd+yrfarpfBDHn4EFeNhhleOY3bh3t4MPDBfbrfu0tDBwfyWFgy//kzCllZ42OZMCidVgO26jmVX4VrkMI9TxUK4L/ZH12YDmsva+wK2+FWz42R+y24HBFJfQd77NRq7NQazkeE8ywmxriSu7mTEyfDw4lSFCxIPFDubMKljTlYxsCC/yUfAM//GPI/Voj7+/lMXhUHiqso+EBBrcCmymqeOsd/HdoYBZ0alDgTbN2DFObONZzv854a7meTAHdW52HjQVBUEJYaS5wsnQiJDsEvnx+Waku0ai1e9l5E66Ox1ljjH+ZPPud81M9dP7OHLYQQGUKC0kIIIYQQb7mCng50q+rD0kO309VPiZzpSw9WOLtD6gJsiahewJ2D15+l6diczjYs/Lg8hf4LRqdHHjc7yuZ25tTdwFQdpygKKrWGrZ8Uo2PLOkRFRXHz5k0aNmxobOPg4MDSpUtp0aIFM2fOZPDgwRQrVowqVaq86ieRgDSARZzVIIWyOyQZlF7bq0qi2xPTr3b+BLWiC2d3YPknFfFwSP+q8dQ69XUDyo7/M119ONtaEBgWnXLDOKwskp+80LtmPvquOpWqPrtUzkPZ3C4U9LTHShv/62lnpWXn
4JrsveJP96XHTe7TztqCr5sVZfxWQ/r0XUNqpmpMED99N0D7qK/ZYTUy+YM+O/Nf8C4Db3xW7JX6Y9zyw6AzZh/KW09JY8pYVRI/B/nqgHtByFUJIoLAuzxobSCbLyjKq0Bx/jow6iFY2EJEoKG/19OwqzVQqiOcXZP68Y24Y/g/T3W4czD1xwOU6QIfzEp5JbkQIoHeJXvz07mf+L7G9zTL1yzevhLL0pZFJLXa+ralR/EeeNh6UH9e2TdyzrfV62m2K9jY8rGLCysDAtDHaeNlYcHK3HkY/+QJR0PDOEQote3t+dLDEwC9oqA2w4QgRVFYMtMQTH0RE8NHd++STa3GVavhblQwikrNYA8PytjZUcbGMMEySlE4EBrCr4GB9HNzR6tSGQPTY7NnN/b7Nip9S6H0rVdjb3g69Su7py1K/JgYNRwspqLAQ4V72VT4O4NtBDiHQoVrClEa+NdbRYk7ib93a2qq6eSfj2wdO+PWvkOK4wiJCqHLji5cD7xOPqd83Ay6merX8jbrXbI3DfI0wFZry/OI5+RxzIOLtUvKBwohxHtIgtJCCCGEEO+AMc2LUczLkWG/nUtzH/bW6bs07F+nALsuPWHL2YdpOn5G+9K0LGNYtTFv3w0m77ySquM9Ha0o6uWYpnMnZl2fqjwJjqDqpD0pto0NIqtUKnRhQUybMokyZcrQvXt3pk+fzoULF1i4cCGffvopAMWLF2fWrFl06NABV1dXxo4di7X1qwBwbEA6h5M1zraWfN7Al+oF3FHHCbYnd2sweyL1ipMyvHFhWpf15t/HL+m/+hQ9q+flq9fSV2cUaws1EdGvgmd7vqiFq50lp79uQL9Vpzh883ma+j0ysh7n7gfR7qfDJh+T0r3WxsWz83P3ChTNYfr3mEqlongKkz1q+WajTVlviuc0vd+4qb99PR1MPi7W6zeWLyt5Uj7o9frDcRXyg+pD4PyvcGxBqsdj1GRK2o8V8aX1Br13+fjPXfNB/nrgNzXpY17/4bH8r569TTI3Y5v+kPqgdOFmr8714RL40Td1x4MhGN5iduqPE0IA0L90fzoX6ZxosGVJoyX0+KNHhp3bzsKOI52OxNuml7kliYoNIqtUKgJ1OmIUBVeNBhetlt5u7lyOiGTMk8f8nscHK7WaaEXBRq1mfPbsqIEAnQ7X/9J4x61FndaxHAsLo7KdnWGNsKKgBxa8eE5uSwumeeXEVq3mUkQEKwMCGPfkMQu8c5Hb0pL9ISEcCA1ha3Awfd3c6ehi+L7TvjaelMrRvI+0eqh93nAt4P084TWBpY4kA9IAHf/Wo3Ad/2/HYunmjkP95Fft2lvas6HFBsAwSaDk8pKpHnOnwp3wy+dH5+2dU31sWtTyrsX++/tTdUzfUn1pnLcxTpZOuNm4odPrCI8Jx97yVemPXI65zD1UIYR4p0hQWgghhBDiHdGqTM50BaWTo1WriNEnH+TwK5GD+kU8Ux2ULuhhz5+f14q3rVnJHKkOSqd3dXRi/Xk52/DH4Jo0mvF3gv0xL58TeGAFbk0GGYPIEfcv8vL4Jp4UdGH2//5H3rx5qVq1Kp07d2bt2rV4enrSvHlzAPz8/Jg6dSqlS5eOF5COq14RDya0THzlUVL337xSEZCOVcDDngIe9lQv0BAnW4tUH390VD0qxam5bAo3O0uOjKrHo8AIsjlYYW2hNt5UdLGzZE2vyiiKwtJDt3Gzt+KzNadN6ndw/YJYW2iomNeVlZ9U4qPFR006TpXCCmCVShWvJrS5qNUqfmxXKlXHfFjOm19P3KOWbzKB4mRYaNPws2JcQfva54CDF3RcbXicq4IhhXPwfUOd4kubofH3sLqdiefIIjeVK/WBo/PT10fHXwwry/PXhTHpy0KRJnbuKbdJjIsPVO4HR+Yang88lTFfF6vUT6ag8fevHr+++tpULeel7TghBGD4XZjU6r8K2StwustpbgXdYvfd3bTI34JG6xuZ5bylspViRZMVCbavbLaa8FntzXKOd4USZ1Xz70GBrAkI5KVeR0ErK6rZ2dHB2YWx2bPT+/49Pnv4gJ+8c2GhUhkC2fz3NdYYrmv16QxIK4rCsoAXTH36lBW5clPW1tYwPuBCRAQFLK2w/S9rRVFrazo6O/MgOop1QYF8kc0DrUqFk0bD3JzelP2v/rW5Vm0L01nmzZuq9mmZJODj6MOQckOw1lpzpNMRnoU/o9mGZikfmAbuNu7Mrjubh6EPUwxKu9u4M6rSKMp7lk/0s0+j1sQLSAshhEiZBKWFEEIIId4RWo2am981Jd+o7Wk6vnK+pGt8HhlVj/ITdie5/7tWJVCrVamuSZ0vmx2rP62cYHsu19TXRLa3yphL20LZEw+eRNw5i9Y5e7wbL1FPbmL58gFnz9whR44cAFhZWTFp0iR69erF0qVLcXd3p2rVqgD07t0bSJhesXfNfKw5dpd+tQskOa6kbsjZpuN9SEtAGsDTMfWB8JNfNwAgdzL1r1UqFd2rGW6ETdv1L7efx09XXq2AG842lmw7/wiAK+MbYx2nxnr1gu6MbFKY73ekPMHhbbq/aWupZevAGmk+/qPKeZiy89/UHRT7Br2eFtryta+fRmsIbNYfY/gHhv93j0m+/16pW6mSoVzzg3NuCLyb+mPz1oIWc8A5zioZd194djVtY3ErCAG3QB9j+jFdt6Q9KA2Gr9fz64a6y1nlB0NjZfiaxLKwhp5/waJ6hucj7oKVI4QHwJTXbp5X/xyeXIC6X2Wd1yPEO0qr1lLQpSAFXQqapb9h5YfRqUgn1Cp1ooGuXM4+pPHT9Z0V+z4tf/GC2c+f8WU2D4pZW/NbUCBT/P3JobWglr09E7LnYOCD+3z35AmjPD3jXVfG9pHe4K9KpaK2vT1XIiMZ+ughq3PnIbuFBZF6PXksLIlWFMLt7Cg0by5WBQqQPzSUdT17cv2ffwCoZmdHRVtbLFQqY4ru9yEgHWgL531U1LhketaTKW3U3HdX8dQJdJr475FKr3Cu+wVi9DHsvL2TkQf+K9kSW35DMdS01qvgjw934WXvZTz29b9RTPVzo595Fv6MYX8PS7bd1lZbCY4Mprh7ceN57CzssLOwS/U5TbW11VbsLOwo4lYkyTatC7amV8le5LTPmWQbIYQQaSNBaSGEEEKId4haraJyPleO3HyRquP2D6udoOZtXO72Vske36Zc2v5g3/NF7TQdl5hxLYqbra/X7f68JvWnxV8tbVe0lnGFdNj1o9gWqIRjuQ9oViEnm1cuZOzYsXz/vWFlX4ECBRg7dixffPEFU6ZMYc6cOeTM+eo9e/1mz8imRRjeuHCyq7+Tuj/0v45l0vIS36iuVUxIF/2afcPq4DNiW7xtI5sUoaCnPT2q56V0LudE369eNfOx9vg9bj4LTbb/d/8W5yuO1hZMbVuKoevOpv7gbIXgrulp0QFDau+UgtJepVM/lsSU7ADnfklfH6mtx9xgHBRrDdFhhvfndZ1+hVml42/LVhjsPeFWMsF4r7LQa6/h8fXd4JAD5lVNfizVBkHe1NcZj0drBZ3Xpa8PU3x+GaY
lfUM4np6J1Jn3Lg+DzoGl/auV07auMPoJLG8O944a3o/635pvzEKIN2Z7q+0pp8FVp24y5LtoW3AwQTodnVxereIM1es5GBbKaA9PWjg58SQ6mj0hIdS1t6eMjQ0A5W1tGe3pyZePHlHNzo5a9uZf7akoCnktreju4kqITs/ABw9Yc+8YpbOVoPKUKcycOZMHY76lZNmyaLVacHUlT4kShB8+YqwbbfHfBe/bkqLbMm9eom7dSrA9Rg0bqqoIsVZxqKiKILuUX8//WiS9zyFMwTkE7nmk3I/y3/WxVq2lWb5mr4LSse+pSoVeBdNrT48XkDbsStv7Xj57eePxQ/cPTbTNoLKDyOOY9N8ERzodofLqhJOX0ys24K1WqTnf9Twllr3KSlUyW0nGVhlLAZekJwYLIYRIH6m+IoQQQgjxjlnzaWV61cxncvsKPi7kcTPfbPTWZTJnRnlaVlebKn82e+MKDQBF0RsD0i9Pb+fZlh8JuWgIHrXr3JW2bduyf/9+5s9/lQK4Ro0a9OnThxo1asQLSCclpXTkSaWbLpKKmsfmdGx0PZPbftmkcJrOcX5MQ/rVzs/aXpX564taFM/phJVWQ7k8Lkm+XyqVir++qJXovtfbvU9apfbnNPb7v/7Y+NtdTf+sSVIhv/T3EatVOtNuAyg6UjVNQWNpWBmdWEAawDUv9NxjCFx32w5jgqDfEUOwOjnaOJOBCtQHz2LJt3crYAiQvy0cveDbQCj/ScptcySR4t4lD9i5xd9mYQ09/oBBZxN+vwoh3qia3mmbJLO11VaT6rKqNO/+bU3ntm0T3a4oCkE6HSfCwihiHX/yaPR/+8rY2LAvJITmt2/RwMGBKTm8cNRoOB8eTqBOh5+DI4u8c5klIP1DazVxp3Tp/1thG6rX8cfLl7hpNVyKimDS4EkADB8+nHLlyvHll18yd+5czp49y9atW1m6dCm12rVNUDc6LWa0eDPfH7aVKpFz5kwKX75E/h3bKXLlMkWuXMb3xAlyLv+Z9iM0dPpSy7oaGnZUUJsUkE7JS1uVSQFpgHFVU742cLB0oH6e5OtGp0Ujn0aJpt4H6FmiZ7LH2lnYsarpKrOP6XVrm62lc5HO/Pnhn6xqukoC0kIIkcHe/as3IYQQQoj3jEqlYlTTItye5EdJ75TrblYrkI40r4n4oJRXyo3MrGxu5wztX6VScfyr+kT53yIm2B+VSk3k4+tE3L+MTYFK2BWtSfDR9UTcu4BXNlcGDhxIoUKFWLVqFVu2bDH206NHD7744guAeEHutHC0yVpJjzwcrNk1JOHN57K5nelVMx/V//s+y5/NDlvLtI3dwdqC4Y0LUymfG/mzmX4DVaVSsf2z5NNdv18hacOkhxNf1cfbxca0A7T/pWi3cYZhN6BwM0N65+b/S/9gOpjxhqNKBa0WpK+P1KTKBtNWVnuXg7Y/g081w3OVKk6d7tdYOxv+r9I/4b52idzYzVsLuu+AvodMGm6WolJBs2mJ72sxF7puhc9MqyefoF8XH0nXLUQmm1prqsltf2n2CwPLDORY52PJrp6MS6XJmiul3fv3p/C5s7h9mnzQLSXWRYvi1DLx5bLPdDqcNBqGeXhQxsaW+1FRXIyIAAzXNIE6HRP9nzDy0UMGuLsz0sOQovtOVBSbg4O5GRmJSqWiip1hYqoundelxwup+bzXq6+HWqXidlQU9W7c4H50NHktLWnW7AM2bdrE4MGDAdi8eTMVK1ZkwYIFNG3alD59+vDFF18wbNEifE8cT9d4AA4Vzfjb3mo7O/IsW4pjo4YJJjhq7O2wr1ARxYy/i36o+QNaVequo1sVbJVim+WNl6d1SCkq7VGaU11Ocbzzq6/p5BqTTTq2ZLaS7G23N6OGBkBRt6KMqDiC7HbZM/Q8QgghDCQoLYQQQgjxDlvbqwpbB1anVC7nJNv0qZX/zQ0og3xcxSdD+1cUBYuYcB79/BlBR9bz8vR2Hi8bQtSjf9E6uGFfogEWLl4E7vsZbdgzfHx86N+/P9mzZ2f8+PGcPHkyQX/pXZn7RcOEqzJ/7V0lXX2ml6+nA1fGN6ZNWW9Kejvx55Ca/N6vGqOaFmHeR2UZ36IYa3qZPw2fKYp6OTKnU1kArLRqPqsXv97l+xi7cre34sDwOgyqZ0LtT3WcPx3t3A2B5I83gYOJN/Aq9U18e40vzP/ml2oPjb5L+/F6Xerap/VG/utB6X5H4JPdMPymIS11kQ8SHlO0OTh6v3pe9mPouhnyVI2/svptk6d6wm0qFeStYZ7V+EKITGGjNW3iUw67HBRzK0avkr1MPgaALBqUdvmoMypLSzy++ILC59JQKuM/br17o7aJ/35EKwrjnjym3/37hOv12KrVPI2JYbz/E6Y+9edOVBROGg293dw4GBpKV1dXuri4Aobrz9WBAZwJD8dNGz+wqXntd7Ft+fL4HjuKx7DkawIDrKtuOPahW/w+dr4Mpri1NZNy5KC7qxsrV65kzJgxLFmyhHnz5gEwf/58du/ezaZNm9izZw8jRxpSS6ts05cB6dkbShzk0rFDsvvVSU1AS6NS2UrRJG+TdPUxt97ceM+3t9qe4auDLdQWWGutOd/1PEc7HaVpvqYmH+tu486Jj05QIXuFDByhEEKINyVrLa8QQgghhBBmZWOpoXhOJzb0rYpOUdArClW/38Pz0CgAavpmw9rCtBt6pXM5c+ZeYIrt7KxMu8Sc27msSe1MUcs3m9n6SoqLiwuzVm/hsy4tQVFwazoI+xKGNHdWXoWwL9WI3Pd3069fP37//XfKly9Px44d2bx5M15e5qnPFpe7vRVXxjdm6h//0qCoJ5XyuaV80BtgbaHhx3YJU+06WFvQJYMnD6TEr2QOmpZoikqlIkanZ9mh2wSFRwPvX/ruWCqV6s0E5JtMgmKtYG1nCH1q2Db8lqEGcEao3M9QZ/jpv3BoVuqOVXSpq1NaqmPq+o8V90Z1iXbgEae+sksyqwT7HIA7/xiC576N0nburKbbVhjrHH9bOlftCSHeHosbLU7bgeqst9bGqVUrtHHqO6ssLcmzejV3OnVKVT9un36KY6OGRN1/EG+7hUpFcWtrLkdEMNH/CROy5yCbVktDBwe2Bgfz0/PnjPTwoLWTMyfCwvnp+XMCdTqc1BrORITzb2QkS7xzkcfSMsE5vaZOxSKnFyqNBpuSJQ3nM6HkzLrqr74O2uzZiXn8GIB7UdG81OuNAW9HR0c6duzI0aNHGT58OCVLlqRatWpkz56d7NlfTXJTFAV1Or+2L+yhrEdZJn94gi9/MyGjSVqoVGT77LMUmzXI04A/7/xpllMqpP93Yw3vGrjbuPMs/BmASanyzcnWIvUTDqw0VixptISVl1Yy+bhpq6wTs7nl5jQfK4QQwjyy3tWbEEIIIYQwO7VahYVGjZVWQ+dKuY3bpycSPEzK732r8u+Exim2q+DjwsdVUk672LCop8nnTs7wxoVwsUt4Y81cdDodKpUKvV6PhzoEC7UKjQoGNSnFh6U9WdKtPBfHNuKf2UMY3L83L168oH9/Q9rd1q1bs3jxYn
LkyJHudN2JsbbQ8FWzolkmIP02iA0+azVqdg5OPqX3+6Je4WR+Fst8BF/eNs+JcleCYddh9GMYcTfjAtJgWGVb5iNoOB6yFUm5fVyKAm1SESB5vaaxqeLOBkgqhXVibF0Nq6iLtQSLVKwozMpUKhh63bD6PpZl+lbJCSGyhmo5q6XYJpdD2oJiGTmhbF+J1PWdfcy3ZB8zhuxjxyTYZ1u2DPm2bE6yPvTrPL/+Co8vPgdApX01SSr2WrK1kzMfODpxNTKSGU8NE73aODlTy86em1GRLHzxHIDvcuTgU1c37kZFcSEiguxaLdvz5iOflVWi6bodmzbBtkwZY0AawKF+PbTZkp78+XlPjXEOkau1K9m/Gm3cV8rGBi0qzoaHk+2/lN05c+akXLlyhIaGUqNGDR49epSgT3N8XcsWb8CyJsso3LxLuvtKil2N6qgSCe6/zkJtYbZzutu4o1WbvsZsUo1JiW7f224vK5qs4Gino+Ya2hvRuUhnfvvgN051OcX5rudTdaybtRt5nfJm0MiEEEKYSoLSQgghhBDvmc/qFWTlJ5W4OLYRbvamp3tVq1VYaTV816pEsu1UKhXjWhRPsb/U3nA6800DAOyttGzoV9W4vV/tjEs3p9Pp0Gg0REREcPz4cdq3b09UVBR9+vRhxrdf0CRbELUKumNnpaWAhwOtW7emffv2bN68mW3btgGg0WjMkq5bmF8OJxsmtylhTOv9virh7USka2EA/BXnVzuG3YQWc8DGJfED08rCxrCK+U3p8nvq2ut1kNPE74mGE1M/nlgqFXx+xZCq28oh7f28K+yzGeqUN54MxVpDkeaZPSIhhBn0KtErs4eQJveyqegxSEP7ERoeOyff1nv2/3Dp0AGXDu1RJxGktCpYkBzjx5l0btfOnY2PVXHSbMdd72uhUuGltWBtYACbg4IA6OTsTEVbW46GhfHzf4Hpfu7u/C+nNzNz5mRs9hzYqtXEKEqCdN2FzpxGlcjqZJVWS8EDf3Nq5AeM6KZhWT01ixqq+bGVmq5DNNxzA5VahT5Kz6eWn3LVyYmnMTEAFLGyIlSvZ1twMA+zv5oAZ2Fhweeff26cuJkUU1ZpHy6s4nBhFVe8ISzOW5/j668B+LLCl/Ha9+1nvpTvWnfTMjW5WJvnOmp67elYakyfiJvDLgd++fyS3F/ao3SaVi1nJpVKRSHXQsZAf2qCzOb6OgghhEgfSd8thBBCCPGe0WrUVC/onubjO1XKTXi0josPg/j91IOUD0hCSiHaYY0K8cMf/wJQyNMBZ1tLznzTAGsLjWGFsF8RXDNwhbSiKGg0Gi5cuEC7du0oX748er2eKlWqMHv2bC5fvsyAAQP49ddfKVHCEKh/8eIFgwcPplKlSlSt+ipwLgHprKt9hdwpN3oPWPX/h+dBQXT44Xd+sFlKuY++S/sK4KzG0Qvy1oJb+01rr48xve+qA9I2pliOSd+Mf29V7mP4J4R4J5T1LEtb37asu7ou0f2F/5sUlVYqW1uUsLB09ZEoBUJsDddvw3toWD5Nl2RTm1KmZx5KiU3p0vGea9xe/S7WqFTcjIyk6727lLe1RaUybJv7/BmuWg3V7ezp4uJKkE7P3pAQXDVaWjg5oVGpjKus9YpCvp+XEP3En7CTJ7CvXh2HRo1SvFbt3HUKdUOG8PU/X3P08avVtSog4n4E9+be4zun79DpdKgfPmC+lxclbGzo5urKkhfPGTZ1Kg2uX0en0zF9+nTmzp1L69atDWPS6xNN1+3z2zquVamaYHus0/lUzG+qJtzKMHbbCIU9pRZjW7GCMcD++uta3m072ofzCNq4MdnXawqPoV+Y1E6V4l89prGzsEuxzcTqE9l5aydlPMrQoXDy9a7fBb/4/UKl1ZVMaju11tQMHo0QQghTSFBaCCGEEEKk2ifV87L9/KP0BaVTuD/Tv04BulTJQ0BoFJ6O1gA4274KQveskS/N5zaFSqVi//79tGjRgu7duzNy5EicnZ2N+//8808KFizIoEGDGDlyJEePHmXy5MlcvXrVGJBO6iabEFmORoubqxsbvu6GjeUnoH3Hvm8/3gQTPEAXlXJbJenAQzwexdI3JiGEeE98U+WbeEHpom5FqeVdCwWFj4t+nK6+Cx0/RtjJk6i0FjwcPpzo+/fTO1wA1HGyW0dYqdhXQkXt8/FTXtvVqIF73z7Jprd+Xf4/dnKjUdLlcOzr1In3XKVSYVWwAJHXrhOp1zP3+TMq2toyOYcXWpWKE2FhLAt4wYynT/HxLUTtlStwun2bgQ0asDk4iFr29jhrNDg1b07OH6bE69u5VUuTxw2Qwz4HCxsuZM2VNXx/7HsAwq6HcXf2XYrULcKhZYd48OABVYoXp+/9+6zInYeWTk7YqdVc8PJi4cKF2NjY8OOPPxoD0kCS18paFxeivurH1eXzKH43/nv/a3U1v9WIf1yYtQq7ygkDlLaVKhF29CjZvvgcd8fc6MeNxaFBfSJv3OTptFSUz4jDY9hQtK6mlSExJZhsDuubr8fXxZfm+d+fTCNJrfT2dfHlasBV4/Mh5YaQ3zn/mxqWEEKIZEhQWgghhBBCpImX86taphaJ3Exa37cKbeYdTvJ4U1YPO1pb4GhtvjpsqaEoCsuWLaNjx45Mnz4dnU5HcHAwZ8+eRafTUblyZfbs2UOjRo0YNmwYAQEB/PXXX/HSEEpAWrxtnGwz5+ctw6lU4JAdAu+m3FbRp9ym9EfQck76xyWEEO+JYm7FuPj8IpVyVGJRw0Vm61el0WBXsSIABXb/SeSNG9z0a2a2/mPNbaZhrp9CwQcwqOpwypIL++rVUVmk7vemZZ48eAwbhv8PPyTYZ1u+PG7duxmfx8TEoNVqyTFlCrdbtcZCpeJWVBSVbO3Q/ncdXd7WllC9nkn+T5hhaUFle3tKVanCuGXLUL77HufQULINHox7n97pev2x9Ho9nYp04mXUS2afmU3olVBcqruwdsFa7O3t+eKLL3BycODFy5cMePGc37r3oPmDBwxcsZyAoCA0Gg1OTk7Gldsp/T2gbVyXcboF/Pp9/Cwm0am4o51r3lwiLl7EpqyhNIfa0hKHevVwqFcPm5IludutW6reA5WtLS5dTK9V3a1YN075n+L44+OpOs/rFBLWAY/1VaWv8HXxTVf/b6sNzTfQanMr4/PuxbrTr3Q/KqyqkImjEkIIkRQJSgshhBBCiDQpmdOJAh72ZHe0Rq1OeEOpXB5XSuVy5uy9wDc/uDR4fVWzSqUiJiaGx48fc/36dRYvXszly5fZt28fOp2On376iU6dOrFnzx6ePn1KgQIFsLOzM9ahFkJkMToT03J7lUl+f99D4CmrpIUQIjXm1Z/H/vv7aZinYYaexyq/eVZD1shRjU3En1zZvUQPenTogbO1c7r6dvukB88WLED/Xy3oWNnHjUMVpya19r960iHu7rj16c3dufPIb2mFpnZtPL6bSMyGjViVLU21O+cpvnEP23f+QYcOHdi2bRu127SBNm3MmrUntrQNwKlFpwhzDcOxgiM2Ohu8Lb2pW7cuGo2G/UePs
m/BArr9+COz1Comr14FgOt/K4sVRTG5tE1Rt6Lxnp/zUWEVrfBHWdNTYqttbbGtkHiAMrGV1cmxLlaMPGtWJ1k7PDH2lvYsabSEEstKpOpcr3O2cgaga7GubLi+wbh9Wu1pNMjTIF19v80KuBRgTr05zDo1i85FOtM8f3M0ag2bW26m+UbDqnFb7dtVO1sIId5lsnRDCCGEEEKkiVqtYvfntVjZM+mbOQ5Wic+B/LCcd0YNK010Oh1qtRq9Xs/z58+Jjo4GoG7dugQFBeHr68ulS5eoU6cOhw4domXLlsyYMYPIyEi8vLwoVaqUBKSFyOpMSd0NYO+Z9L72qyQgLYQQaeBi7ULLAi2TTLdrTtmGDEl3H+U8yjCpxiTj8wGlB/B5uc/THZCOVejoEQqfO0vBgwfINngQrl0/xipf3gTtxowZg5eXF459+mCnVlPU2potBw/w9+XLuA79AudGjSnRaxi5ffLSsWNHqlevHu94cwWk9Xo9qv9qU48YMYLD+w9TKXclnHM6s3/Ifvbs2UNERARz5swhd8GCZKtbF3d3d3744Qe2b98ery9TA9KxbR0tHRnyqYbpLdVM6KDm64+1RFiZp05zauRespi8639LVUDaXGy0NsYAfX7n/FTLWc24730OSMeq6V2T35r/RquCrdCoDX+L5XXKy/AKw6nmVY1WBVul0IMQQog3RVZKCyGEEEKIDFMkhwMHrz9LsN0+iWB1Zohd9XHt2jX69u1LSEgIDg4ONGvWjEGDBtGsWTOuXbtGlSpVjMfY29tTrVo1rKys4vUlAWkhsrAPZsDaj1Jul7tKwm3OeSBnOShi/pSwQgghzEvr4ZHuPuxq1KShjy+77+ymuHtxPinxiRlGFp/K0hKtuzvuffoAr1YQx11J3LJlSzZu3Ei9evX4+8hhJkREcHPgQL788kuuXbtG7dq1uXLlCocOHWLmzJnUqlUrXl/molarefLkCatXr+bGjRssWLCAypUrG/efOHGCe/fu4etrSCF969Yt+vTpQ7t27ShevHi6z//AXcUD9+Rfz7z689J9nqQ4NmuGXdWq6epjRZMV/Prvr2y5uSXVx/Yo3iPecxuNTRItRVxdinahS1HTU60LIYTIeLJSWgghhBBCZJghDXyp6OOa2cNIlkql4vjx41SqVImCBQsyadIkmjZtypAhQ5g8eTLu7u5UqVKFmJgYnjx5woQJE/jll19o2DBj008KIcysyAcpt7F1M9SfBmi9ENRa+HgzfHYG2v6cocMTQghhHk4fpH8CkU3xYlioLZheZ3qGBKQTExtEjop6ldmjVKlSzJo1i+vXr9OlXz8ssmdn3bp1NGjQgDVr1tCqVSs+//xzBg4cmGEB6VjLly9nwoQJHDhwgFy5cgGGutcALVq0ICQkhMaNG9OtWzeGDx9OxYoVjQFpnU5n9vG8rnrO6ik3SoR1yZIptyme/iwppT1K812N7/i68tepPraKV/wJcxnx9RVCCCHeBAlKCyGEEEKIDGNrqeXXPglXHWb2fRS9Xm/8X6/Xs3HjRjp27Mi8efOoXbs2+/fvx9fXlzp16hjbLVmyhP79+7No0SK2bNlCkyZNMvMlCCEyQvP/vXpcsh188xzy1QIzpT8VQgiR8VTa9GXksciT20wjSb2dO3dSqlQpYykZlUpF1apVmTNnDr/99hsTJkwAYMaMGfz5559s2bKFkydP8sknhsC5uQLSsdfKcQ0bNoxevXoRHR3N8uXLgVd1r4sXL87KlSuxtLQkKCiIXbt20azZq8kB6ckmlNEBWO///S/Z/RZ5cuPaubPZzqdRJf9edC/enY+KvMrs4m7jTqlspeK1KeYmpUSEEEK8nbJO3kQhhBBCCPHeaFbS642eL+4NOr1eb6yvF/v/5cuXqVmzJoGBgdSqVQt7e3v+/PNPcuXKxc2bN3FycqJ27dpotVoWLFiAq6trvH6EEG+xlvPBPhvkrgqWGV/rVAghRNZmW6HCGzvX60FkS0tL9Ho9bdq0YfPmzYAh8Nu4cWMGDRrEN998Q7ly5WjSpAnu7u64u7vH68ccAdyYmBhjsPnhw4dYWloaz/PVV19x8+ZN/vjjD4oUKULLli0BsLa2xs/Pj0aNGqEoChYWFma7VlaRsUFpC8+k0717/TAFx6ZNUZmxRE9ep4S1w+PqW6ovNlobVl5eCUCbgm0StPm46Mdo1doEK6iFEEKIrE7uogkhhBBCiDemegF3fu9XlXJ5XN7oeVUqFeHh4XTq1Ing4GAAPLNgswAAtGhJREFUmjVrxsKFC1EUBUdHR44cOUKpUqUoWrQof/zxB7ly5SI0NJTVq1ezefNmfH196dGjB66uruh0OglIC/GuKN0RCtSXgLQQQrynfE+cwH3AAFRWVjg0aIDniBFv5Lw6nS5BELl69epMnTqVEydOMHDgQON2e3t7Y71mPz8/7t69G+84c60m1uv1xoB069atadOmDUWLFmXSpElcvHgROzs7Jk6ciFarZcmSJRw8eNB4rKIoaLVaLCwsUBTFbNfKVXIkHngt61HW+HhJoyXpOod93brxnqusrcm/+0+cPvjArAFpgLKeZZPdb6M11Ite33w9g8oO4tOSnyZoY6GxoGuxrvi6+Jp1bEIIIURGUymKomT2IIQQQgghxLvN/2UEweHRFPBwyLQxBAQEUKZMGby9vbl16xaFCxdm06ZN2Nvbs3//furUqUOjRo3YvHkzFhYWAKxdu5ZvvvmGH374gebNm2fa2IUQZnLgR/hr3KvnfQ5C9hKZNx4hhBAZ4nLhIia1c+/Xl2yffZbBo0kodmXz/fv3mTJlCiqVilq1alGnTh1cXFxYsGABw4cPZ/z48cbg9OzZswkNDaVYsWLxUmObm7+/P40aNcLZ2ZmJEydy/vx55s+fT/Hixfn+++/x9vZm//79jB49mpw5c/L1118ba0dnhOCoYH67+huNfBrhZedFrbW1iNHHsK/9Pvbc28ODlw/SXfdbFxJCyN592NepjdrOLsNThu+7t4+BewYm2O5g4cChTocy9NxCCCFEZpKgtBBCCCGEeG8sXbqUHj16ULRoUc6ePYtGo0Gn06HRaPj+++8ZM2YM/fr1w9PTk4CAAGbOnMn//vc/Pv004QoFIcRbKOwFTPkvbWaHNVC4aeaORwghRIZ4OGIkQRs3JtvG8+uvzFor2BRxU1qfOnWKBg0aULx4cbRaLSdOnKBp06aMGjWKYsWKMXHiRL799ls6dOiAnZ0dv//+O6tXr6ZRo0YJ+jIXnU7HhAkTOHPmDBs2bADghx9+YPTo0fj6+lKzZk2mTZuGtbU1P//8MzNmzGDx4sWUL1/erONIdox6HXpFj4XG4o2dMyPUWluLFxEvjM+XN1lOUbeiWGmsMnFUQgghRMaSoLQQQgghhHgnJXajbuXKlfz111+sXr2ar7/+mtGjR8dbCTFjxgz27dvH48ePyZ49O4MHD6Z27dpAwpp/Qoi3UEwUTMhmeDz6MVjYZO54hBBCZAh9RAShR47wYvESwo4fT7RN9m+/waVjxzc8MoMLFy7wzz//cPfuXSZO
nAjA5s2bmTVrFl5eXkybNg03NzeWL1/OL7/8AsCwYcOo+1qa6fRK7Pr23LlzPH36lHr16vHJJ5/w119/sWzZMrZt28bPP//M559/zsiRIwFDzWkvLy+zjul94R/mzy9XfqFFgRZYaazIbpc9s4ckhBBCZDgJSgshhBBCiHfK5cuXKVLEkLIxqRUkCxcupE+fPixfvpzOnTsTExNjrJ8HEBYWhlarxdLSEr1ej0qlkoC0EO+KF7cABVzzZfZIhBBCZLB7vfsQsn9/ovt8TxxHY2//hkcEly5donjx4tjY2DB58mQGDBhg3Ldw4UKmTp3K//73Pxo2bAhAZGQkKpXKeF1qrtXRr1//vr598+bNjB8/nrlz51KhQgX++ecfmjdvjqOjIxMnTqRTp05mGYcQQggh3h/mzfEihBBCCCFEJrpx4waVKlXiyy+/BEjypt2nn37K0KFD6dOnD4cOHUKr1fLo0SPGjTPUmrW1tcXS0hJFUVCr1RKQFuJd4ppXAtJCCPGesK+X+MpirYdHpgSkAYoWLcqcOXMIDw8nICAARVHQ6/WA4RpVURR27txpbG9lZRXvutQcFEUxBqQ/++wzevfuzaBBg7h3756xzeXLl3n58iWFChUC4ObNm/j5+TF69Gg6ZtIKcyGEEEK83SQoLYQQQggh3hnu7u589dVXLFq0iJUrVwIYb/K9bvLkyfj5+dGwYUOGDBlCoUKFePLkSbw2EowWQgghhHh7OX/4YaLbHRo0eMMjia9v37707duXadOmcezYMWOwWVEU8uTJg4eHR4JjzHVdGpsFKCwsjFKlSvH3338THh7Opk2b+OCDD1i7di0AHh4eODo68t1337Fy5UqGDRtGgwYN6NmzJyqVKslrbCGEEEKIpEj6biGEEEII8U558uQJU6ZMYcmSJezYsYPKlSuj0+nQaDSJth88eDDPnj2jWrVq9O3b9w2PVgghhBBCZKTIGze46dfM+NymVClyL/0ZtY1Nhp0zsVrNr4uOjqZp06bcuHGDr7/+mmLFinHs2DG++OILNm3aROPGjTNsfJcuXSIoKIglS5Ywd+5cLCws0Ov1tGrVipcvXzJlyhSKFCnC8OHDOXToEEFBQfTv358vvvjC5NcnhBBCCPE6CUoLIYQQQoi33us3xq5fv85XX33FoUOHOHbsGNmzZ09QNy9uoDoiIgJra2sg6TrUQgghhBBCpCTuNeaLFy9wdXVNsu3Tp09p1KgRZ86coUOHDty/f58vv/wSPz8/s44p7rVyeHg4NWrU4NSpU/j5+fH777+j0WhQq9XcvXuXBg0a0LRpU6ZPn05ISAgqlYqAgAC8vb0BuVYWQgghRNrJFYQQQgghhHirxcTEJFipUaBAAUaOHEmuXLlo3rw5AFqtFp1OZ2wTd+V0bEAakq5DLYQQQgghRHLiBqQnTJhAjx49+Pfff0lqTVC2bNlYuXIlHh4euLm5sW3bNvz8/OLVmU6vuNfKoaGh2NjYMGfOHEqXLs2zZ89QqVSoVCp0Oh25c+emW7durF+/nvDwcGxtbbGzszMGpM1Z11oIIYQQ7x+5isgCIiMjGTNmDJGRkZk9FCGEyBLkc1EIYSqdTmdc/fzNN98wdOhQhg8fzpUrVyhVqhSTJ08mMDCQtm3bAoZA9NtY/04+F4UQIiH5bBRCZDWxAekPP/yQxYsX06xZM6ysrJJNdV20aFEWL17MvHnzWLhwIWCoH52W4O/rn4txr5UHDx7MTz/9xOPHj6lUqRITJkzg7NmzjBs3DpVKZRx7eHg4RYsWxcrKKsEYJGW3EOJtI9eLQmQtkr47CwgODsbJyYmgoCAcHR0zezhCCJHp5HNRCJEaISEh1KpVC71eT+PGjTl48CDh4eEMHTqUDh068Ouvv/LZZ5/Ro0cPvvvuu8webprI56IQQiQkn41CiKzop59+YtasWfz55594eXkZt8ctF5NYTeY5c+YwcOBA1q1bR5s2bdJ07tjPxcDAQJycnIzbateujaIojB8/nkqVKpEtWzaio6NZuHAhAwYMYMSIEZQpUwZbW1u6du1Knz59mDBhQhrfASGEyDrkelGIrEWbchMhhBBCCCGynti5lePHj8fT05Pt27cDMGbMGH788UfjTb+mTZvy9OlTBg4cSO7cuenTp0+mjVkIkTHipksVQgghMtOVK1fInz8/Xl5e7N+/nyNHjrB69Wpy5cpF27Zt6dq1a6Irjvv378/jx4/x9fVN87mHDRsGvFrRHBERQbdu3fD29mbz5s3GdrF1ofv168fdu3eZNGkSxYoVo169egwZMoTRo0eneQxCCCGEEEmRoLQQQgghhHgrxd5su3PnDi1atADgk08+YevWraxZs4ZmzZoRGRmJVqulU6dOPHr0iAIFCmTmkIUQGUBRFDQaDY8fP2bv3r3ky5ePYsWKYW9vn9lDE0II8Q5LakJUxYoVmTlzJjVr1uTJkyeUK1eONm3acPjwYVavXk2LFi1wcnJKNDA9fvz4NI/n/Pnz3L59O942vV5PaGgoH3zwAQC7du3iwoULbNq0idKlSzN+/HjGjh3L06dP2bJlC8OGDSNnzpzJvj4hhBBCiLSS9N1ZgKSQEEKI+ORzUQhhqpiYGBo0aICfnx+HDx/m0qVLrF27lpIlSxIWFsbSpUvx9fWlfv36xMTEoNVqE02XmNXJ56IQCcX9WT548CBNmzYld+7cXL16lf79+9O7d28KFy6cyaMUGUk+G4UQmSVuwHbt2rU8evQIR0dH/Pz88PT05LfffuP06dM0b94cLy8vcuXKxdKlS1myZAm7du0yZvQxh2fPnuHu7g68+lzcvXs39erVIyQkhEaNGuHk5IS/vz8ODg64uLjg4uLC8ePHadq0KZMmTeLOnTu0b98eRVE4fPhwmupZCyFEViTXi0JkLXKFkQVYWVnx7bffYmVlldlDEUKILEE+F4UQplAUBa1Wy8cff8zw4cO5d+8ely9fpmTJkgDcuHGDFStW8PjxYwC0WkOSoLctIA3yuSgEGCahxBX7s3zt2jWWLVvGmDFjOH78ODNnzuTIkSNMmzbN+PMv3k3y2SiEyCyxAekOHTrw+eefs3r1aiZOnEi5cuU4efIkH374IRMnTqRSpUrkypWLu3fvMnv2bEqUKIGlpaXZxnHlyhW8vLzYsmULYPhc7Nq1Kw0aNGDlypXY29szd+5cKlSoQPny5Zk8eTILFixg8eLF5MqVy1h3Ok+ePCxYsIAbN27QrFkzs41PCCEym1wvCpG1yEppIYQQQgiRpcXWvIv1eirBJ0+eMHz4cLZs2cK2bdtQq9UEBgbSo0cPmjRpwqJFizJj2EIIM7p27RpXrlwxph+NtXjxYtauXUtMTAwrVqwwphz93//+x6pVq6hduzZjx46Vm1BCCCHMJjZTx+zZs5k+fTr79u3Dzc2NiIgIOnbsyK1bt/jjjz/Imzcvx44dY8OGDaxbt47SpUvz22+/mX08nTp14u+//2bHjh2UKFGC8PBwvvnmG+bOncuePXuoVKkSkZGR8X4XHj9+nI4dO/LNN9/w8ccfG7fv2rWLgIA
A2rdvb/ZxCiGEEELISmkhhBBCCJGlqdVq7t27x7BhwwAS1Lbz9PTk22+/pWHDhrRq1YoOHTowZMgQPvnkE2NAWq/Xv/FxCyHMZ9WqVezbtw8wBANieXp6cv/+fS5dukRISIhx+8CBA6lbty4HDx7kf//735serhBCiHdQ7O+foKAgAC5evEipUqXIlSsXVlZWuLq6smPHDmJiYoy1oV1cXHjx4gVDhgwxBqR1Op1ZxhN7fbtq1Sp8fX3p3LkzDx8+xMbGhlGjRtG0aVNatmyJv7+/MSC9fv16Jk+eTP369enYsWO8gDRAw4YNJSAthBBCiAwjQWkhhBBCCJGlKYrC7t272bhxI7dv3060Tb58+fjll1/4448/2LFjBxs3bmTcuHGA4caf1MUT4u2WM2dOPD09gfhB6WbNmjF69GicnZ2ZP38+z549M+4bPXo0+fPnZ/ny5Zw+ffqNj1kIIcS7Ifb3jkqlYtOmTZQqVYrAwECsrKy4fv06YJg0GRkZiVqt5ssvv+To0aM8ffqUggULMn36dPr37w8kzPiTVnEzCV28eJERI0Zw4cIFRo0aRUhICC4uLvz444/kzp0bPz8/42u4e/cue/fu5eeffzYGzmXyphBCCCHeFLk7J4QQQgghspTXb4ypVCrKli1LQEAAN2/eTLRN7PNSpUpRuHBhfH19AcNNRHPc+BNCZK6AgAAaNGjAqVOnGDp0KF999RU///wzAJ07dzamLl2yZAnR0dEA2NnZMXbsWGbMmEGZMmUyc/hCCCHeYiqVCoDLly+zdOlShgwZgoODAw0bNkSv1zNlyhQA42rkoKAgnJ2dcXNzA8DW1hYw73VpbED6hx9+oEaNGmzfvp2iRYuyYsUKvvrqKxRFIXfu3CxYsAB/f386dOgAwJAhQ1izZg2tW7cGEpbJEUIIIYTISFJTWgghhBBCvBU6depEcHAwW7duzeyhCCHekNi6nQDbt2+nVatWtGzZklu3bnH79m2qV6/O77//DkDv3r25cOECn376Kd26dcvEUQshhHjXLFy4kDVr1mBjY8OaNWtwdHQkICCACRMmsHfvXrp27cpHH31EYGAgH330EcWLF2f+/Pmo1Wrj7zFzu3DhAvXr12fmzJm0b9+egIAANmzYwKeffsq0adMYNGgQYKgT3bhxY+bMmUPfvn2B+L9fhRBCCCHeFJkKJ4QQQgghshw/Pz+GDBnC33//bdxWr149Xrx4wYMHDzJxZOJdIykrs55//vmHJ0+eAK9Wpz148IARI0bwww8/sHbtWvbt28dvv/3G/v37+eyzzwCYPHkyOXLkYPr06ezduzfTxi+EEOLdc//+fW7fvs29e/dwdHQEDPWiBwwYQLNmzRg1ahTlypWjVq1auLu7s3DhQjQaTYYGfh8+fIiiKNSuXds4nh49ejB69Gi+/PJLduzYAUD9+vXZu3evMSANSEBaCCGEEJlCgtJCCCGEECLLadGiBZcuXaJLly6MGzeOJ0+e0KFDB65cucL+/fuB+HVlhUiL2JSVAQEBXLt2jfDw8Mwe0nsvNDSUzp07c/To0XjbHz16xJ07d6hfvz5gSIVas2ZN5s2bx5w5c/jnn39wdnZm/PjxNGvWjFq1amXG8IUQQrzlFEVJ9Bpz5MiRdO7cmadPnzJ16lTj9rx58zJu3DjOnz/P0qVLWbZsGVu2bAEM9aMzkq+vLyEhIZw8eTLe+dq2bQtA9+7dOX78OGq12vh7MSYmJkPHJIQQQgiRHG1mD0AIIYQQQojX9erVi2bNmnHgwAFGjBjB3r17adGiBR9//DHr1q2jZcuWxvp8QqRWbMpKtVrNzp07GTBgABEREZQtW5aPP/6YDz/8MLOH+N6ys7Pj7NmzODk5ERERgaWlJWq1mgIFCuDg4MCRI0coWrSosX3NmjXx9fXlxo0bVKtWjSJFijBx4sRMfAVCCCHeVnHrK1+8eJGbN2+SI0cOcufOjYeHB3379uXx48ds3LiRfPnyGesy63Q68uXLR758+Yx96XQ6s9WPToqDgwOtWrVi+vTp+Pj4GH8/arVa/Pz88Pf3x9/fP94xWq3cChZCCCFE5pGV0kIIIYQQIkvy8vKiffv2bN++nc6dOzN+/HgWLVrE+fPnjStaZbW0SK24NRSvXbvG2LFj6d+/Pz/++CMajYapU6eyYcMGY1vx5jk6OhIUFISvr2+81Wg1a9Zk06ZNHD582LjNxsYGQCapCCGESLfYgPSsWbOoW7cuo0ePpmPHjgwZMoTbt2/j5eVFv379yJ49O/PmzTP+Poo9Lq6MDkgDuLm50aFDBzQaDZ9++ilHjx7l4sWLTJ06FUtLS3bv3o2fn1+Gj0MIIYQQwlQSlBZCCCGEEG9catIZFilShJ49e3Ls2DGGDh3KixcvWLFiBSD18ETqxX7PLF26lNGjR1OmTBkGDRpE+/btGTduHPnz5+eHH37g+PHjqFQqqTn9hsT9TFAUBScnJ7p27crYsWPZunUrzs7O9OzZk+DgYL7++mu2b9/OpUuXGD9+PGFhYRQvXjwTRy+EEOJdMXXqVH744QfmzZvHuXPnGD16NLt27WLQoEGEhoZSpkwZevfujVar5csvv+TBgwcZcj2a0rVy7MS5Zs2aMWjQINzc3KhZsyZ+fn4cPnyYqVOnYmVlZfZxCSGEEEKkh0qR6f9CCCGEEOINipvO8M8//yR//vxky5YNBweHeGkT44rd/vLlS7755htevHjBggULsLCwSLS9EMkJDQ2lT58+/PHHH9SsWZPffvvNuG/v3r38+OOPKIrC/PnzyZUrV7zV1cL84n4m/Prrr1hZWdGsWTM0Gg09e/Zk48aN7Nmzh5IlS7J161bWrFnDmjVrKF68OBEREfz+++8SlBZCCJFusdcHdevWpXv37ly4cIEmTZqQK1cugoODqVy5MosWLQJgwYIFPHz4kDFjxph9HK//XnRwcMDb25sSJUrEa/f69cmZM2eIioqiYsWKCfoRQgghhMgKJCgthBBCCCHemNibZ48ePaJOnTpERkYSHh5Ou3bt+PLLL8mZM2eSN9Bij50+fTorV67k5MmTmfAKxNtGURQURUkweeHGjRt8++23HDx4kJ9++olGjRoZ961fv54pU6bg6enJhg0b5IZuBop7Q719+/YcPXqU0aNH06RJE7y9vYmKiqJx48Y8fvyYPXv2kD17dgDOnz+PSqWiYMGCshJMCCFEmiQ26ezq1au4uLhw48YN2rVrR9euXRkzZgy9e/fml19+4fPPP2fcuHEp9pNeDx8+pHbt2qhUKiIjI/H392fixIl06dIFd3f3eG0Tm9QpAWkhhBBCZEWyrEQIIYQQQrwxKpWKZ8+eMXfuXGrWrMmBAwf47LPPOHfuHMOHDyciIgKNRpNoyuTYm31HjhzBwcGB8PBwqfkrkhV7k1itVnPp0iXWrVvH7t27efbsGfnz52fw4MEUL16cWbNmcf78eeNxbdq0oUuXLvTq1Utu6GaAAwcOGB+rVCpiYmJo3749t27dYs+ePfTs2RNvb28ALC0t+fXXX4mMjKRnz54EBQUBUKJECY
oXLy4BaSGEEKkWe52pUql4/vw5AQEBxn2+vr5ky5aNX375hYYNGzJq1Cg0Gg0+Pj5ky5aNtWvXcuXKFWMf6Q1I6/V6Hj58aHyuKApBQUG0b9+eypUrc/z4cY4fP87EiRMZP348q1evJjIyMl4fmVXTWgghhBAitSQoLYQQQggh3piDBw/StWtXDh8+TL9+/fD29mbUqFG0b9+e69evM2rUKMBwcy2xwPSFCxdwcnJi27Zt2NjYSEplkazY748FCxZQrVo1vv32W7p3706lSpU4c+YM5cuXp0+fPoSFhfH999/j7+9vPLZ///40a9Yss4b+zvrll1/o3LkzISEhxm0BAQHcvHmTb7/9lnz58nH9+nX27t3Ld999x8aNG3F3d2fDhg1s376dL774QiajCCGESJfYIO7ChQupVKkSlStXpnnz5vyfvbuOiyprAzj+GxokRBTFxO7urrVz7c61u11z7Y5dXV27O1bX7tZV1wC7FQMDRemamfcPXkaQmoEZBvT57mc/Mveee+4DDDN37nPOcw4cOKBp8/DhQ969e4e1tTUAHh4eDBgwgJMnT5IvXz5NHwm9FlWr1QQFBfHTTz8xbdo0PD09Nf0FBgby4cMHWrdujb29PenSpWPIkCH07NmTadOm8fbtW00fQgghhBApiSSlhRBCCCGEwXx7s8zT05N3795x48YN0qRJo9nepUsXGjZsyLlz55g3bx4Q86yPQoUKsXz5clKlSmXYwMV349ixY4wdO5bVq1dz9uxZTp8+Te7cufn555959OgRDRs2pG3btrx8+ZJhw4YRFhYGJPwms4hbmzZtuH//Pra2tvj6+gLg4+ODWq3m+PHjjB49mqFDhzJkyBB2795N69at2b9/P0WKFGHfvn00bdpUfjdCCCESJPKAx7179zJu3Dj69+/P+PHj+fz5M9OmTWPp0qUA1K5dm2fPntGwYUMqV67M2bNn6dixI5kyZdJLMlihUGBlZUXdunU5ePAg27Ztw8fHBwBfX1+eP3+OnZ0dAIGBgQDMnDkTa2trtm7dqulDCCGEECIlkaS0EEIIIYQwCKVSGe1mWcuWLRk8eDBOTk5MmjSJoKAgAKytrenZsycVKlRg4cKFnD9/3hghi++AUqmM8vjRo0fkzJmT+vXr4+TkRM6cOTl8+DCpUqVi6NChAJrnXoYMGaTcZRKwsbHh77//plixYrx584acOXPSqlUr7t+/z/79+2nUqBFr167l2LFjFC9enKdPnwLQoEEDmb0uhBAiwSIGPE6YMAF3d3fGjBnD4MGD6dChA7t376ZAgQJs374dd3d3OnbsSL9+/bC1taVgwYLcvn2bdOnSoVKp9JIMVqlUqNVqRo4cSfPmzVm2bBl79uwhKCiI3LlzU7duXYYPH05gYCDW1taEhYXh4+ND6tSpcXJySvT5hRBCCCGMQZLSQgghhBBC71QqFaampnz69Ilhw4YxcuRIJk2aREhICB06dKBHjx7cuXOH2bNna45Jnz49PXr0YM6cOVSqVMmI0YuUSq1WY2pqyocPH6hQoQJPnjwhKCiIFy9eYGlpiUKh0AyEmDt3LteuXePu3bsATJkyhTlz5qBQKKQcpgF8+zMtX748oaGh9OjRA6VSyciRI9m6dSu3b9+mZ8+eFCtWjICAAPz9/TXrSwshhBAJEfk96MuXLyxfvpyJEydqKnao1WrSpk3LkCFDeP78OZcvX8bJyYk+ffqwdetW/vrrL8zMzAgLC4uxkk9CKRQKgoODadmyJWq1muXLl3P69GkABgwYgFqtpm3btoSGhhIUFMS1a9d4//69vC8KIYQQIsWSpLQQQgghhNA7ExMTbt68Sf78+bl79y4hISGsWbOG2rVr8+DBA/r27Uv58uU5ePAgq1at0hxXqFAh2rRpAxDjmtJCxEWhUODt7c2YMWPImDEjqVOnpnLlyjg6OjJp0iQArKysAPD398fGxoZ06dIBYGFhAaC3GVDiq8hVEyL+rjNkyMCBAwc4f/48AwcORKlU4uDgAMC9e/fYv38/5cuXp1ChQjRr1sxosQshhEi5It5zFAoFb9++xcPDAwcHB06fPk2mTJm4evWqpmQ2QMGCBcmZMyfHjh0DoiazVSoVZmZmiY4pok8TExNu3LiBs7Mzy5cvx9nZmVu3bjFp0iRu3rxJzZo1GTduHO7u7mTOnJnq1avTpEkTBg0aRN26dRMdhxBCCCGEMUhSWgghhBBC6JVarSYgIICxY8fSunVrDh06xMKFCylTpgwvXrxAqVRqSicXKFCABQsWcPjw4Wj96HMmivgxXLp0if79+/PmzRtmzpyJk5MT+fLlo3Hjxhw8eJBFixYBEBAQwOnTp8mYMSOmpqZRbjrL807/TE1Nefz4MT179qRPnz5cuXKFkJAQChcuzPr161m2bBl//PEHAN7e3ixZsoSJEyfSpUsXtmzZYuTohRBCpEQqlUrznr5p0ybq1avH3r17+fjxI/ny5WPFihUcPHiQ33//HT8/PwCCgoIICAigWLFiQNQ1mxNzffDu3Tu2b98epc+AgACGDh1KixYtWLVqFWfOnOHff//lxYsXTJ8+nefPn9OkSRPc3NyYPn06ffv25ciRI4waNUrz/QkhhBBCpDSJH+InhBBCCCFEJAqFgsDAQDw9PVmyZAnBwcHUrFmTL1++cODAAQoUKICfnx9ZsmShZ8+eWFtbU7RoUWOHLb4DETd0Q0NDyZUrFwB2dnb07t0bgDFjxvD7779jY2ODt7c3x48fJ02aNMYM+bulVqs1N97v3LlDrVq1KF68OB8+fKBu3brMmTOH1q1b06RJE2bNmsXIkSPJnj07TZs2ZdCgQfTs2ZPChQsb+bsQQgiRkoSEhGgqn0QkkadPn87MmTOZOXMmDRs21KzHXLduXebNm8fgwYO5ePEiFSpU4Pr167x8+ZJ27drpLSalUsnAgQMpVapUlO1qtRofHx/Ne51SqSR//vysWbOGBg0akDlzZgYOHIirqyvdu3ePclzk708IIYQQIiWRKxghhBBCCJEoSqUy2rZUqVLh7+/P/v37KV26NPb29pw8eZICBQrw6tUrpk6dyps3byhXrhyLFy/GxcVFZnwIncT0vBs6dChdunTBz8+PKVOmaLa7uroyefJkLl26xOjRoxk4cCCPHj0ib968MfYjEidyuW4ALy8vOnXqxIEDB7hy5QqtW7dm3rx57N+/n7CwMIYNG0avXr1o1qwZN2/eJFeuXMkuIf3t80TWHRdCiORlw4YNNGnShNDQUM22O3fusGnTJrZv307fvn3JkCEDoaGh3L9/n8DAQAYOHMiIESM4cuQIXl5eVKlShYcPH+Lq6qq361JTU1OWLl3KiBEjAHj69CkAlpaWBAcH8+rVKyB85rNKpaJOnTrUqFGDrVu3smbNGs0s7ggKhUKWGRFCCCFEiiUzpYUQQgghRIIplUpMTU0JCgriyZMnZM6cGQcHBywtLalbty5Dhw6lcePG7NixQ3PM9evXOX78OI0aNSJjxowoFArUarXM+BBai3je+fv7s3z5ciwsLMiWLRsNGzZk4
MCBvH79moMHD5InTx5at24NhN8ULlSoEIUKFYrWj9AftVqt+ZnOnz8fb29vnj9/TpkyZTRtli5dSqNGjVi4cCFOTk7UqlWLxYsXY2pqSrZs2YwVeqxUKpXmezp06BD16tWThIAQQiQzJUqUwMHBAXNzc802tVqNWq0mNDSUe/fusXnzZo4fP86rV6/ImTMnp0+fZtasWTx48IDjx49z8OBBrK2t9X594ODggEqlol+/fnh6ejJjxgzy58/P0KFD6dmzJ/Xr16dGjRoAhIWFkSZNGnLkyIG5uTm2trZ6i0MIIYQQwtjkzp8QQgghhEgwU1NTbt++Tb58+WjevDlFixblyJEjALRp04Zy5crx/v177t69i7u7OytWrKB9+/Z06NCBihUravqRBI/QRcTzLm/evOzcuZODBw/Stm1bBgwYAMCwYcPImjUry5Yt49y5c0DMZS4lIa1fKpVK87fcrFkzZs2axdmzZ9m0aRMnTpzg06dPmrYbN24EwhPXFy5cAOD333/H0dEx6QOPQ+Q1SZs0acLkyZPx9PQ0clRCCCEinDhxgoCAAAoWLEjjxo25d+8ey5cvB8KTwZkyZWLcuHGUKVOGV69e0bRpU5YsWcLt27dZt24dAGvXrsXc3Jxu3brh5eWll+uDiIQ4hF9vmJiYUKRIEV6/fs3q1av5+PEj7dq1o1OnTjRo0ICdO3dy+fJlNm7cyOPHj9myZQvjxo1LdBxCCCGEEMmJJKWFEEIIIYROIsoZqtVqgoKCGDt2LK1bt2b9+vWUKFGCvn37smvXLsqXL8+IESOwsrKiePHitG/fntmzZ7Ny5UoGDx6s6UOIuPj5+fHlyxfNY7VazcePH+nVqxctWrTgwoULHDhwgOLFi3PixAm8vLzInTs3/fr1I1WqVIwbN46HDx/KwIckYGJigre3N6tXryZdunS4u7tz5swZpkyZwvPnz1m5ciUhISFAeKJgxYoVXL16lQ8fPhg58qjevHmj+drExAQfHx/WrVuHtbU1f//9Ny4uLkaMTgghRISlS5dSq1YtTp48qVlm4cSJE/Tu3ZudO3eSJUsWli1bxrRp0zhx4gQLFixg1KhRlC9fnpw5c5IjRw4AUqdOzY4dOzh16hRLlixJdFwRg7QUCgUnTpxg7ty5APTp04fmzZtz4sQJVqxYgZmZGUuWLKF79+4MHDiQ1q1bM3z4cIYMGUKWLFk0fQkhhBBCfC8UarkTKIQQQgghtBS5nOGrV6+wt7dn0qRJDB8+XJOoady4MZ6ensydO5eqVasCcOnSJRwdHbG3tydjxoyaZLQkCkVs1Go1nz9/JnPmzGzbto2GDRtq9nl4eNC8eXOOHTuGjY0NDRo0wMPDg61bt1K8eHHN83T9+vUcOHCABQsWkDFjRiN+Nz8Gf39/+vbty9GjRylbtix79uzR7OvXrx/Xrl2jR48edO/eXbP9zZs3yeZ3o1Kp6NWrF46OjsyaNQuVSoWvry9t2rTh8ePHVKhQgfXr10vZdyGESEZatmzJ1atX2bp1K2XLlkWhUDB48GDWrFnDsWPHoiwfERwcjJ+fHwMGDODevXscOnSIDBkyaPbfuXOHggUL6i22KVOmsGjRIpo3b07fvn0pXLgwAP379+fKlSv07NmTX375BYB79+4REhJCmjRpyJIlC2q1Wq6ThRBCCPHdkaS0EEIIIYTQSuSbYx06dODatWu8f/+ebNmyceTIEdKlSweAj48PNWvWJH369IwfP56yZctG6SdyOVwh4nPhwgVNqfeI587du3epU6cOW7dupX///jg5ObFp0ybSp0/PkydP2Lt3L4MHD8bExITQ0FDMzc3l5q4BRPw+Iv9s9+7dy6xZs3j//j2XL1/GyckJAF9fX7p168aXL1/45ZdfaNWqFUCy+73s37+fBg0aRIlp2bJlTJ8+nfTp03P8+HHs7e3ldUwIIYwsMDAQa2trIHw9aVtbW5YuXapJKjdo0ICHDx9y7NgxXF1def36NStWrODo0aOEhYVx7NgxHBwcDPY+NG/ePObMmcO2bds0gzQj+Pn50b17dz58+EC/fv1o3rx5lP3yHiOEEEKI75Vc4QghhBBCiHhFlCFUqVQsXbqUe/fuMWPGDBo2bMjr16/5888/CQoKAsDe3p7169dz69YtZs2axbt376L0JTfZRHzUarWmDGeFChVQKpW0bNmSY8eOERYWRo4cOShZsiSVK1emYMGCHD9+nPTp0wNw48YN/v77b+7cuQMgCWkDCQsL0/wtf/78mdDQUCB83eU+ffrg4ODAuHHjNK8LdnZ2zJw5k3fv3nHx4kVN++Tye4koj9qwYUMUCgXLli1j/PjxAPTq1YvevXvj6+vLn3/+SWhoqCYZL4QQIumpVCqsra3x9fVlyZIl1K9fn/PnzzN37lxevXoFwKZNm7C0tKRbt274+vqSKVMmXFxcaNWqFVeuXMHBwYGwsDCDvA+FhoZy5coVBg4cSNWqVXn58iXnzp1jzJgxrFmzBltbW+bOnYtKpWLhwoU8evQoyvFyrSyEEEKI75WZsQMQQgghhBDJn4mJCe/fv2fKlCm8ePGCadOmUbduXZo2bUq/fv04evQomTNn1pQgzJ8/P3/88QevX7/WJAuF0FZoaCgWFhZA+EwoGxsb7t+/z4gRI9iyZQsFCxakefPmPH/+HGtra9RqNW/evOHUqVMMHDiQCRMmaEpkQvJJfH4vVCoVZmbhHyU7d+7Mw4cPsbCwoGzZssyePZuOHTvi6enJrl27mDFjBpMmTQIgZ86cbN26lXz58iW730nkBLOfnx8XLlzg/v375MiRg65du/Lrr7/y/Plz/vnnH9KnT0+3bt1QKBQy4EEIIYzAxMSE169fU6lSJQoXLsxPP/1Ejx49WLVqFU5OTowdOxZHR0d2795N1apV6dGjB1u3bqVnz56a12ylUql5L9M3c3NzzM3NWbt2LU5OTuzevZuAgACCgoKYM2cOXl5ejBgxgokTJ/Lq1Sty585tkDiEEEIIIZIbKd8thBBCCCG0cvjwYcaMGcOTJ080a8bC17K8nz59YtCgQTRu3DjasZK4Edo6efIkv//+O3v37uXgwYMsXLiQnTt3Ym1tTZ48eciVKxebNm3C0dGRxYsXs2TJEry8vMiRIweenp5Mnz6dLl26APK8MyRfX19q1KiBjY0N7dq14+XLlyxcuJD69euzdu1aFAoF48eP599//6V169YMGDDA2CHHy9vbm9evX1OoUCEePHjAtGnTePHiBaNGjaJ+/fp8/vyZbt264e/vT+/evfn555+NHbIQQvywVqxYwcKFC7l8+TK2trYArF27lm7durFw4UK6d+9OqlSpOHjwID///DNubm7ky5cP0N/1wbdltiP3e+3aNaZPn46bmxu9evWiZMmS1KhRgx49evD69Wv27duHqalpjMcKIYQQQnyvJCkthBBCCCGiUSqVUW6URdiyZQu//fYbxYsXZ9WqVaRKlQqAp0+f0rt3b4KCgpg3bx6lS5dO6pBFCubp6YmLiwsqlYr169ezYMECHBwcOH/+PEuWLKF3795A
+POscOHCtG3blkWLFmFtbc3nz585cuQIadKkIXv27OTKlQuQ9Rj1Ia4b5EePHmXIkCEcOXKEzJkzA+E34CtXrsyoUaOYOHEiHh4ejBw5kqdPn7Jjxw6yZcuWlOHrJCgoiP79+7N//35u3LiBi4sL586dY/bs2SiVSqZNm0bx4sV58OABHTt2JF26dKxZswZnZ2djhy6EED+k0aNHc+LECa5evYpKpUKtVmNqakr//v3Ztm0bK1asoF69elhaWvLx40ecnJz0ev7I1xmPHz/WXH98KzQ0FHNzc80xjRs3pmTJkkycOFGuU4QQQgjxw5GktBBCCCGEiCIsLExTzvDatWtYWVmRPn160qZNC8Ds2bP5+++/qVWrFpMnT9Ycd+LECbZs2cLvv/+uSVYLEZ8HDx5QtmxZjh07RunSpVEqlVSrVo0LFy7QvXt3VqxYAUBISAgWFhYcPXqUunXrMnXqVPr164eDg0OU/iQZrR+RE9L79u3jzZs3BAcHM3DgQADmz5/P/PnzNWt3Rvx+/vrrL0aNGsXt27fJkiULt27dQqFQUKhQIaN9LzGJeJ5E/j6PHz/OjBkzgPDXM4Ddu3ezePFi0qdPz7x588iYMSNnzpzB1taWkiVLGi1+IYT40R04cIDmzZtz7tw5SpcurUn+bty4kU6dOpEqVSrOnz9P0aJFAf3ORI58rdGzZ0+8vb35448/cHFx0bSJfL6XL19y584dZsyYwefPnzl16hRp0qTRSyxCCCGEECmJ3K0RQgghhBAaEWvFqlQq6tSpQ7du3ShfvjwDBgxg7969APTt25eKFSty9OhR/vrrL82xNWvWZOXKlaRKlQoZ9yi05ejoqElIq1QqAKpXr07v3r25cOECmzZtAsDMzAylUknt2rVZtGgR48aNY9u2bSiVyij9SUI68SLfSO/Zsyfjxo3j3r17+Pr6an5HVatWJTAwUPO6EDGQpVChQtjZ2fHx40cAChcunGwS0iEhIUB4JYiI54mXl5dmf82aNenfvz/v37+nU6dOADRr1ow2bdrw6tUrRowYQUhICFWrVpWEtBBCGFnx4sVp3Lgx/fv35927d5rZyGq1mgULFjB06FBNQhrQa2nsiPeQY8eOcffuXSZNmhQlIR35fAEBAZw5c4ZZs2aRJUsW3NzcSJMmTbTrFyGEEEKIH4GZsQMQQgghhBDJh4mJCR8+fKBRo0bY2tqyZ88ePn78yKhRoxg3bhxp06alYsWKDBgwAB8fH1asWEGaNGlo1aqVpg9ZE0/owtnZGWdnZz59+kSbNm1YvXo1kydP5vHjx4SFhTFjxgyyZctGpUqVCAkJwdTUlH79+uHm5sanT59iLDMvEkehUKBSqWjevDkPHjxg586d5MiRAysrK00bZ2dn6tWrx59//knWrFkpXrw4AIGBgdjb22Nvb2+s8GN07Ngx+vfvj7u7O5aWlgAsXLiQVatWcerUKdKmTYtCoaBu3bp8/PiRKVOmMHnyZCZMmEDPnj15/PgxoaGhmuS7EEII48qYMSM9e/Zk6tSpVKpUiV69ehEaGsq0adNYu3YtgwYNAvRfQUWtVhMUFESFChWws7OjQoUKFChQINbz2NjYULt2bYoWLUrhwoWBqFWJhBBCCCF+JDKNQAghhBDiB/ftrOa9e/fi5OTEsWPHyJ49O0ePHuXy5ctYWVkxbNgwXrx4QbZs2fjll1/ImjWrpqx3BElIi4R4/fo1b9++5eeff0alUpErVy66du1Kvnz5GDlyJM+fP8fCwoK1a9dy5MgRli9fzujRo40d9ndr3759PHv2jK1bt1KgQAFNQjri9SJLlix06tQJS0tLmjVrxpQpU5gzZw7t27enVq1a5MiRw5jhR+Pi4oK/vz8tW7bUbHN2dsbBwYH+/ftrtllbW9OsWTMqV67M/Pnz2bx5MwCTJ09mwYIFMhNfCCEMLKIix7ciX69GfP3TTz+xYsUKKlasyK5du9i6dSsLFiygRYsWmrb6eN2OOF/EwEtra2saNmzI+fPnefPmDWFhYXGex9nZWZOQjqhKJIQQQgjxI5I1pYUQQgghBABv374lQ4YMBAYGcurUKerXr0+fPn04evQomzZt4s2bN3Tv3p1atWqxbt06rK2t+fLlS7Q1fYWIj1KpjDbDWa1Wc+bMGfr160euXLk0ZaEPHTrE/PnzefbsGeXKlWPz5s1cuXKFUqVKAbKGtKH88ssv3L59m3///TfavsjVENzc3Fi7di1nzpzB3t6ehg0bMnz48KQOVyvnz5+nUaNGtG/fnsWLF6NUKlm3bh1//PEH1atXZ8GCBZq248ePZ/HixZiamnLz5k0yZswozzMhhDCwyO/pW7Zs4dKlSwQGBtK+fXtKliyJnZ2dpu23lXnCwsLw9/fXXJfq6/og8jVLcHCwptoGQOfOndm7dy9///031apVk4GZQgghhBDxkKS0EEIIIYRg5syZvHr1isWLF2u2Xbt2jd69ezNv3jyqVKnCixcvqFGjBp8+faJ///5MmTLFiBGLlCryTeLVq1djaWkZpTz3vn376N+/P23btmX+/PkAXLx4kZ07d+Ll5cX06dPJnDmzlIk3sFatWvH582eOHj0aaxtPT0/NGprBwcGEhoZia2ubVCFq5dukxObNm+nUqRPz589n4MCBfP78mWXLlrFu3Tp69OjBkCFDUKlUjBgxAldXV+rUqUOePHmM+B0IIcSPZ+jQoaxatYqGDRvy6tUrbty4weDBg+nduzcZM2aM0taQ1wOR30NmzpypWQKiePHiDBw4EIDSpUujVCrZuHEjBQoUMEgcQgghhBDfC6kXI4QQQgjxA4oYlxhxE+/Bgwd8/vxZs0+hUODh4cG9e/c0SacHDx5Qvnx52rdvT7169YwSt0j5Im7u/vzzz5w7d45MmTJx79491q5dS7t27ahbty7jx4/n119/JU+ePPTu3ZsKFSpQoUIFTR8xzbQW+mVpacmTJ094+fIlWbJkibbf19eXHj160KFDB9q0aYOlpWWU2WPJQeTnibu7O1mzZqVdu3Z4eHgwZMgQcufOTb169Wjfvj3+/v6MHz+eQ4cOERAQwLt37zh9+jSZMmUy8nchhBA/lvPnz7Nnzx6OHz9O6dKlAZg3bx4bN24kTZo09O3bFwsLC017Qw5Qi7hmadCgAQ8ePKBHjx74+fkxatQoPDw8mDt3LqdPnyZ37tyMHTuWRYsWkTlzZoPFI4QQQgiR0kn9MSGEEEKIH0xE0jnymn2Ojo6aG28RN/cyZsxIiRIl6NOnD7NmzaJVq1YUL15ck5BWKpVJH7xIsSI/3zw8PLCwsODu3bscOnSIsWPH0rVrVw4ePEiqVKlo1aoVgwYNYsCAARw+fDhKP2q1WhLSBhTxexo0aBCenp7Mnz+fsLAwzUCWiP13797FxMSEIkWKGC3WuKhUKs3zpGPHjgwaNIgDBw6gVCrp168fPXr0oHXr1jx48IDMmTMzePBgNmzYQObMmSlRogTXr1+XhLQQQiSBbws4fvz4EZVKRfr06TXvOcOGDaN8+fKsWLGC0NDQJI3vn3/+4cuXL5w
7d45Ro0ZRuHBhQkNDsbGxISwsjFSpUnH8+HH27t3LhAkTCAwMTNL4hBBCCCFSEpkpLYQQQgjxg1EoFLi5uTF+/HjKlClDtWrVcHFxwc3NDT8/P0353bJly9KlSxf+/vtvdu/ezdSpU+nfvz8giUGhOxMTE+7cucOTJ0+4ffs22bNnx9nZGYCJEyfy4sULunTpwokTJyhcuDBdu3blyZMneHt7R+lHSnYbVsTglAIFCjB8+HCmTp1K2rRp6dChA9myZcPPz48bN27QrVs3mjZtmmxLlZqYmBAaGkqjRo14+/YtK1aswNXVFVNTU+zs7Jg0aRLPnj2jQYMGuLm5kSZNGn7++WcaN24sr21CCJEEIkpjKxQKXrx4gZOTk+Ya9MOHD5iYmGBiYkJgYCDW1tb89ttvZM6cmcuXL1OjRo0ki/PKlSvY29vj4uLC5MmTmTt3LsuXL6dbt26o1WrevXtHgQIF2LFjB58/f8ba2jrJYhNCCCGESGlkTWkhhBBCiB/At+vt7dixgwsXLnD+/HlevXqFhYUFQUFBjB07lurVq5M1a1ZSp06tae/t7Y2joyMQfY1WIWIT+bmydetW2rVrR6FChbh9+zZ169Zl/fr1pE2bFghfk7hevXp4e3tz4MABMmbMSEBAADY2Nsb8Fn5onp6e/P7778yePRtXV1eyZs2KtbU1169fp0ePHkydOtXYIcbp+PHjDB06lL///pucOXNG2//o0SNatGiBtbU1//77rxEiFEKIH1Pk69K1a9cyZcoUWrVqxYwZM1Cr1VSuXBkIL+Ud4erVq3Tq1Il//vmH3Llz6zUelUoV64DLtWvXcvHiRfz8/Dh9+jQbNmygZs2aAGzZsoWHDx8yevToZLeEhRBCCCFEciR3E4UQQgghvnNhYWGaG3/BwcEAtGzZkoULF3Ly5EmuXr1Ks2bN8PLy4uTJk1SsWJFSpUrRtGlT9u/fD6BJSKvVaklIC61FPFfu3LnDzp072bp1KwcOHGDRokUcPnyY3bt3ExQUBISvYbxlyxYeP37MlClTADQJaRlHq1+RS6nHxcXFhZkzZ7Jnzx7atWtH+vTpqVixIlu2bEnWCemI58ujR48IDAzUvH59K3fu3Pz55588ffqUmzdvJmGEQgjxY4u4Lp09ezYDBgxgypQptGnTRrNv6tSpeHh4UK9ePY4ePcr169f57bffcHR0JH369HqL4/nz50DUCkCXL1/m0aNH+Pj4AJAtWza2bdvGuXPn2L9/vyYh/fbtWzZs2BDlOlsIIYQQQsRNZkoLIYQQQnzHIs9U7devH15eXvj6+tK+fXtatWqFubk5AGfOnKF169a4u7vz4sUL3NzcuH37NnPnzsXMTFZ8EboJCwvTPG8mTJjAP//8Q9q0adm1axcODg4AjB49mkWLFrF161bq1aunaf/s2TOyZMkizzsDUSqVmhvv//33H7a2tlhZWeHq6ppiqyBE/p4iW7NmDQMHDuTZs2ekTZtW87xUqVQcOHCAfPnykTt3bnx8fLC3tzdC5EIIoZtZs2ZpZhkPHDgwRZeK/vTpEy1atKBDhw5069Ytyj6lUombmxs9evTAy8sLCwsLsmXLxsGDB7GwsIhWASghHjx4QPHixZk2bRpDhgwBoHHjxly+fBlzc3MsLCxYvHgx9evXZ+HChYwdO5Zff/2VYsWKYWpqyvDhw8mUKRMHDx6UaxYhhBBCCC1JUloIIYQQ4jvn6+tLtWrVMDExYdCgQVy5coVDhw7RsGFDFixYAMDJkyfp0qULDx8+xMrKKsrxKTVRJYzr3bt3bNu2jVKlStG+fXuUSiUHDx6kUKFCmjbNmjXDzc2NjRs3Uq5cuSg3mCMntkXiqdXqKJUOevXqxfHjxzExMSEgIIANGzYk6Rqd+hL59Wnt2rUoFAqKFi1KsWLFCA0NpXjx4mTIkIHjx49rjnn69Cl9+/alR48eNG/e3FihCyGE1r58+ULVqlUxNTWlYMGCXLx4ERcXF2bMmEGlSpWMHV6C3Llzh6JFi3Lo0CFq1aqlqXIR+VrAz8+PT58+4e3tTdGiRQH9XR+8f/+eefPmsXLlSrZu3YqPjw8LFizgzz//JDAwkKVLl3LkyBH+/PNPmjdvzvjx4zl79ixubm7ky5ePEiVKsGTJEiD2wVFCCCGEECIqucsjhBBCCPGdiphFsmHDBhwdHTl27BgKhYK3b9/i6elJiRIlNG2LFClCQEAAt2/fplSpUlH6kIS00FVAQAC1a9cmXbp0DBw4kEWLFvHLL7+wZcsWBg4cqCm9uXv3bnLkyEGfPn04c+aMZhY1IAlpPXn16hWZM2dGoVCgUChQqVS0atWK+/fvs2PHDrJkyUL//v1p27Yt27dvp2rVqsYOWWsRCWmVSkWVKlXw9PTEx8eHnDlz0q5dOwYOHMjChQvp2LEj1atXp379+qROnZqpU6dSokQJSUgLIZI9tVqNSqVi6NChZMqUiQMHDgAwdepUJk+enCKWt4htVrOZmRn58+fHw8NDk9SN+Pf48eNkypSJ/PnzY2trS9asWYHw5K++rg+cnZ3p168fHz58oGPHjpQrV45evXppkt/lypWjRYsWjBs3jp9++okpU6bg4+ODt7c3ZmZmZMqUSROTJKSFEEIIIbQjdxiFEEIIIb5TETcAX7x4Qf78+VEoFHTr1o25c+eyY8cOOnbsSGBgIB4eHnz8+BFzc3PSpk0bYx9CaOvQoUP88ccf1KhRg7179wLQsGFDhg8fzsaNG9m5cyf+/v6a9tevX2fhwoVREtJCP2bPnk3v3r1xd3cHwhMDZ8+excLCgn/++YcSJUpw8OBBjh49ioODAx06dOD+/ftGjlp7JiYmfP78mR07dpAjRw7c3Nw4f/48ZcqUYe3atWzevJmffvqJkydPYmNjw7Zt21i3bh1dunTh77//Nnb4QggRr4jBRM+fP2fw4MEAdO7cmcWLF/P3339TuXJlwsLCjBtkHCInpE+fPs2OHTs4duwYAHnz5iVDhgwsXbqUmzdvAmiSuxs2bGDhwoUolcoo/ek7+Zs1a1YGDx5M5cqV+eeff3B0dATCB9cBbNq0iXfv3rFhwwYA7O3tyZYtmyYhrVKpJCEthBBCCKEDmX4ghBBCCPGdCwwM5MOHDzRs2JBnz55x/PhxChUqRFhYGFu2bCE0NJRevXrh7u5OunTpjB2uSMH8/PzYu3cvy5cvp1q1aqRKlYrQ0FDMzc0ZPnw4T58+ZcmSJaRLl45mzZphZmZG6tSpqVatml7WhxRRZc6cGU9PT1atWsWIESPInDkzefPmpXXr1uTIkYOFCxcye/Zs/vjjD+rXr0+xYsUYMWIEy5YtI2PGjMYOP16vXr2ib9++PHz4kE6dOmFra0vevHkZMGAAAQEBLFiwgHTp0lGrVi12796NmZkZ3t7e0QbfCCFEcuTn54etrS3m5uZ8+vSJhw8fsnTpUh4+fMixY8coXLgwPj4+/PHHH7Rt25acOXMaO+QoIs8gnjhxInPnziVLliya1+7Zs2ezY8cOypYty7Bhw6hQoQJFihRhw4YN3Llzh2PHju
k94RvTrOYiRYrQv39/7t27x8yZM2nYsCE2NjYolUrUajU5c+aMMpguMqkmJIQQQgihG7l6EkIIIYRIwXbt2sW+ffuizSQBNCUde/fuzYEDB7h//z6XL1/WrOn7+PFj/vrrL82NtnTp0qWIMpAi+fj2eWdra0v//v1p06YNN27cwMvLC3Nzc4KDgwFYsmQJ2bNnZ8yYMZrZuxEkIa1/7dq1o2vXrpw7d47ly5fj4+ODi4sLDRs25MuXL+zYsYMJEybQsWNHANKmTcuBAwcYN25csnwt+DamtGnTkj17dj59+sSbN28023Pnzk3Pnj1xdXVlxowZ3Lx5E0tLS0xNTXFyckrqsIUQQmeDBw9m8uTJvHv3DoAmTZowatQoPDw8uH37NoULFwbg2bNn/PPPP9y6dcuY4WpEnrUdkfz98OEDbm5unD59mqNHj7J06VLmz5/PzJkzSZ06Nfv376dkyZJs3bqVJUuWYGZmxu3bt8mdOzcqlUqvsUXEdOTIEa5cuaKpDlK+fHmmTZvG/fv36datm+aYd+/e8ebNG1xcXPQWhxBCCCHEj0yS0kIIIYQQKdjy5cvp0qUL//33X7R9ESUfCxQowNSpUzWzVDdu3Mi2bduoVq0aBQoUYOjQoVGOEUIbEbONAgMDOXXqFAcOHCAkJIRChQoxduxYsmXLRuPGjQGwtLTUJKY3bNhA9+7do6xpLvQv4kZ+//79qV27NocOHWLt2rVAeKLg3r17XL9+nTJlygDhpUpLlSrFv//+y6xZs5Lda0FYWFiUmNRqNVZWVvz66680adKEs2fPsnPnTs3+MmXK0K1bNz5//qwpuwryGieESBnSpUvHihUr2LdvHyqViqZNm1K+fHksLS15+PAhz58/58SJE9SvX5+SJUvStGlTY4fMyJEjGTZsmOaxWq2mXbt2FClSBAsLCwoVKkTWrFnp2LEj8+fPZ+zYsezcuZPcuXMzb9483N3d2bNnD3v37sXW1pawsDC9zURWq9WYmZnh5+dHlSpVGDJkCG3atKFp06acOHECCwsLatasycyZM9m0aROFChWiV69e1KpVi4oVK9KpUye9xCGEEEII8aNTqJPjEHghhBBCCBGjmEoclyxZEjMzM9avX0/evHljPXbhwoWsX7+ez58/4+LiQv369Rk7diwQczlDIeJz584dGjduTIYMGbh16xZVq1alXr169O3bl+PHj9OnTx/Kli3Lxo0bAQgODsbS0lJzvEqlktKXBhTx81UqlXTr1o2nT5/St29f2rZtC0DhwoUxNTWlWbNmrFmzhrJly7J161YjRx1d5NenYcOG8enTJ1KnTk2nTp0oXrw4d+/eZcKECXh7ezNlyhQqVKigOfbs2bNUqVLFWKELIYTWvr3GGzRoEDt37mTp0qU0btyYQ4cOsWjRIo4dO0aePHnw9/enc+fOTJo0CTD+e+rRo0dJly4dxYsX12w7ePAg/fr1I0uWLJw9e1azXalUMnToUDZu3MjRo0cpWbJklL4M8b14enrSoEEDsmTJwurVqwkJCaF169bcunWLM2fOUKRIET58+MCqVasYO3Ysw4YNo23btprvR66VhRBCCCEST5LSQgghhBApROSblZs2bcLOzo7GjRvz+fNn8uTJQ5UqVfjjjz/iXAvWy8sLExMTQkJCyJAhAyA32UTCPH78mEaNGtGwYUPmzJmDj48PhQoVImfOnOzatQsbGxt2797NsGHD6NChA3PmzDF2yD+kiL/vt2/f0r17d1QqFYMHD6ZOnTo8fvyYfv36AVC0aFFmz55t5Ghj9+HDBypVqoSzszNZs2bl5cuXXLlyhR07dtCoUSNOnz7NjBkzsLOzY+bMmeTKlcvYIQshhNaOHDnCw4cP6d69OzY2Nprt9evX59mzZ6xbt05T2eLEiRM4OjpiZmZGkSJFgOR1LXfkyBECAwNp2rQpYWFhbNmyhS5duvDXX3/Ro0cPzfWsv78/9evXx9PTk9u3b2NhYWHQuLZs2cKJEydYvnw5JiYmDBkyhM2bN+Pi4sLHjx+5efMmTk5OPHr0iN9//51GjRpRp04dIHn9fIUQQgghUjJJSgshhBBCpACRZ4z06dOHjRs3cvjwYSpUqIBCoeDGjRuUKVOGQYMGMW7cOFKnTh3l+JhmWMe1XYj4nDx5khEjRnDt2jXCwsI0s422b99OsWLFAAgKCmLWrFls27aNS5cuYW9vL883I4h4/XBzc6N///64uLgwevRoSpQoQWhoKEFBQdjZ2Rk7zCi+fW2aOHEiJ0+e5PTp05rEQI8ePdizZw8XL14kd+7cbNmyhdmzZ1OoUCFWr16Nubm5scIXQgitqVQqFi5cyPDhw9mxYwdNmjTBzMwMgNDQUPLly0fOnDmZPXu25v01MmNfy32bsO3RowerVq3i6tWrlCxZkuDgYGbMmMHUqVM5ceIEVatW1cT8/v17fHx89DqQSK1Wo1KpoiWR379/z61bt6hZsybdu3fnypUrrF27FjMzM8qXL0/NmjXZt28fEL2yixBCCCGE0A+plSeEEEIIkcxFTkjXqlWLQ4cOcenSJSpWrKhZN7p48eKsX7+e+fPns2HDBgIDA6P0EdvNSkkQCm2EhYVF2/by5UuyZMlCcHAwRYsW5dOnT5w5c4ZixYrx9OlT9uzZg4WFBQMHDuT69es4ODjI880AlEplvG1MTExQq9UULVqU0aNH4+Hhwdy5c3nx4gXm5ubJLiGtVCo1zxVvb28Abty4Qfbs2TE1NdU8H1esWEGWLFkYPXo0AG3btqVXr14MHz5cEtJCiBRh0aJFDBw4kKFDh9KvXz969erFxYsXUalUAJibm9O5c2dOnjzJ9OnT8fT0jNaHMd9bw8LCNMlfd3d3IPy1uUGDBrRt2xYPDw8sLS0ZOnQo7dq1o1WrVnh4eKBQKFCr1Tg7O5MrVy7N95sY79+/B4iSkH727BlhYWGac9WsWZP79+/j7u7OwoULKVmyJKlSpSJz5swcOHCACRMmAEhCWgghhBDCQCQpLYQQQgiRzJmYmODl5UXu3LlRq9Vcv36dQoUKafYfO3aMkJAQ2rZty8SJExkxYgSHDh2KMZEohK5UKhVmZmaEhoayfv169uzZw8uXL6lWrRoHDx7E2tqan376if379+Pi4gLA3r17OXDgAN7e3jg6OmJlZaWXG84iqsiz0/bv38/Ro0e5cuVKjG0jkhYNGjSgVatWhIaG4ujomGSxaivy9zRjxgzGjx/Phw8fKFy4MDdu3NA8H0NCQgBo164dHh4efPz4EYDevXtTtGhRo8UvhBDaCA0NZd++fWzatIn9+/ezbNkyFi1aRNGiRenTpw93797VtFUoFPTu3Zt06dJp3meTg8ivx/Xr12fSpEkcP34cgPXr12NhYUHXrl3x8fHB3t6eWbNmkTt3bkqXLk1YWFiUZHpi14/eunUrXbt25dKlS5iamhIcHEydOnVo1KgR5cuXZ9u2bZr3jdu3b3Pt2jVKly4NwPPnz6lUqRK3bt1i8uTJiYpDCCGEEELEzczYAQghhBBCiLgFBQUxePBgPD092b59O
2nSpAHCZxBWqlSJcuXKUbFiRSwsLJg4cSIPHz5k8ODBpE2blsqVK8vsVJEoJiYmeHh4UK5cORwdHXn16hWZM2embdu2/PXXX/Tp04d27dqRKlUqlEol27dvZ86cOUyYMAEnJ6co/Qj9UavVmuTtzz//zPXr17G3t+fhw4fMmzePTp06YW9vH+0YhULB0KFDjV7uNTYR31OTJk1wd3dn8uTJfPr0icqVK3Ps2DHGjh3LjBkzNGuP+vr6kjZt2ijPNSGESM48PDwoW7YslStXJnv27Hh7ezNlyhTSp0/PkSNHKFCgAIMGDaJx48Y4OTmxfPlyjh8/Tt68eQHjl+uOYGJiwtu3b6levTqurq5MmjRJkzR3dHRk165dVKxYkSFDhrB8+XJcXFz466+/OHfunKY8ub7Y2try+vVr1qxZg729PQsXLsTU1JTZs2ezaNEifv/9dzw9PRkyZAi1atUib968lCtXjtq1a7Ny5UomTJhAwYIFgfDZ3/qOTwghhBBChJM1pYUQQgghUoC9e/eydOlSHBwc2Lp1K48ePaJGjRpUqlSJ5cuXY29vH+UmWt68ebGxseHs2bPJrjSvSFnevHnD/PnzCQ4OZvbs2Tx+/JijR48yduxYBg0aRFhYGIsXL6ZEiRKkTZuWM2fOsGDBArp3727s0L97gYGBtGzZki9fvrB161YyZcpE37592bx5M3PmzKFjx45YWVlFOSa5JDNiEhHbokWLWL9+Pbt27SJr1qxA+OCcmTNnsmvXLqpUqUKbNm14+/Ytffv2ZcSIEYwcOdLI0QshRNwibr/16tWLFy9ecOTIESD8tbxZs2Z8/vyZxYsXkzlzZrp06cKzZ88ICAhgwoQJ/PLLL5o+ksNreOTX671797J3715SpUql2QfhM7yPHTtGkyZNGDp0KFOnTo2xD31Zu3Ytf/75JzVq1OD169dMmzaNbNmyERISQt++fXnw4IFmIN2NGzeYPn06YWFhNGvWjI4dO+otDiGEEEIIETtJSgshhBBCpBCrV69mzZo1WFpacvnyZYYPH86YMWMwNzfX3NiLKH0bEBDA48ePKVKkiLHDFinYxYsX+e2333j//j2zZs2iTp06APj4+PD777+zYMEC/v33X9zc3Hj06BGWlpbUqlVL87xLLjfPv1c3btxg5cqVDB8+nOzZszNr1ixmz55N0aJFuXz5Mlu2bKFevXopbn3lzp078+XLF/bs2QOEl4g1MTHBx8eH7du3M3PmTFQqFWq1WpOUFkKIlKJNmzaYmZmxceNGzXXb/fv3qV27Njly5GDlypXkyJGDjx8/EhISQqZMmYDk+Z7aq1cvHj9+zLFjx6JVRIn43ubOncv8+fO5c+eOwZeNmDhxImvWrMHMzIzbt29jY2MDhJfoHjp0KH5+fowePZoaNWoA4QMCrK2tga/vNUIIIYQQwnDkaksIIYQQIoXo1q0bjRo14smTJ1StWpWJEydibm6uWZfvy5cvNG/enLNnz2JjY0ORIkVkHV+RKCVLlkSlUnHr1i28vLw02+3t7WnRogWZMmXC3d2dli1bMmbMGIYNGxbleZfcbp5/b3LkyEGbNm3Inj07M2bMYPny5WzYsIGTJ0+SO3dupk+fzoULF4wdpk5CQkLw8fHRJC4iJwns7e0pV64cDx8+5NKlS5w5c0YS0kKIFMfMzIy3b98C4e+TKpWKfPny0aZNG27cuMG0adO4d+8e6dKlI1OmTMn6PVWtVhMcHKwZGBnB39+fHTt2EBAQwPDhw3n06JHBE9IAkyZNonnz5oSGhrJ27VrNdldXV3799VcAJk+ezIMHDwA0CWmQZUaEEEIIIZKCXHEJIYQQQqQg/fv3p3nz5nz8+JG//voLCL+5eefOHYoVK8bnz5+pUqWKpr3cYBOJYWlpyc6dOylWrBhbt27F3d1dsy9Lliz4+vri7e2t2RZRhEmed0nDwcGBypUrExAQwOHDhxk2bBj169fH19cXR0dHrly5wuLFiwkLCzN2qFqzsLCgTp06bNq0iZs3b0Z5Lj1+/Jj58+dz79490qdPryntLYQQKcmYMWM4d+4c8+bNw8TERPM6FxYWRq1atXj48CGrVq3i/fv3Ro40fgMHDuTKlSvMnz8fU1NTzXY3NzfWrl3LrVu3AEiVKlWSDZScNGkSZcqU4e+//2bfvn2a7aVLl6Z79+5Uq1ZNsz63EEIIIYRIWmbGDkAIIYQQQmjPxsaGwYMH8+nTJzZu3EiePHmwsLCgbt26dOjQQZOojiiZKERipU6dmk2bNtG2bVumT5/OjBkzSJMmDSdOnMDPzw9XV1dN2+Q4i+tH8PTpU27evKmZNfzx40dy5szJmjVrcHFx0aw1n1J07dqVw4cP06BBAzZs2ICzszNKpZJOnTqRK1cuSSYIIVK0AgUKMH/+fAYMGICvry9FihRBqVTyzz//sGPHDk6cOMG2bduwsrJiwoQJWFlZGTvkWBUqVIh58+YxaNAgPD09yZYtG2ZmZowePZrevXtTtmxZTdukGrBmb2/PjBkz6N27N6tXryZdunSUK1cOgNatW2vaJcdy6EIIIYQQ3ztZU1oIIYQQIgW6efMm06dP59KlS7x+/Zr58+czePBgQBLSwjCOHj1K+/btAahUqRKPHz+mW7duDBkyxMiRCYC6dety/fp1atWqxdGjR2nYsCFr1qwxdlgJFhAQQJs2bXB3dycoKAhbW1vKlSvHxo0bjR2aEELoxYoVK1iyZAmfPn0iNDSU4cOHM3ToUAC6dOmCv78/q1evxs7OzsiRxm/dunWsW7eO169fkyZNGjp37kzv3r0B4yV/T58+zdixY8mUKRNTpkyJMqBJEtJCCCGEEMYhSWkhhBBCiBTq4MGD/Pnnn4waNUpTsjvy+qtC6Nvy5cuZO3cu7dq1o0uXLri6umrKccrzzrhUKhWDBg0iKCiIwoULM3DgQGOHpBc3b97ky5cvmJubU6FCBWOHI4QQevX+/XuCgoJQqVS4urpqBhaGhIRgYmKSoipd+Pn5adaYTps2LWD869IVK1awadMm1q5dG6WyixBCCCGEMA5JSgshhBBCpGBBQUFYWVmhUqlQKBQy60MYlFqtZuDAgdy9e5dBgwbRuHFjY4ckviGVEoQQIuWIPGM38u25lHg9l1xnH/v5+WFra2vsMIQQQgghBJKUFkIIIYRI8ZLrTUDxfQoNDaVt27Z4e3szZswYatasaeyQhBBCCCFiJdfKQgghhBDJg9TYE0IIIYRI4eQmm0hK5ubmrFixAqVSqSnPKYQQQgiRXMm1shBCCCFE8iAzpYUQQgghhBA6Cw0Nxdzc3NhhCCGEEEIIIYQQQogUQJLSQgghhBBCCCGEEEIIIYQQQgghDEbKdwshhBBCCCGEEEIIIYQQQgghhDAYSUoLIYQQQgghhBBCCCGEEEIIIYQwGElKCyGEEEIIIYQQQgghhBBCCCGEMBhJSgshhBBCCCGEEEIIIYQQQgghhDAYSUoLIYQQQgghhBBCCCGEEEIIIYQwGElKCyGEEEIIIYQQQgghhBBCCCGEMBhJSgshhBBCCCGE
EEIIIYQQQgghhDAYSUoLIYQQQgghhBBCCCGEEEIIIYQwGElKJwPBwcH89ttvBAcHGzsUIYRIFuR1UQghopLXRSGEiE5eG4UQIip5XRRCiKjkdVGI5EWhVqvVxg7iR+fj44ODgwNfvnzB3t7e2OEIIYTRyeuiEEJEJa+LQggRnbw2CiFEVPK6KIQQUcnrohDJi8yUFkIIIYQQQgghhBBCCCGEEEIIYTCSlBZCCCGEEEIIIYQQQgghhBBCCGEwkpQWQgghhBBCCCGEEEIIIYQQQghhMJKUTgYsLS2ZOHEilpaWxg5FCCGSBXldFEKIqOR1UQghopPXRiGEiEpeF4UQIip5XRQieVGo1Wq1sYMQQgghhBBCCCGEEEIIIYQQQgjxfZKZ0kIIIYQQQgghhBBCCCGEEEIIIQxGktJCCCGEEEIIIYQQQgghhBBCCCEMRpLSQgghhBBCCCGEEEIIIYQQQgghDEaS0kIIIYQQQgghhBBCCCGEEEIIIQxGktJCCCGEEEIIIYQQQgghhBBCCCEMRpLSQgghhBBCCCGEEEIIIYQQQgghDEaS0kIIIYQQQgghhBBCCCGEEEIIIQxGktJCCCGEEEJnarWawBClscMQQgghhBBCCCGEEEKkAJKUFkIIIYQQOuux/hr5Jxzm5acAY4cihBBCCCGEECnSmttraLKnCR8DPxo7FCGEEMLgJCkthBBCCCF0dvzeOwC2XX1p5EiEEEIIIYQQImWaf20+T788ZeWtlcYORQghhDA4SUoLIYQQQgidyOxoIYQQQgghhNCdSq2KcXuoKjSJIxFCCCGSniSlhRBCCCGETirPPmXsEIQQQgghhBAiRfnj+h8UXV+UWjtr8dvF31Cr1Zp9kb+Oj1qtZuWtlZx9ddYQYQohhBAGI0lpIYQQQgihtW9vligUsbd97xvEf88/GTgiIYQ+rT7/jNXnnxk7DCGEEEKI786KWysAeOv/ll2PdlFkfRHNvg+BH/gS/EWrfi69ucTv13+n34l+BolTpAxqlYoXnbvwatBgY4cihBBaMzN2AEIIIYQQIuWYefh+lMeLTj7GwdqcXyrniNa2zLQTAGzvVZ4y2dMkSXxCCN0pVWpa/nURNXDD4zMALUtlxs7K3KhxiRRApYLX18K/trCB9AWNG48QQgihJ0qVElMT0yQ736mXp6i0tRJX2l/B2sw6zrae/p5JFJVIzkKePSPg8mUgPEGtMJH5h0KI5E9eqYQQQgghhNaWnXkabdvUA/e47uEd6zEXn3gZMiQhRCJde+HNdY/PmoQ0QIull1CptC8jKX5Q/62CVT+F/7+0AoQFGzsiIYQQItGmX55O5a2VeR/wPsnPXWN7jXjbqJFrtB+dWqnk3YyZmsev+g8wYjTgtWIFr4YMQa1UGjUOIUTyJ0lpIYQQQgihlYO3Yh+R32zJxVj3LTn1xBDhCCH0RBXDGoYP3vny4J2vEaIRKcr1dVEf39phnDiEEMJAXvq+5KH3Q2OHIZLYlvtb8A31ZePdjUl+br9QP2ruqMmBpweA8OWT/EP9NftDlCFMujQpyeMSycvbyVPwP39e89jv5EmCHhjvterDvPn4HjqM37lzRotBCJEySFJaCCGEEEJope+m6wk6LkSp0nMkIkJQqJKlp5/wzMs//sZCxGD9peecehDzLKAYctVCxG1vPwgNjH2/Wg0XfocHhwxzfr/3sKo23NxsmP5TIrUa3t0FZaixIxGCl74v2XxvM0FhQcYORWv1d9en+T/N6XWsF0efH0WtVvPY+zGh8jcldBAUFsQf1//Quv37gPeMPjeaTfc20ed4H8ptLsfzL88BOP/6fNwHix/C523bom172aOHESKJSh0kVXOEEHGTNaWFEEIIIYTBeX4JJL2dFZuueHD3zRdG1c1HahsLY4eV4pWdfoIvgaHMOnyfR9PqYW4qY06FdnyCQrn3xocJe+/E2ub4vXcUyGifhFGJFCc0hsTSzc1Qujv4vgPUYJfh675DI+HK8vCvf/ui/3hOTIKXl8P/L9ZO//2nNF6PYWMz+PwC8taHtluMHZH4gV18fZFex3sB8NrvNSNKjzByRPFTqb8OrLz45iIX33ytDFTCuQTr6q2L6TAhovgY+JFq26sl6NiZV76WZ971aBfDSg3TW+luvxA/UpmnQqFQ6KU/kXT8r1yJcXvY+/cE3rqNdeFCSRyREEJoT+5aCSGEEEL8oK48+8SWKx5671etVrP7+qso28rPOEmOMQcZv+c2W668jDMRJrT3JfDrLJ3cYw/hEySzdkT8AkOUFPntKK2X/xtnu/nHHvLgrZTwFrEI9oWPj6Jvv74ufFbuvDwwL2/UdaYjEtKGEuRj2P5TmsUlwxPSAA8OGjcW8cOLSEgDrL+7ni/BBhiYomd9T/SNdd/199e57Hk5CaMRMdn5cCe1d9bG7YObQfrXRwL4+IvjeogEFCii/Bth/IXxOvf14NMDym8pz+BTg/URmlE9+PRAU+r8R/Fl1+5Y9z1v2RJVSEgSRiOMIfjZM1QBAcYOQ4gEkaS0EEIIIcQP5pN/CNv/e0mrZZf4dfctrjz7FO8xXwK0T3YeufOWodvjvjF0+03yvxFpTCqVmkfvfFHrWD/5+N13Boro+3H87jtOPXhPUKiSoFClscMxCl3KvT/z8jNgJD+IkAC4fwBCvqMy+8owmJE55n2ebrB/yNfH3i+SJqZvPdJPEiBFUqvDS6kLkYzterTLaOe+/u46Ox7uQKVWoVSFXwuo1Wq+BH+h1b5WFF5XmOvvrnPh9YU4+/nl6C+SmDaySZcm4envSYeDHfALSZ7XLPqeifxtUnrP4z1RZvVrY/P98GUuTr48qbe4jKXFvhaMPjeaYaeH6fzZ6XulDkoZSyQEPXjI8zZt+bR+vfzudBBw4wZP69XnaaPGxg5FiASRpLQQQgghxHcsJEzFxSdeBIUqUanCP+i1Xf4vI3e6a9p4fIp/hK02bSK4v/p+E86eXwKT5APzxH/uUGvBWRaffMzrz4EcufNWq/OevB/z2sDJycUnXrRZfokzDz8k+bl9gkL5Zf1/dF1zlXzjD5Nv/GGm7r/Lyfs/VjJfqjQmEbUaVCo4MAy2toM9feI/RqXbTWWjCAmAKU5xt7mx4evXy6oYNp4oIr1ObmqehOdNZpZVhhsbjR2FEHH6NrGWlDof7szkS5Mpur4oxTYUY9TZUVTbXo1KWytx79M9TRtt/HL0F3xDkk9VkfcB73nhkzSDgb4Ef+Hm+5uMODOCp5+fJsk54/IhUD/XlpvubdJ8HaJMfjNOY0pyN9nTRKdYjfn3ZyhHXxylyPoi3Hh/Q7Pt7KuzDDgxAK9ALyNGlvQ+Ll+OWqnEo1s33s2YGWfbgP/+4/XIkQTeuq2fk+vwQcOje3cCb97k3fQZvJ87F3VICCGvXqH080et/DEHD2vD59AhAEJfvzZyJEIkjCSlhRBCCCG+Y5P23aHdisvkG3+YopOOkv3XAzx4F/XGmTYfG7X5bOnxMTxxrVXKNoUNhL74xIs+G69RfsZJRu+6FW3/4/e+zDh4j0/++rlxteHf8Ju
J8449pOLMk/TacI09N6N+6AwOi/5Bfb+7J9dexD/z3ViCQpW0W3GZf59+ovPqK7z4mLQzR889jH5DauX5Z3Rb+x9PP8Q/uyZMqVvCUK1WM3qXO0tPP9HpuOREJi0kwtZ28GcZcAufjcTdveH/XloC7juitv3wEH5zgMmO4BF3WfUoHh+H07OSNpl9bp5u7cMCDRNHTPw/Jt25krO30d+nhBCxO/jsIJ+CEn79VGFLBe5+vKvHiKLzC/Fj8KnBHH52OM52NXfUpOHfDRP1/cTFK9CLSZcmMersKCptrUTHQx05/PwwPY/1NMj54rLy1sooj/UxcLTXsV5R1nHefH8zn4I+EapKBkvkKCL+if7B7LnPc1rsa6F5/DHwY6wxe/h4GLVSgT4tdVsabVunQ500JdP7nejH6VenmXN1TvL4HerJl71749z/ceUq7hcshP/FS3xat45P69dHa6MKCMDn2DFedOiIzz/7eN6yJe9mzDBUyFEEXL/Bi65dUXp9/Wz2adVq7hcpypOfavGwVCmet2kb5ZjQt295VKMGz1q1JvjZsySJUwhhGJKUFkIIIYT4TqlUajZd/rpmtG9wWIwJpmE73PRyE2fntZcArLkQ/4fEpzqUD04O2q24zKHbbwHY9t9L1lx4xnvfr2XRai84y7KzTykx5ZjBSkIvOvE4yuMXH2Oevd586SWd+959/RWnHrwnOExpsJnggSFKOq2+EmVbv83XDXKumEzadyfO88X284xw8YkXBSYcYdNl7WcfXff4zNarL5l1+L7WxyQ3kpNOhAcHo6+5vLM7HPkVdv/ydVtoEPxZ+uvjHV206//dXdjYHE5Phw1N4cSU8LWcDTmS4O0tODfXcP0nlEoFB0eCx0VjRyKEiMW3ibQdD3cw/fJ0zr06Z6SI9Kv1/tasub3GYP2vur2KEx4nGHF2hFbtDTVbevCpwex8uJODz6KuU/8u4B2BSTkICfj9+u9RHutj/eeLb6K/j1TdVpUSG0roXCI7gt7Ld8fS37MvzzT/VttejTb728TYbuq/U/Uaj65ClQlLDu9+tJvm/zTnrf9bvAK9mHxpMktuLomx7ZDTQ6Ksl33w2UFKbCjB2PNjE3Tu5ELp50/AtWs6H/duevRk85tRo3k9YGCUbZ/WRU9ea8Pv3Hmd2r9o146AS3EPwgy6dYugBw81j98MH0HYG0+C3N15Wq8+DytVJuxD0lfeSiyVvz+fd+4k7KN2AynDvL15PXwEvqdOGTgyIZKOJKWFEEIIIb5T6y4917ptxMxcfQgK1c+MvbMPPzB46w1O3n9HaAwzVL38gjl5/52mLHlSmrTvLmWmncAnKJRZh+8TOYQVZw1TvvCpl3+UmcX6KpP+4K0vQ7e70XXNVfKOO0z2Xw9y8bH+StytPv8M19EHyD/hcLT1y2+/9onzWG89zTwHWHPheZz7j8azHnf/zTcIUaoY+7f2pe0CQ/Q/QOHw7beUmXZcq7XgYyPlu43o9s7o26IlebX4BQV8gqXlvz5+dia8nylpw0s3v70NwXpcW1Othj194a9K+uvzN4fw0t6/OcCTBK5p+fkl+HuB+za4sky3Y9/f+z5nVrttNXYEQkQTFBYULWH40vclW+5voe+JvkaKSv/mX5tvsL69g7xj3P7G7w1qtTraLFB9l2cOCA1gw90NuH1wi7VNnZ119HpOCC+f3ft4b0acGRHle3zk/Sha298u/hZrP0FhQfQ42oPC6wpTeF1hzr46q3Msq2+v1vkY0N/vQpt+fr/+u2YG+UPvhzG2CVYG6yWehJh8aTJlNpXhtZ9uZYe3P9jOxIsTeej9kFo7a1F9e3V2PNwR5zEnPE5E2/bPk3+ibQtRhvDS56VO8RjLk7p1edG+g1768j12LMbtrwYOIsw75tebb735dQz3ChfhZY8e0faFenoS+uZNomJ81qQJqoAAgp8+JeC//6LsU3p58ahyFZ61aq0ZVK308zfoUlvq0FA8evXi867dCe7j7bTpeI4bj0eXrvG2DXrwkEflK+Czfz+v+kR6r4z0LX7Ztz/BsQhhLJKUFkIIIYT4jvgHh7H45CNcRx9g0j7tywhO2Hsnzv1H7rxNbGjRRJ5pHNmFx14M2nqDTquvsOfmG7qt/Y9ik45GabPu4nNKTT1Ot7X/seWqR4z9JIUivx2NVpr5hQ7rb+vqfKRk8fAdsd8U1MUr7+jxtlt5mSaLz/NSh+9FrVZz580XAkPC1y8/euct73yCmLxft3KWYUoV/sFhuI4+QPEpx1h0IvpNR12FhMU/UOKjX9w36BJyO9EQyd/eG6/x3jeYVssu4Tr6AMvP6l4aXJebo1K+Owm8vBz1sTZPnFW1Yt/39hb8VVG/azm/dYebm+JvpyvP/7+ObfhZ92MDPsHCQjAnJ+zpHX/7sBBQhsGzs/D6OiwpB3NyhM+y/vcvmOIMf/cG33fhs9dTqr97xb7vNwc4Es8ssdCg8N/H3v7h64cLkUhKlZLSm0rH3zAJKFVKOh7saOwwEiSm2bGb722mzq46FFlfhBIbSnD93fU42yfGnP/mMPvq7DjbeAd78z7gvV7Pu+vRLi68vsDh54cpsaEE7h/c+Rj4kWb/NIvW9uaHmzH2cfXtVUpvKs2/nl9nZvY70Y+A0KivcfHNLt9wd4Pu34AefQgInxUa13XcylsroyReDzw9EK3NLS/jLPFw9+NddjzcQZg6TKeqAlMuTWHKv1MMFlfHQx2p/3d9rnheib+xkUUud60rbWcV+x49ytsJE7Vq++XvvyE06oCY14MG8eXAAR5Xr8HjGjXxPX1a11CjeFCiJE/rN4h1f5C7O/fzF+BRteo8LFWK+/kLcC9ffnxiSbpH8L98hffzF6AOjX/mvtLPH49evbhfuAj+Z87iOTZhM+5V/v582R2e0A5+FP4ZV/n5M582bIwyc1qtVqP8/JlnTZrE2+ebESN4v3AhQffu8WbcOELfxT3QOkLou3e8nTyZ4Ccpd5kpkXJJUloIIYQQ4jsy9cBd5h6NeVR8fOKaHbvo5ONY92koFNzSYfZumWnRR6+rVGrar7zM3ptRR1X7hygZtPUGarWawBAlE//5mkQf+/dtnmixHnBC6Tra2szEcNNQ/3uu3ah1balUarqv+y/GfW6vvlB5tvZlwg7ffkuDP87z85IL/H7iET03XKPs9Oi/47isPPeUXGMPUXDiEc22ecceMmlf3IMmItx8+ZnH732jba+94Ey8x8b1W37vG8THSLO2D93y1CqepDD9oGFLg8dWCnPyvrspuix5spKQzP9HLV6TP+nxJlOoHkqyXlsb9/7fHMBbh6od77UY8PLxSfga2L85wNR0MMUJ1jWCFdW/tpnsCIdHgTIY3LbAvDwwLX34MbcTPhNGI9gPlleHM3Enc/RCpUV1hkuLw7+3kBiW0djWIfx7f3ISbmyA6S7gvh3W1A9fv1yIBBh/YbyxQwDCSwY32dsk1sSlvnwO+myQfmNKRH47M7vz4c6ar030fMv3xAvtrulq7qiJSq3ijtcdzazkiP8XXFugc4lvn+CoVXXaH2xPte3VtD
rWO8ib3y7+Rrcj3WLc/22is+HfDePsL8Hlu/U0U3rf032o1WqdqguMPjc62jalOup7xVK3pXj6GfbadoX7Clrvb615vO3BNk258bhMujSJ7Q+3GzI0zXrw3Y92x8PHeIOd4/Nl375EHR85+ep7Mu4KNUF3tPvsFZs3w4Zrvn7Vuw+Btww/ECLsbdRB9K8HDIw14ex37hwenTvzcfly7hcuwuPadQh+9oyAq1djbP9x2V/4n4laXUEVoPvAvQ9/LIry+FGNGjwsV55306bxslf4AMvgR4+4n78AD8uVj6kLlD4+eG+IOkDm41/LePZzM77s3MWb4dGXeAh+9gxVUBChnp68HjGS1yNG8mrAQLw3b+Fpg4bcK1QYv7O6V48QIqEkKS2EEEII8R3ZciXhpcfarbwcf6M4bPr3BY0W67ae1Ohd7kB44nf9pefkGHMw1rZ7b74h+68HyT/hcLR9zZYYbh3RC491K++69epLgyXJb778rNf+Hr3XX5y7roeX4bv/1pffdZjdrPx/7fNua68y9cC9GNusufAc19EHNG2/de3FJ1xHH6Dpnxf4af5Z3vkEMXKnGwO23EClUvM8nvWiAarlTaf5OvLa2iqVOtrzq8+m69zw8A5fp3rT9VhLyPsFh2m+dh19QK9l0ZNK/803+BwQtYz63Tc+rL7wjKWnnxikRPkPz+c1rKgB/60Jn8WrUsKlJfDPQHhzM3xbUtvZPXHHq9Wwb1D87X4vAoGf4cGh8BnLcfW3NvZZMxqLSsCJyVqHGc3O+Esrxuv6OnhzHU5NA/e4S43Gyetx+M8mNmo1TE6jfX/flmJXhsK9GG547+4BLy6Er19+br7MnhY62/c0cYmUhFKr1XwK+sSTz0+Y9988SmwsYbB1liNbdXuVQfqNKbGpjGMgyq5Hu/R7fh1mXhddX5Q2B6KvZ7z69mqmX56u03kTsk50mCr8+uvXc7/G+XPY/3R/tLLn+o4FiDJDO7HcvdwT3ce3z6UlN5dQe1dtg8wEV6vV+Ib48seNP6Lta7ynsWb2d2zH7nwYw7Ineoyt97GolVZa7W9lsPMl1psRIxN1fMCNG5qvP61dF2dbtZ6vNV8PHqLX/rTlOfG3GLe/7NEzyuNQDw+e1qvPi46deFK/QbRB6R9XrIzWh9eSmNczj0vQvaifdcPefB0MEnT7Np4TJvK0UeM4+/DeFHflooCrV/m8a3f4e+CmTXgtXcrTevV5UKw4j6vXwGffPnz27SPIPdJrSVgYL3v24t2sJBhAKQSSlBZCCCGE+G5EToAZw8cErP+79epLas47Tetl/8ZbQjwuXwJD8dfz93/79Rd+WfcfHVbpnqyvOe9MvOWgE+KZlz/rLj4nKDTuRODKc9qtax0YTz9JofBvR2i8+Dwn78df7jHnmIPREv7+wWE0X3opyray00+w/b9X7HN7E+dAh28tOf2YO2++UHTSUbL/ehDX0QfIMeYgr7yjz+r5eclF1lx4zoFbniw8/jDGGfW9NlyL8jimgR9qtZpbr74QEJJ0f7+6VvSMvB6355dA6v9xTvNYKfW9E25Lu9hnIL++BvsHw9zc4YnGI7+GJzeXVw2f2ZuUvrwCn1eJ6+PR0fjbRJiVDba0CZ+x/DKWUpqqJHy/UybyXMpI7427f0lYH+/vw+KS4T+b3xzgUwyv8V46Vkn59DTqzGptZlmfmAT7Bup2HiG0ENPar4n1580/qbqtKk33NmXtnbV67z82Z17FX50lIR54P4i2LUwd++vTrke7+Hnvzzz21qKyhhY+BX3SSz97Hu+JVjZbn6psrULxDcWptq0aF95ciLe9LmWkvwRrXxEqssPPow+oTagb727E3yge386UjjDn6pxE9x1ZmCqMsefHUmFLhVjbzL82Hw8fjxivo4ecNmwi843/m2jPEf9Qf975a1f+OKmoVSpCPRM/kz3y7OX4PgyEvX2r17WZQ1+/1qpMtr592R2enA19p/2yAiFPn+K1dKnmcfCzmGf0f1yZgAFI8fzcP2+PvyqAWhn/gAHPsWPxP3+ed1Om8uH36ANCYvNpjfavh0IkhiSlhRBCCCFSuKBQJVP236XlX5fib5wMPfngz5Xnib/RVXDiEVxHH+D43XeM+fsWrqMPcPmpbrOcI2u46DzH7yX8pkSrZYb5fUz85w75xsd9c2vqgXtaraMcMVM9sYJClQn+WQWEKHHXoex7zXlnOHn/XZTH+jD279vMPvyABn+cJyhUt9kBf5x8zI5rX5N2vkGhbL8ac9WC0G9uJOx396TR4vNUnXM63vPEtg67oUWO+dvKAfq8YfVd0ebn8uAATMsQdzIwIJGz6z8nvHqGxoKCie/jzc2EHbeqVsw/n4f6u8Efr6Uxl0/UXjyjQN7eDv/5eD2CR9+UyPb/CIdGw5KyUbf/URzefJOYcN+me2huW79+rW3C/NYO+BR/yVUhAEKU2g1YHHxqMAA33t9g8KnBnH11NtHvL8vclyXq+IQyMzEzSL9uH9w0X3sHece4VvC3Hn9+zM///Jzocye0bHVstEkWR9B1drJ3cPhSNx+DtPsMsOjGIvxCDLcMkL5ZmFrofMyN99olshM6Ezwm86/Np+ymsvFWStj/dD8N/m4Q49+rIQarRBZbWfWfdv6k+XrVrVVsf2DY8uExCbp7lycNGnIvX37uFyjI4+o19Np/wOX4B13fz1+AZ81b8Gn9+hj3R14DWRsPypSNv5EBvOzdm8dVq/KwQkXCvL21mgXu9cciTXnu1wP1OBhPH8t8adnFt7PBhUhODHOlJIQQQgghksyaC89Zdd5wN4hLTjlmsL4N4Zf1X9dIbr38X6Y0LUTHctl06uPSk4QnsyM8+RDDep1JKDhMiYVZ3GNQ77+Nvv7ytzquuszTD/4Ehynx8gvBzETBwjbFqFswA2am4f3HlyTXt25r/yN72lTMbVmEtz7GSdR+a+e1V7QqlQWAodvdOHY35iR97rGHuD+lLlbmpnwJDGXAlvAbhR98gwkICcPGIvaPaO99Yp59HxymxNLMNJHfQewiz4Y2/eYpZYxK0inCwyPxt4ngYbjlB1hYCH7TcWbXk1PhpcKbLNLfjOTTupVrjWJyGhj1AqxTf922rUOiQ9Ka10M4vwAqJXDGluKbP5rtnSBzmfA1v+0zwsmpUfdnLgP5G0GZnuGzku/vj7nf5dVg8C1InTX88bl5use2ty8Uag7mVjGX7o7NH8Wg9lQo2AwcMul+XvHDGHhS+5v5Vzyv0P1o+FIBJzxOYGVqxfxq8zE3NSdX6lyktU6rdV+GXh83Lo+8HxGqCsXcxNxg56iyrYrB+o6JLiWutTH09FBudY59jdnAsEDMFGaYm5onyeC3v9z+okuhLlq1NfTvNj4zrszQ+ZhOhzppft5JUcIedJuBDuGVDXoX7R1/Qz0y+fb9OZIi64pESdK3zNNSpxL2ifF55048x403SN9qtZqQp9pV1ILwtaWD7twhTadOUbaHeXvzqGKlWI6K5dyBuq0nry8Ra0ErP33iUfnYZ+1/Sx0SAjY2BD+KvdqE7/Hj2P30U6z7v6WI4
zmnQyeJ7yMOb6dPJ8OYMQY9hxAyU1oIIYQQIoXb/p8eZsLFISFluZOT8Xtux5ogjMlzL3/artDP2m/vE5AwvfZCP+URY1nmWGfnHnnx+nMgXn7hz4MwlZr+m2+Qa+whXEcfwHV0/DN1DOGZl3+0st3GdP2FNyFhKtZceBbv823BsfAyuxP23o6yfeO/cd8oPHEv5tJzLXT8Oeh6f1ep/HqAyTc3QqR8dywMmWg2pBOTYUNT+OIB65vA68SXCdWLWdlgVe3wdam/JLKUeEIc/w0CdHht9vgXPP8/q/Hbm4d398LRsfDf6ugJaYBXV+DYeJiWPvaEtKbt1fB/78XTLi7rGiXsuKPjYHXdhJ9XfJdClaG88v36N6rLjNiIhHSEIGUQfU/0pcfRHlTfXl2ntWV7H0/a5Na3Ft1YpPk6MCzQ6FVF7n68m6jj41q7Wl8W3VhE8fXFeez9mDKbylBnVx0AVtxaYfBzvw14S71d9bRqu/fxXgNHYxgRM+2b7m0aZztjzAqOcOjZISA8cWrIv5kvwV849OxQnIMtvp013uVwF976vzVYTJEZKiEN4bOfnzZoqPNxEeWvQ1694mWv3joldyPzO39BL6XIk4IqJP57ILquwawKSfzyXqFv3iS6j7h4r499bXm1Wq3zDHkhYiJJaSGEEEKIFC5Yj+sCLzz+kOCwr/3pksxNznqs/0+TQN182YPH731jvNmx4uxTqs09rbfzlpl+ghmH7mnV9u2XIFxHH9BbonX24fuovslMf/ANZsOl5/gEJf2aXt+7MJWativ+ZdK++G/8Ljv7lMqzT7L3ZtSbCtMP3sd19AEaLjrHl8Dov6MFx2NeM/bWa91mwvbZeC3+RpFEfhZ9O0skLIFTpdVqNcoEjpwIClUSEBLG5acfoz3HdXZ7N0xyhHWNE9fPt25u1m9/ibGtI9zSMpnz7WzbJJoVpJWXl+HaWv2UE0+I2Nb//pb/R1hdB5b9fybjpT8NF9PObnBmNmxrn/A+Xl2B59onDqP44gGPDVteVaQsLfa1oN7uesz7b16C19+NzaRLk2JMCD7yfhTtXE+/aD8T0BDW3F5Dh4MdKLyuMGU2laHI+iLU2lmLwusKM/rcaL2Xw45P6/2tmXxpslbnVavVhKnCGHZ6GHV31eXWh1uxrkGcGJGvw1/4vGC5+3LC1GGacuMfAj/gH+pPmL4qdsQjSKndQNLIZdRTkojnY3w/zyn/TkmiiKIbeXYkgWGBtDnQhiLrixjsPJW2VmLk2ZHU311f62Ouv79OrZ21ePL5icHiimBT1jhlruPyuGpVPH/7jSc/1cLvTMKXTnr5yy/4X0wZAzef1K0X7zrYoS+1mxwQdPcuj2vUJPA/3T6DRTvf+/d82bkrUX1oQxUcPXke5uXF85ateFSxEj7Hjhl9sJVI2RRqeQYJIYQQQqRYR+68pdeGxH24icmG7mXwDQqj76breu87uRhaKw8Da+YGwssf5x1nuBLUTqksODWiGvZWsZf7KznlmN5npY+qm48+1XJqHtddeJb7b31xtrPk7MjqSV52W2hvQI1cDKudV/N421UPRu2KvdSl+2+143x+RZaQ2e2Xfq2BqUJBmelRE1C5nW05MrgKJjqukdZ+5b/c8PjM9fG1sDI35ckHP8xNTMjqZBPrMZsuv2DyvrsEf7Ne+sGBlSmQ0V6n82vc3hWe2HOtDF0SMdv0W7856K8vfdGmjHdyjDu5GHgT0mSPu82HB/Bnma+PJ3jDZEeDhqUX5jYQGqCfvqwdYZBb+Ezq9/eg4uDwxHfV0WBmBSYyN+J7teneJmZemal5nMUuCy999V/NZ3Wd1ZTOUBqAjXc3MuvqLACGlxpOqfSlCAgLoNuRbno/rz5NrzSdRjm1r1Jw6c0leh7Tz/qgsZXNfuv/lqVuS9n9aHe0fQWcCiR6tnVMCjoVZHWd1ZzwOMGY8ymjXGxa67ScanVKq7YqtYruR7rz37v/4m+cjFzvcB1z04SVKH/t95rUlqkpt7mcnqNKPpxtnMlql5W+xfpqXov06V6+/HrvUxhO/vvxD0B/UrceIc+fJ/pcDi2aJ0lSGiDb5s0EXPsP8wwZCH70mI/Ll0drY+bsjHWJElgVKEDanj2SJC7xfZCktBBCCCFEMqVUqXn3//LPGVNbR9sfplSRa+yhpA7ru+I2oTZH775lxE73JDnf3cl1sLEwQ61W8/JTYJQEnLHKYIvkydXJhtMjqqNUqfkSGEoJLdZ2n9eyKA2KuGBlHvf60vp+rlXJk4713crE2eblpwA2X/GgXZmsZHa0JvuvBzX7GhfNyD9u4bPG57YsyvyjD+hWKTv1CruQKbU1H3yDKT3teJz9d6+UnRF18sb7vUcTkZQGGHIHHDLrdnxM/l0Kh0cnvh99G/4IbJzC10hOly/6LOjQQJiWwTixpQSp0sGI/68rqFKFJ1dVSjCJ9JxbWSs8ARthxBOYkxMRSa0p4bPHG8yD/LqXEBXG9d/b/3jp+5JCaQuR2zF3lH2F1xVOsjjKZCjDpAqTqLdbu5LLydGvZX6lXf52vPB5gUsqFyxMLWJtq++fbU6HnAQpgwhRhvAh8INe+06IPkX7sNRtqbHD0Jp7p/DPDfGtMXzS4ySDTg1KipD0LlfqXGxvtF3r9bNf+72m9f7Weq+QkNwVcipEZrvMdC7YGZdULijVSnyCfQhRhVDAqUCC+pSkdMqS9+YNQl+/xjxjRkyso9+zAXhUrTphb5Om/LsxmaVPT9q+fQl5/pyQFy9w6toFm9JRB26o1WoIC0Nh/vW1RR0WxpO69Qh99YqMs2bi0KRJUocukpgkpYUQQgghkqn3vkGUmRY+K3H/gEo421nyJTAUpVqNqULBvKMPOXzn+/9wI4RInC4VXLEyN+WvM4YvORihSp50VMrlxAffYCrmSkuXNVeT7NwAzUtkprSrI+ntrcjgYEVAiJJUlqY421nhHxzGO58gsqaxgTu7cT7SJ+rBqZwhc2nIUyc84WhmFZ6QVJhAiB84ugIKUIWCmTXYpgv/N9g3PNE7O57ZtMlNic5g5wJnZsbfVgh9y1MPclSDoC9gnTp8YIhVajC3BotU4X93puZgYh7+r62zkQP+sY0+N5oDTw+QK3UuplScglegF5amlnj6ezLx4kRjh/ddqZm1Jic8pDx+SpQ/TX7MTMy45RV7hZuUrqBTQX7K9hOpzFPh6e/JmttrjB1SspU/TX6KOxfn6Iuj9C/WnzRWaXC2ccbazBpLM0vCVGHYW9ijUqsIU4XxqWQ1Y4csRLLh2KkjtpUqYZ4lC+qQUFQ+X8DEBIWFBahUmDo6EvDfNRTm5lgXL0bY+w+oAgMwsbBAFRiIiY0N6rAwLFxdATBJlQpTOzvjflNCktJCCCGEEMmVNrMDhRBCJNwW86mUN9V/SVIhhIH8vAyKtjF2FD+scpvL4R/qb+wwhBDiu7V9RtKsoy7EjyrdkCGk7aWfJTFEwshiPkIIIYQQQgghfkjOCm9jhyCE0MWNjcaO4IcmCWkhhBBCpGTe27YaO4QfnpmxAxBCCCGEEDELClVqvl7V
uRTZnFIRGKIkICQM74BQLj3xYt2lF0aMUAiREpTJngZzUwV33/jgHRCaJOd0sDanYi4n/IKVlHF1ZO7Rh0ly3shalcpM1TzOfAkMxc7KDGtzU0KVKlzTpuLuGx/SpLLA+r+q8DiGGxOlusGnZ1CsPZhbgYkZBHpDwCdIlxdQgIUNWNqBXUbwfRNecvj9PdjcMqm/1YRLmze8DHmGwnB3r7GjET+yUt3AIQtkLBa+ZrdCEV6q28cTXIqAWg339kHpX4wdqfi/vU334vbejcx2mfkU9InhZ4YbO6TvQhqrNBRNV5QHnx7wxv+NscMRIhpbc1tKZShFJttM3Hh/g7sfpeJMXLI7ZCePYx7e+r+lTb42WJtZExAagKu9K5ZmlliYWJDFLgvvAt4REBpA6IxGxg5ZiGTDtkYN0vbqSdiHDyjMzTFNk4awd+9Qh4ZikTMnoa9eofLzw9TBATMXFwIuX8GqQH4Crl3HMlfO8LWrTUwwc3QkzNubwJs3Sdevn7G/rR+elO8WQgghhEimwpQqKs46iaWZKWdGVEOhUETZ7xccRqGJR4wU3ffB/bfamJkoWH/pBTMP3Tf4+bpUcKVa3nRUzZMOtRpMTL7+TotNPspnAyUMJzQswJMPfmy67GGQ/oX+pUllwbVxP2n+7l1HH4j3mLH189OsRCacbC1jbfPMy5/qc0/rK0wNt4m1cbA2j7PNwVue5Hex5+E7X3ptuKbZPrlJQSbsvQNA8aypueHxGUcbc86NqoGtpRmhShW5xx6Ks287KzP+G/cTlmamugW+px/c/P/My7bbIG9d3Y6PycOjyTMx3fdfcM4PoUHhifZvXV4Gh0YmfVwpybj3YBb73xcbm8PjSMtumFqCMtjwcaUkNcbDtbXQ5yJY2Rs7GqGjX8/9yv6n+xlScgjdCnWLsq/wusJJEkNd17r0K9aPTHaZKLGhRJKc0xB2N95NbsfcWrVNqp+t0M7ldpfxDvYmk22meNum5N/dwZ8PktE2I6Ym8V9b+YX40WRPE94HvieVeaofrqpCyzwtaZ67OXYWdngFeuHq4EoaqzQJ6utevvx6jk4YUr67dwi86YZ18WLR7tdEeNm7D36nTydtYEks28YNKCwssCpcmLC3bzHLkAGFQoFaqURhGv01RK1Uhq8NrVAQ5uXFk/oNUPn4kG7QQNL26WOE70AkJZkpLYQQQgiRTJmZmnBuZA1MFMT4AcfcNOYPPfowtn5+XnzyZ+O/32cSM1Nqazb9UhZ7q/AkWu+qOXny3o8d114Z5HzFsqRmS49yWFt8/UD27a+0bsEMbL36Uq/nvTG+Fo6pLDSPB9TIzdqLz6maJx0n779jxblnej2f0J+dvctH+btf1rFklETut04Nr0b2tKni7TeNjUW8bXRhY2HKnUl1Yr0JE1n9wi4AuDh8TYgeH1qF7Glt+egXQtnsaSiXw4nrHt4UzOig+XsxNzXhwdS6DNvuxn53z2j9TmlaiI7lsiXsG1BFGgiij4Q0QJ7a+ulHX3LWhHT5whPSEHNCGqB4R0lKx6Xt1rgT0gAuRaMmpYfegzk5DBvXhE8wOWE3vpNM0bZQsmv4c8+lKFSRGbUp1eSKk+lcsDN5HPNE2zel4hTGXxhv0PPf7Hgz1gSZeyd3FAoF/3r+S4+jPQwahz5om5AGsDGzISAsINHnPNbiGBlSZdA8VqlVmChM8PDxoN+Jfjz3eR7jcems0/Eh8EOiz/+tieUn0jx3c177vabe7nrR9psoTFCpVXo/b2LZmNtgY26jVdtTrU5RfXt1A0ekXw1zNGRG5Rk6HWNrYcuJVicAUKvVFFlfROfz5k+TnwY5GjD3v7k6H5tUCjkVok+xPjhZO5E/TX4CQgP4HPyZzHaZNW2y2mc1YoRCnxw7dMB7Y9zLhihMTLApUTzONtYlSuglKW1TpgwBV64kuh9tZV68CLuffiLs40dMrK0J8fDgVf8BZJw9C5sSsQ8KM3dx0XwdU0L62+1madOS98pl/QUukj1JSgshhBBCJGMWZiax7jM3iX1fYjybUR+FQkFwmJIa+ZzptvY/g5zHGExNFPzZrjh1C7lE2zenZVGDJaX/7lsh3qTdz8Uz6TUp3aCIS5SENEAGBytG18sHgNurz3o7l4AVnUqx5PRjbnh81kt/Zt/8fdcpmCGWluG0SUgDONiYk8vZlsfv/XSKp1Ame1wcrDl2912U7XNbFtUqIR2ZlbkpT6bXB8L/JgGG1Pqa4CjlGj3BZmlmyuJ2JVjQWoX7q8+UyOpIcJgKK3MdZ0Z/q9qv8OgYlDFSAqPuLDg8yrDn6Lhbu3YWNjDIHX6PdCO5xjg4OdUwcaU0eaMnTKKpOBienII31yF7FUjlBI7ZwdtAA4DGeIIWM9ji9e3vPbEG3oQXF8DfCyoN1l+/wujMTczJlyZfjPua5mpKbsfcrHRfybuAdwwsMVCvyeGYEtLjy41nyr9TgK8DOMu5lNPbOQ2le6HuOrXfVH8TP//zc6LO+WfNP6MkpCE86QvhSbR9P+8DwhOKCoUCTz9P/nv3H3Wz12XAyQF8eK3fpHTEIAKAzHaZaZyzMf88+SdKm/b527Ph7ga9njexJpafqFN7U4UeXqP14PfqvzPo1CCt2uqakP6WrteFEf6o8QfONs7kTZOXJ5+fMPPKzETFEZchJYdgbWbN9MvT421bK1stGudsTHHn4thb2Ef5/mwtbLG1sDVYnMmJVZEihDx9ispPt88Q38p97iyPKlfRU1SGk/P4MczSpIkzKe3QvJlWfaXp3Imw9+/jTXDHJ8vKFTwoUjRRfWgjx6GDmNraYpYuHQBmTk4AWOXLR67jxwx+fvH9M8ydTCGEEEIIYXAmJgqm/VxIr33my2Cn+aBtaWZKjXzp9dq/MU1pUpAn0+vHmJCOMLRW9Jk/+qDNzZmyOZw4OawqedLr58ZG0cwOce7Pm8FOL+cRcHRIFWoVSM+yDiWpVyju5DHAwtbF4m1jbx19/HD1vOkSEl40deNJcH/LytyEHb0qMKdF9KRV7QIJe40wNVFoEtK6MDc1oWS2NCgUisQnpAHSZIcRT6D6mMT3pStzGyjbK+nPGxfHbNB4cfjXPU5BlRHGjSelsbKHnqfgty/QOTzBgyKBt11KdoXRL6FA05j3D3sYPpAAYGQikt719TQjLUf18DgmfAr/uyreQRLSP6CCTgVZUH0Bmxts1ltyeFzZcWxruC3GGdKt8rbiULNDXO94Pcr2061O6+XchtKnmG6lSXM55kr0OStlqqRVu4hrVhdbFxrlbIS5iTm/lvk10eePrIBTgWjXxtMqTeNW51vc6HhDs83BIu5rWX058HP8S6REqONaR6e+tSl9nRRqZK3BxvrxJ8R6Fumpl/O1ztuaLHZZ4m3XPHdzWuRpwZX2V8iQKgMmChPKuZSjbb62eokjJuvqrqNrwa78lPWnONu1yduG061OM7/afKplqYaDpUOCE+4pnU35cmTfvo3sO3dgV68uaTp3TnBfEYnO5M4ic2ZMbOKuiJBhjHafH0wsLckwbmyiYzKxsMCqYMFE9xMfy+zZU8z
vSaRMkpQWQgghhEjB2pfNxt5+FfXW3+J20UtPPZiqp5K2RpQnvS0dy7vG265tGf2XWzs3UvuSfTnS2fJPf+1uGsandoG4E4/V8zpr1c/PxTOx+ZeyrO1amnzfeSJb17+lZzPq82R6ffKkD/+5ONtbsbRDSZ5Mr0+/6jljPa5p8UwMi2MAxJL2JUgdQ5ntNV3LxNhe179RXe+nbe1ZHmsLU1LbWFDrmyS0mel38JHSEFUn8jWMv82wB7r/MnTVbKXux5ToGJ5UzZRy12pNVuIq8Zo2L+SpC/kbQ82J0PVw+LrVo15Ao4XhSe4Wa6DnmfDtHfeEz2r+7QvYRfpbtElE+e4yPcBOt4Eq0ThkgbZbwuNIJgkYkfKVcymHWyc3WudrTQGnArG2y2yXGXMT8yjbnKydDB1enMq5lGNoyaGcbnWaW51v4Wz99ZrLRGGChYnuS2kUTvt1beL51eazveF2rY/tVqibZlZ0QmSzT+ASGbFYWTv29yYzEzMut7vM5XaXqZalml7PG5tMtplYUnOJVm3tLHS7FjZTJJ8ipUXTFeVW51txthlQfIBezjWu3DgONjsYb7uJ5ScysfxErM2so2w3UZgwp+ocvcTyrSLpiqBQKEhnk47djaNXk2mbry2nWp1ibLmxRn8tSS4yjA1PqFq4upJ5wQIcmjROUD+pW7bUZ1g6sciVk3y33Ml37268bZ16azdoVGGh32WRtGGeMfYB9vpgX1+LykBCJFLyeWcUQgghhBDJQPQEiaVZyr/BfHBgZa3apbOz5MrYmpSZdkIv582XwY4sabRbcy5CXLM/e1XJQbuyWXFxsCbPuENx9uOqRTnnp9Prk2NM3DeMFkSa1VstrzNqtZplZ58y89D9ePtPadKksuDcyOosOf2Esw8/UDO/M10quNJh5WXefAnStGtWIhMTGxZEoVAQ09LupiYKRtTJx5+nnkTZvvmXshTNkhqI+ffcq0oOfq2fX+e4df0bTUwatE7BDJoS3roMuPjhVB4G9/fH3cbKPu79RduB2+bExVHEeDf/xP9lKgHvvkkCOOWGbkfA2jHmQRGR1642MYGMxcK/zhnH31y77bC5lW6x/bzs6/nK94dLi3U7HsJngve/CubW8bcVQgfLai1LVCI1k20mXvu95lCzQ7zxe0P3o7qVzE6MFbVXRHm8p+kernheoUT6Etha2CZotmVm28zc8gp/LamWpVq0RHxcehVJPlU5Lra9GG9iN2K95rxp8ho8nnlV52FqYkrFTPob5BtZYp7DSe18m/N67/NQs0MxrhcO4TPU4/pbqOtalw13NuDu5a7XmMxMvqZDvl3bfW/TveRwyKHX86V0Zs7OWOaKWq3BqkDsA4Vik2HKZByTOCltXbw4WVetJOz9eyxcXbU+znnwYM3Xef67ysNSpWNuqOtruYkJqFS6HfON9OPG43vseKL6iEvGucl3TXfx/ZCktBBCCCFECqfPiXZOqZJ+tK+hnR9VXafZnM52Vno7984+FfTWF8Cw2nnjXGdcVyYJKJ+sUCjoXTUnPSvn4OF7X3b+94qV5w20XmoSixhAMKNZ4Sjb13cvw0/zz2oez29VLEH9V8iVNs792iSkzU0VhCrVCTq/ho4vGpFbNyueCWc7SwpmtMfJ1jLWY0Qifkfl+kK10WDlAE2XwNWVcGwChAboL7ykVrAZ3NFybevY2GYIT7IX/BlW1NBPXEnhp9/g+rqvj9tuhSxlEze7OSZ5dCspC4BLpHUJ7TMm7LzDH0lCWsRqaMmhzL82X6u2dhZ2+Ib4AnC9w/VEJ/P2Nt2Lb4gvaa3TktkuM2dbn6XKNuOsY2pnYUfNbDUT10mkN2NdEtIA5qa6tTeUnY126jzT2NCcbcJnsSsSNWQvdsmhfPeuxrvibdOxQEccLPVfLj2zXWZOtzpNvd31CAwLjLIvq338FaoW1VxEnZ11CFIGxds2oW51vkWwMhhL0+/7utbE3h6Vj4/W7dMOHIC5szO2NWN+7crz7yV8jh7l7QTt1llP6oQ0QKZ5czGxsYmWkE7bty9eS2KujpD5r6VRHpvaxr60lsJMx9SaqWmik9Lm6bWrdpZQCkNUkBLiG/IsE0IIIYRI4fR1E6VDuaw4fidJ6XZls/JH2+KcGl6NzI66zVQG7db8jc+IOnmxtdTvGFB1YhJdCbChe8zloiE8oZ0vgz3jGuo+Uj45Ojsi9hmIuZztsE7A+sU/5f9aXjdnuqgz16vkSdg6XceGVE3QcZEl5hXDxERBlTzpJCFtSHYZwhPSED6AoEwPGOsJE7yh7+Xwf7VRfZzhYtRVyzWJO77RHzD8AdSeCplK6icmXRRrn/BjbdJApSHhX9tmgLz19J+QTijnSANhSnaNui93nfCy4ZET19+qPQ1SxT3YRvzYuhbqGn+j//un6T8sq7UM907uekmiWppaktb66/PT0cqRYy2OJbrf+MyuMtswHSfwErB13tY6J7ENYWH1hUky81kXZiZmFHMuBoQPumySs0mc7XVdTxrAVKHfpPTORjsp4azb8hp5HKMuGRPTgI8uBbskJqw4OVk7caX9FW51vkVOh/DlbdbXW6/VsWms0nCx7UVKpjfse//3npAGUJibk/vSRfJcuYxN+XLxtk/Xty+pW7TAzNExxv2mqVPj2Eq7Ci15r1/TKVZ9yLpuHeYZYx5wl27gALKuX0em+fNIO6A/qVu2wHn4MLL/sxe7atW06t+2uu4Vo1Jp8XMHsK1RA8yjvm5nXhpzEj3P1Ss6xyGEsUlSWgghhBAihcueLv4yzdpIZ6u/GcLGkt/FntVdSjH958I0LpqR7FqUsI5J0+KZEhXH330r0K96rvgb6sgk0gzXb9f2jWxV51KJPte6bmWonDthidPEWNEp8bEnhLN93Dej2pXVfb3xOS2K0K1idjb3KMvhwVFnZ+XNYMfxoVWZ2rSQTn26pk3F4J++lhpsUNiw64qJhDLAjCsTE3DOF/5v07/ib191hP5jSCqVh4X/m+snGPsWSnaOut8mEWs89v8PyvYO/zpbJe2OaardWqOxqjoamvwJPU8lrh99qjoq6mMLGxh6H5xyQZ0Z0H57eNnwX06GJ6gjZC0Po56Hlx8v1zcpIxbfsTRWaUhrnZYKGSskqLS1tjKkysC4soYdsFMve/JZj/NW51uMK5e0A5SGlhzKrc63uNnxJl0LduXPmn9yq/MtamZN5ExxA/il8C9RHk+tNDXO9jMqzdD5HCYKE+wt4lmuQwf2FvZksk3c55TFNaIu1XC53WXNjHFD29N0D7c636K4c3GtjzE3NWdV7VV0LNDRgJElH2aGWjNYrcbM0RFTe3syL1gQZ1PzbLp/7omLiY3ug8R1O4EJabp1Q2FhgXWJEuQ6cZxUZWMfXA2QqkwZ7OvXJ12/frhMmYLTL79glSdPjG3TDRoY4zl1lXHWrPjbzJtLliV/ks/djTyX/yXdsKHkOnkCu0hJcPMsWTRfm9rZkbZfP51jiTm+mXrpR4j4SFJaCCGEECKFs7U048b4WvSonD1R/RTNkriSbV0quCbqeIAn0+tzd3ICSp
ACxbKk5tCgytTIF3uyVhebfimb4GMzpdZ/KdNZzQtjHqkMeVy3bGvm1/5n8O+v0W8SXhlTk6oJnMmbGANr5qZWgfSJTkxv71WeCjl1S1rFdw+8f/VcFM3swKTGBbXu0zGVBRMaFaBCzrRRfncRcjnbUimekt4xsbP6OnL+z/a6zZZJiJzOsZetE3qQoXD8bb5VrK3+44hNWiPMbKs4GCZ+hg67Yi4N3edi1MfZKoaXyY5P06WQNjfUmwWjPaCJFmsod9fDun3mVlC8Q8JLZGvr11fgkCX+dgCVhkbfZu8CA65B+UjJZlOz8AR1zYnhgwEaLghfCztruQTdkBUiJmvrrk2ycxlqtm6H/B04+PNBg/SdkkTMjjc1MWVoqaFUyWyckulAvMnbDvk76NRfQmfwl3VJ+GeKb6n//19iVM5cOcrjiDW8kzNTE1NGlh4ZbSCBrnY3TuRSIkkg+/btpKpaBdO0eq5Eov76vDFNnTrW5LeJvT05dmv/c8p55LDOoTgPH6Z127R9+0YrqR3BZeoU8t68Qf67d0g/cgT53N1w3bwJ80yJG7ihDUUClsEyc3SMNYFsYmtLXnc3HBo0CO9focDUwYG0PXpEm/FtkTXqoAGnX7rj0KK5zvF8y6FJ3NUihNAX+QQhhBBCCPEdcExlQZ2CGRLVR2JnprQpk4XVXRKWTHSbUJsn0+tjaqLAxsKMRW21Hz0fwTQBHwzjUjFXWp7PbJCwBLOeQmlcNCMrOpXi/pS6tC4d9cNnbL8uXUuGZ3Cw4vnMBlz6tQYNiriwq095nO11mzXfvVLCBkT83qaY5uvZzYswtFb46PRaBdJzY3wtOpRL2Cj9MtnTsLlHOex0+FnEVwbfMZUFe/tXorMeBl9E5po2FfsHVOLSr9qvkZvRIXFVDXT5U784uobey9CLb7T/Zq3HVHoYENLoj8T3EaHFKv31pYu4nqh2GaDrIfhpEoz/CF0PhpfIrj9X+/6tHMAingEXlYZCltLa92lslnYw5DYUiOemYvpC4YlyXVQeCiOeRC35LYQWMttmjrdNdofEDazURWLXqo7JurrrGFVmFFnstRwUoicWJrEvu9M8d+ITFMlBoxyNEnTc5XaXOdz8sKZc9LfquNbRaQ3lDKkS9zlLX+wt7HUqx963aMwVLW50vMHwUsOTpKS9PvUt2pdFNRZxoe0FbnW+pdOxTlZO5HbMHX9DIzNLm5asy5aR5/w58t25TbrBg/XTsTrqYIbsu3aR+a+lpOnSRbMt819LyXvlMiaptK82ZpEtG2kHDoh1f6Y/fo+2zbpYMa36Tj9mDOkGDsCuWjWyrlsXZV/GOXNI3aIFJlZJUO0thmtS65IJKymftn8/cp05E32HSoWJhXZLqblMm4ptzZpkXbsWABNrazJOnUr++/fIMGVyguJymTYtQccJkRCSlBZCCCGE+E6Uck1D5/LZEny8uWnsCYBS2WJeSyqyfBnsEzRLuUfl7DjYmEdJKv+kw0zfCHrOSWscHlw5/kbfcLbTz4djFwcrahVIj1UM6xnHlkS1t0pYAtHFwZo/25WgZDbd1zodn4B1pZd1LEmTYpl4PrMBz2c2oFXpqDdyHVNZMLVpYa6MqcmDqXW16jO3sy3ben5dq+vSGO1LRRqwWmi8CmVywMVB+8EPdQpmYFDN3KzpmrBkWRod1o5PbWP8dSi/e3bpYcD18FLSxTtAYe3W54vTt+WuEyNDYagxXn/9aUWLWWDZKkClweEzeSPE94ectXzUx7bpoG4MpQrL9IQhd+CnifHHkRxViKHMJIQn8LscDE/oJ4QxXyhFitW/eH9jhxCFtdnX91u3Tm64dXJjfrX5Ce7vUttLlEhv+KolaW2iz5o82epklMezq8ymhHMJmuVuxsTy+n39ypgq/koPy2st1+s5AVrkaaHzMSdantDM/t3eaDs2ZtFnAg8vNTzGY/OniXngjYNFwitKRV7bPDH6Fu2LbXyDqb7Rp1ifGLebmZjRuWDnZJNs15a5qTnVslTTa0n05Exhakra3r3009k3SWkzR0fsqlUj/ehR5L9/j/z372m9nvK30vbsGeN2mzJlsK9dO/r2UlEHslu4uuI8cmSUbQ7Nm5Gm09eS7anKliH3pYtk/nMxuc6cwaFRwwTFmhCpW7bExNYWh2bNyHnkMBmmTCZNB90qLURQKBSYp49eLj9VZe3vOZhnyECWPxeTqlz0KgypmzfHNHVqnWJybN+e1M2b6XSMEIkhSWkhhBBCiO/IpCa6rU8bWbnssZc73hhPKeudvcvHuT82VfKkY2DN6CPWrS2iJ2Hjo0uiTRd2VuaY6ZDxblsm8bNkpjQpSImsqelbLfZ1qWPLDTjYGObnoE/mpgqtZ/Y721thaWZKvULR23ermJ3eVcNnwJTK5siRwVUom+Pr89jW0izBidvkzMREwZBaeaieN2Hr/7Uurf1zNL4Z5EJPnHJC1wPh6w6bJnJmeq2EzZCIU5Xh4WsNJxV1QkuTfvN87XU2fE3kUS9g4A1IE8NszHLf3LCvMwPqzwGH+Gd3Jlux/a5MTMG1Ilj9GDfzRfLwU7afjB1CFLlS56J13tb0L9YfE4UJJgoTamWrxfUO13Xua3299TonCROqT9E+1M5Wm4XVF2q2fTvTt172eqyrt45JFSbpfW3ujfU3Ut4l7uv98hkT9nlAn9w7uUdZH9nC1IJ/2/3L4hqLyeuYl7qudbnY9mKsydjplabHuL1hjoQnwEwVun+uiUnx9LpVkjLEIIHkZk2dNVq3bZe/nQEjMSzXHdsT3Yc6wddW8VOYmZF99y5sa9bEIvvXay27n2J//c914uvyKBbZsuHUrSv5brmT9+YN8rrdJGMMM3fNHB2xq1kzxqSuIZk5OZHn30tknD4Ni2zZcGzZEoVZ4q7X03QNX+og3aCBOI8ahUsCZzh/S2Figl097QZ0AziPHEn6Mb/q5dxCaEvqsAkhhBBCfGeq5U3H6QcfdDrm4ugamMSReI1ppm5khTMnbPbA+m5lEnRcTCY1TnhCPj4XRteg7PQTWrVtUDjxa4Z2LO9Kx/KucbaJ7V5jQkqfJzVdkqIRlnYoSYEJhwkIUWq2NS+ZiQIu9nSr5Brr7PTqeZ0pnMmBW6+/xNn/j5R6tTQzZVbzwozapVvZQ6GDpJhROuoFHB4NblsibXsevt6vIXQ9DGvqgVoJn54a5hwRErq+ZeSfe8VB4FL062Pr1LEf12kv3NoJ9pmgTI+EnTs5sU4NQ+/D/HzGjkQILE0tudbhGiU3xlzqdGG1hUkaj0KhYFy5cdG2m5uaUytbLY690L6ccXHnpLvmsrOwY161eTFu9w3xNfj509mkY3nt5dzxukObA20Mfr6EiikZr1AoqJqlKlWzVI33+FyOubjS/gqvfV/z8z8/a7Z3KJCwWZH6FF9SsVWeVowvP54vwV+wt7DX+8CE5KhUBu2WjqqWuRrdC3U3cDSGY5VPD+/nKlXi+4iDVYECZPlzMQDBjx/j/+9lHNu0jrW9eaZMOPXqxcdVq0g3dCgACnNzFObJs0pTYpPQ33IeOQKnbl0xS
6eHZXsSKPvfu7HKL8uyiKQnM6WFEEIIIb4za7qU5hcd1vgtlyMNGROybnIsGhR20VtfusiQyHV24+JsZ6l1W0tz415i53JOmhk737qsQ6nsX+sl7MPvpV9r0r1Sdrb1LMep4dUomNEBhUIRb7n0Pf0qxtv3j3DjLrIWJbUbGKDWpoyy0F3v84nvwzo1/PwXjH0LLdZAzzOGS0hDeKnrAf+FzzhOiEza3TgmTQ4wS2DFh8jrcesyYzxHNWiyGKr/CqbJ82aozuxdoPEiyB9pPdYf7HVOJB8WplH/pjfV38TFthdx6+RGzWzaXz8Y2szKMZTzj8U/Tf8xYCTaM8Qa2XEpmLYg1ztGn1XeKo8elp6IgTGuQ6zNrMnlmIuZlWeSxzEPB34+gJlJwhNSeRzz6DG62EUMtnCwdPihrmsrZaoUbduCaguiPC7mXAxTE/3MWDcKk8T/nettbWotWObKRZoO7eNN5DoPGUy+G9exyps0fyPJiUKhMFxCWstZ8ZKQFsYiSWkhhBBCiO+MQqFgXMMCPJtRX6v2+kxIA7QomfQlTw1VujuCQqFgd98KWrXVpdR3YiS3ssrp7a1Y3C7mGUP5Xeyj/FxSWSbsxp6DtTnjGxagbA4nsqdNpfVxpiaKeGflJ6+fpuGZmig4OFD39dKFlixjKY/cZAn89iV8nWZ9MbeGQs0gYzH99RmfCgN0P6bDLu3a5aqle98R8jaAMr3g5++/ZKlWSnQKLwkvRDLQOm/4jLkRpUZQJF0R7CzskjyhGh8LUwv2NtnLoBKD4mx3tf1VsjtoPwDUkOZWnYupwpTx5cYn2TnNTcxZVGNRlG39ivczyLkcLQ042CoeDXI0YFfjXWS1z5qofhrnbMyIUiMSHU9Egt7cJPrgqfX11v9QiejI/qj+R5THf9b8k/+1d+/BUZV5GsefTqc7aUNISEI66UAuRBAIhBASAgTxkmhAQKOog1wMOLrOTECToDMBF2KV3L0hchPHgVmFQscd1MJxS0REcRUiGJVVEQVXVxdQV4KgXDbp/YMlVBtyT/fbpL+fqhTnvOf06QcqvNV9fud93yu6X6H4sHMPSl/o/zYWa9sL6lGTJrZDkvbnryOjL2S2uKYHCdiS2tavAW3hX5/+AAAA0G4sFoten3GZZo/p2+h5rR212hATIxpmj/H+U76Zic27KTagW6R3g/y/qbnJ9druLbjEJ+/dkDHpLm2afm60wqpJg/TlwtF65e5L9VrZZbq6r1Mbm1ncb2+X9ozRZb3OPY0+MDHS4/gFfq+qVfq6OqvyvnxNv7LhtYKtPnrIosOJTpXy5khjH5O6JJ9pGzpNGuifNwRb7Mo5LX9NY9Nne1z7vpZf+6ygIOmaxdKAhqeLDDihEWdGt5d+bDoJAtysnFnadP0mTe472XSURvWI7KHb+99+3mnFnxvznD4q+kihwd6bnaelhsQP0XuT3tPNl3hnpHJD+kSd++yd3jVdUaFRXnmfHpE9vHJdX7IGWXVr2q1tvk5Xx5nPsb/P+L1H+7pr1vl0Knl/Y7PatGHMBl2Xep1eHfeqRnQbIWuQVf+44R9154Tbww0mbB+RN91kOgIuEFFTipo8J/Gpp3yQBDg/1pQGAADowHp07aQeXTsp9+JoVf98Wj+fqtHUtZV1x21Wi7q2YGrq9pSV1H4jHxIiW7n+aAuN6henV/YcbPD4zFG9G12buz1lJUfpb78bqt888Y6m5qao9Kpe6tTKEcjtqV9ChL5cOLpee3JMmFbf2szpe73AYrHor7cN1p5vqhUbHqITp2s14sGtHscDUdfwkAYLz8snZCok+AKe6tC0S2ec+fOSa6QvXpf6FhqN066C7dKQP0jvrmj/a4dGtP81A13UhV/UwYUvyBKkpM5JpmM0W15SnuYNn6f7tp95UGbGoBnqE+2fU522ZWrp1nKGOXVt6rXaf2S/1hSs8ep7+WrdbH+W0ClBPbv0lCTFOGKUl5inLV9tkXTmoYBAlxadprnD53q0BQcFq2Johd7973dVmFpoJhhgQFBoqCKuu07VL75Y75gjI0Ouhx6UvZvvZ7cDzjJ/1woAAABe1zvu3FSyrohQfVt9QpL06G8ymn2Np387WN8e+UV/+tePGj2vT3wD09b+yr/8tvHplFticIp3Rmf82sJx6croHqktnx7WzgP/U9c+e0xfffhfR1Q0LNknOc7KTo7S3rmjZLMyAVJz9Us4V/BacEN/zfx747/PgSDUdv7C8+h0M+vDdzidYqUB402naH8jF7R/UfpW/1ijFQAkaWTySL30+Usa6ByoKf2mmI7jd+YNn2c6goclVywxHaFRfyn4i57++Glt/Xpr0yf/SuHFhR77/raMj7+6sdeNurHXjaZj+FzCkkf1w5o1OvHBh2f2H3vMcCL4WlBY/Yf2I264Qa75/tVvIzBRlAYAAAgwqyYP0p1P71Jpfi+NSXc1+3WX9jwzZVxTRen4CIdeK7tM+Y9sa/Q8ewsLqf88uo/mvvyJJCkh0qFvjvwiSdpcOqJF12mLCIdNd16WqjsvS5Uk7f/umFyRjgaLer5AQbr1rh3goigtafKQJD386l6drvH91PuAJCk0Upr4N6l7+z2sBABtZbfa9eeCP5uOEfCaW4DNS8zzcpK2yY7LVnZctp75+BktqlzUotcmRyR77AfqDD9ons4jR6rzyJGqPXVKFpuN35cAFDNtmn7Z8x+KuO5a2eLiZIuPV0gf/5ztA4GHojQAAECASe8WqXdmtv6mzebSEbrq0TcbPefi2E5NXqelX46n5qZo697D6hkbrj9cnqrB889MWdfTaW6NsB5dm/57wn+FhQRr5315LX5AoqMJCwnWvnnXqOLFPfrrO/8pSdoxy79v7KKDufcLycrtCQBAfR2toOYIdjR6PCcuRzsO7vBoK0gq8Ni3BdnaPRc6hm4rltdtB9ntBpPApOCoKKU896zpGMB5BfbdFwAAALRYT2e4nv/dUN1bcEmbrtPU7aVVkzLrtm/ITJA1yKJ1tw/R/demKbZzqDb80xBtmj68TRmA2PBQRV7EDRtJKh/VR8nRF+mGzAQ5O4eajoNAMfXfKEgDABrU0aaqbqrI/sfBf9T28dvr9i9NuLTea0oHlSqpc5LKB5d7JSP8T/Qdt0sWi2yJiQ2eY0tMVPiVV/owFQC0HN/8AAAA0GJZyVE6/NPJNl2jqUEPI/vF68uFo3XidI1Cgus/SzmkR3Sb3h+AJ4fdqq33XN7hRiTBi5z9pUNtmAI/Nk1KGtp+eQAAAemqpKtMR2i2guQCVfx7RYPHe3XpJUma0HuC1n+6XsUZxfXOiQuL06brN3ktI/yPPTFRvT+o0s+739dXU6ac95ygkBDfhgKAVmCkNAAAAFolNvzcl97goPofK9ffkdPo65tb+Aq1WSmSAT7C/zW0SNFLTZ+Te3f9tsKV0h1bpds3t38mAECHUpJZUredGXtuJqVZObPqtv+U/SdfRmqTMFuYhic0PdvTzJyZqpxYqbSYNB+kwoXAYrc3+mS366GHfJgGAFqHkdIAAABolTRXRN22Naj+l+NhqTHq
EROm/d8f92UsAICvXBTV9Dldks9tz/pWOn1CCmOmCwBA84zrNU65CblyXuSUxWLR4+8/rh9++UHjLxmvq5Ku0tFTR+UMc5qO2SKzcmbpmr9fU699Qu8JHvuhwSynAk+2BFeDx0Iv6eXDJADQOhSlAQAA0CoOu1Uf3n+1bOcZJX1WQhfHeYvSg5K6eDMaAMC00Egp+3Zp4K3n2uxhZ34AAGiBuLC4uu3pA6fXbcc4YhTjiDERqU26h3dXritXb3/7dl3baze+dsEV1+F79m7dTEcAgDahKA0AAIBW6xxqa/S4233+9v4JEec/AAC48FUcaXR6SQAAAt2iEYv08v6XNTJlpMJt4bJZG/9eBZyV+tpr+iI/36Mt+fnnDaUBgJahKA0AAACvmX7lxdr++femYwAAfGXsYxSkAQBoQkRIhCb0mdD0icCv2LslqM+nn6jm2DGd2r9f9h49ZO3UyXQsAGiWhudaBAAAANoop0e0dt6XV6+degUAdBCOX60rnT7eTA4AAIAAYu3USY70dArSAC4oFKUBAADgVbHhoaYjAAC8Jffuc9sz9ko2+nwAAAAAQH0UpQEAAOBzRUOTTUcAALSHuH7ntsPjzOUAAAAAAPg11pQGAACAz5Tk91TR0GR1CbObjgIAaA+pedL1qyVnX9NJAAAAAAB+zOJ2u92mQwAAAKBj++HYSR098b9KiQkzHQUAAAAAAACAj1GUBgAAAAAAAAAAAAB4DWtK+4GTJ0/q/vvv18mTJ01HAQC/QL8IAJ7oFwGgPvpGAPBEvwgAnugXAf/CSGk/cPToUUVERKi6ulqdO3c2HQcAjKNfBABP9IsAUB99IwB4ol8EAE/0i4B/YaQ0AAAAAAAAAAAAAMBrKEoDAAAAAAAAAAAAALyGojQAAAAAAAAAAAAAwGsoSvuBkJAQVVRUKCQkxHQUAPAL9IsA4Il+EQDqo28EAE/0iwDgiX4R8C8Wt9vtNh0CAAAAAAAAAAAAANAxMVIaAAAAAAAAAAAAAOA1FKUBAAAAAAAAAAAAAF5DURoAAAAAAAAAAAAA4DUUpQEAAAAAAAAAAAAAXkNRGgAAAAAAAAAAAADgNRSl/cDy5cuVnJys0NBQ5eTkaOfOnaYjAYARCxYsUHZ2tsLDwxUbG6vCwkLt3bvXdCwA8BsLFy6UxWJRSUmJ6SgAYMw333yjSZMmKTo6Wg6HQ/3799d7771nOhYAGFFTU6PZs2crJSVFDodDqampeuCBB+R2u01HAwCfefPNNzV27Fi5XC5ZLBa98MILHsfdbrfmzJmj+Ph4ORwO5efna9++fWbCAgGMorRhzz77rMrKylRRUaHdu3drwIABKigo0OHDh01HAwCf27Ztm4qLi/Xuu+9q8+bNOn36tK6++modP37cdDQAMK6yslJPPPGE0tPTTUcBAGN+/PFH5ebmymaz6ZVXXtHHH3+shx9+WF26dDEdDQCMWLRokVauXKlly5bpk08+0aJFi7R48WI9/vjjpqMBgM8cP35cAwYM0PLly897fPHixVq6dKlWrVqlHTt2KCwsTAUFBTpx4oSPkwKBzeLmsTmjcnJylJ2drWXLlkmSamtr1b17d02fPl3l5eWG0wGAWd99951iY2O1bds2jRgxwnQcADDm2LFjyszM1IoVKzR37lxlZGRoyZIlpmMBgM+Vl5fr7bff1ltvvWU6CgD4hTFjxsjpdOqpp56qaxs3bpwcDoeeeeYZg8kAwAyLxaKNGzeqsLBQ0plR0i6XSzNmzNA999wjSaqurpbT6dTatWs1fvx4g2mBwMJIaYNOnTqlXbt2KT8/v64tKChI+fn5eueddwwmAwD/UF1dLUmKiooynAQAzCouLtbo0aM9PjcCQCB66aWXlJWVpZtuukmxsbEaOHCgnnzySdOxAMCYYcOGacuWLfrss88kSR988IG2b9+uUaNGGU4GAP7hwIEDOnjwoMf36YiICOXk5FCHAXws2HSAQPb999+rpqZGTqfTo93pdOrTTz81lAoA/ENtba1KSkqUm5urfv36mY4DAMZs2LBBu3fvVmVlpekoAGDc/v37tXLlSpWVlWnWrFmqrKzUXXfdJbvdrqKiItPxAMDnysvLdfToUfXu3VtWq1U1NTWaN2+eJk6caDoaAPiFgwcPStJ56zBnjwHwDYrSAAC/VFxcrD179mj79u2mowCAMV9//bXuvvtubd68WaGhoabjAIBxtbW1ysrK0vz58yVJAwcO1J49e7Rq1SqK0gAC0nPPPad169Zp/fr1SktLU1VVlUpKSuRyuegXAQCAX2H6boNiYmJktVp16NAhj/ZDhw4pLi7OUCoAMG/atGnatGmTtm7dqm7dupmOAwDG7Nq1S4cPH1ZmZqaCg4MVHBysbdu2aenSpQoODlZNTY3piADgU/Hx8erbt69HW58+ffTVV18ZSgQAZt17770qLy/X+PHj1b9/f02ePFmlpaVasGCB6WgA4BfO1lqowwDmUZQ2yG63a9CgQdqyZUtdW21trbZs2aKhQ4caTAYAZrjdbk2bNk0bN27U66+/rpSUFNORAMCovLw8ffTRR6qqqqr7ycrK0sSJE1VVVSWr1Wo6IgD4VG5urvbu3evR9tlnnykpKclQIgAw6+eff1ZQkOctXqvVqtraWkOJAMC/pKSkKC4uzqMOc/ToUe3YsYM6DOBjTN9tWFlZmYqKipSVlaXBgwdryZIlOn78uKZOnWo6GgD4XHFxsdavX68XX3xR4eHhdeu6REREyOFwGE4HAL4XHh6ufv36ebSFhYUpOjq6XjsABILS0lINGzZM8+fP180336ydO3dq9erVWr16teloAGDE2LFjNW/ePCUmJiotLU3vv/++HnnkEd12222mowGAzxw7dkyff/553f6BAwdUVVWlqKgoJSYmqqSkRHPnzlXPnj2VkpKi2bNny+VyqbCw0FxoIABZ3G6323SIQLds2TI9+OCDOnjwoDIyMrR06VLl5OSYjgUAPmexWM7bvmbNGk2ZMsW3YQDAT11++eXKyMjQkiVLTEcBACM2bdqkmTNnat++fUpJSVFZWZnuuOMO07EAwIiffvpJs2fP1saNG3X48GG5XC7dcsstmjNnjux2u+l4AOATb7zxhq644op67UVFRVq7dq3cbrcqKiq0evVqHTlyRMOHD9eKFSvUq1cvA2mBwEVRGgAAAAAAAAAAAADgNawpDQAAAAAAAAAAAADwGorSAAAAAAAAAAAAAACvoSgNAAAAAAAAAAAAAPAaitIAAAAAAAAAAAAAAK+hKA0AAAAAAAAAAAAA8BqK0gAAAAAAAAAAAAAAr6EoDQAAAAAAAAAAAADwGorSAAAAAAAAAAAAAACvoSgNAAAAAAAAAAAAAPAaitIAAAAAAAAAAAAAAK+hKA0AAAAAAAAAAAAA8Jr/A3IBgZDnnOoHAAAAAElFTkSuQmCC\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Streaming HF space:\n", 
+ "Try out the streaming HuggingFace space at: https://huggingface.co/spaces/facebook/seamless-streaming" + ], + "metadata": { + "id": "Jr0kcQ_mGj8s" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Unity.cpp" + ], + "metadata": { + "id": "OzNNvD5aGr8i" + } + }, + { + "cell_type": "code", + "source": [ + "# unity.cpp\n", + "%mkdir -p ggml/build\n", + "%cd ggml/build\n", + "!cmake -DGGML_OPENBLAS=ON -DBUILD_SHARED_LIBS=On -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=\"-g2 -fno-omit-frame-pointer\" ..\n", + "!make -j4 unity\n", + "# Download seamless_M4T_medium model, converted to ggml format\n", + "# Conversion script: https://github.com/facebookresearch/seamless_communication/blob/main/ggml/ggml_convert.py\n", + "!wget https://dl.fbaipublicfiles.com/seamless/models/seamlessM4T_medium.ggml\n" + ], + "metadata": { + "id": "FFGHgLbaKQ00" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#Launching the console. But google colab doesn't support a console in C program\n", + "./bin/unity --model seamlessM4T_medium.ggml -t 8" + ], + "metadata": { + "id": "ktkvmng1KTKE" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [], + "gpuType": "T4", + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/seamless_communication/__init__.py b/seamless_communication/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seamless_communication/demo/.gitignore b/seamless_communication/demo/.gitignore new file mode 100644 index 0000000..7e2f179 --- /dev/null +++ b/seamless_communication/demo/.gitignore @@ -0,0 +1 @@ +assets diff --git a/seamless_communication/demo/expressive/app.py b/seamless_communication/demo/expressive/app.py new file mode 100644 index 0000000..4b7d869 --- /dev/null +++ b/seamless_communication/demo/expressive/app.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import os +import pathlib +import tempfile + +import gradio as gr +import torch +import torchaudio +from fairseq2.assets import InProcAssetMetadataProvider, asset_store +from fairseq2.data import Collater +from fairseq2.data.audio import ( + AudioDecoder, + WaveformToFbankConverter, + WaveformToFbankOutput, +) + +from seamless_communication.inference import SequenceGeneratorOptions +from fairseq2.generation import NGramRepeatBlockProcessor +from fairseq2.memory import MemoryBlock +from huggingface_hub import snapshot_download +from seamless_communication.inference import Translator, SequenceGeneratorOptions +from seamless_communication.models.unity import ( + load_gcmvn_stats, + load_unity_unit_tokenizer, +) +from seamless_communication.cli.expressivity.predict.pretssel_generator import PretsselGenerator + +from typing import Tuple +from utils import LANGUAGE_CODE_TO_NAME + +DESCRIPTION = """\ +# Seamless Expressive +[SeamlessExpressive](https://github.com/facebookresearch/seamless_communication) is a speech-to-speech translation model that captures certain underexplored aspects of prosody such as speech rate and pauses, while preserving the style of one's voice and high content translation quality. 
+""" + +CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available() + +CHECKPOINTS_PATH = pathlib.Path(os.getenv("CHECKPOINTS_PATH", "/home/user/app/models")) +if not CHECKPOINTS_PATH.exists(): + snapshot_download(repo_id="facebook/seamless-expressive", repo_type="model", local_dir=CHECKPOINTS_PATH) + snapshot_download(repo_id="facebook/seamless-m4t-v2-large", repo_type="model", local_dir=CHECKPOINTS_PATH) + +# Ensure that we do not have any other environment resolvers and always return +# "demo" for demo purposes. +asset_store.env_resolvers.clear() +asset_store.env_resolvers.append(lambda: "demo") + +# Construct an `InProcAssetMetadataProvider` with environment-specific metadata +# that just overrides the regular metadata for "demo" environment. Note the "@demo" suffix. +demo_metadata = [ + { + "name": "seamless_expressivity@demo", + "checkpoint": f"file://{CHECKPOINTS_PATH}/m2m_expressive_unity.pt", + "char_tokenizer": f"file://{CHECKPOINTS_PATH}/spm_char_lang38_tc.model", + }, + { + "name": "vocoder_pretssel@demo", + "checkpoint": f"file://{CHECKPOINTS_PATH}/pretssel_melhifigan_wm-final.pt", + }, + { + "name": "seamlessM4T_v2_large@demo", + "checkpoint": f"file://{CHECKPOINTS_PATH}/seamlessM4T_v2_large.pt", + "char_tokenizer": f"file://{CHECKPOINTS_PATH}/spm_char_lang38_tc.model", + }, +] + +asset_store.metadata_providers.append(InProcAssetMetadataProvider(demo_metadata)) + +LANGUAGE_NAME_TO_CODE = {v: k for k, v in LANGUAGE_CODE_TO_NAME.items()} + + +if torch.cuda.is_available(): + device = torch.device("cuda:0") + dtype = torch.float16 +else: + device = torch.device("cpu") + dtype = torch.float32 + + +MODEL_NAME = "seamless_expressivity" +VOCODER_NAME = "vocoder_pretssel" + +# used for ASR for toxicity +m4t_translator = Translator( + model_name_or_card="seamlessM4T_v2_large", + vocoder_name_or_card=None, + device=device, + dtype=dtype, +) +unit_tokenizer = load_unity_unit_tokenizer(MODEL_NAME) + +_gcmvn_mean, _gcmvn_std = load_gcmvn_stats(VOCODER_NAME) +gcmvn_mean = torch.tensor(_gcmvn_mean, device=device, dtype=dtype) +gcmvn_std = torch.tensor(_gcmvn_std, device=device, dtype=dtype) + +translator = Translator( + MODEL_NAME, + vocoder_name_or_card=None, + device=device, + dtype=dtype, + apply_mintox=False, +) + +text_generation_opts = SequenceGeneratorOptions( + beam_size=5, + unk_penalty=torch.inf, + soft_max_seq_len=(0, 200), + step_processor=NGramRepeatBlockProcessor( + ngram_size=10, + ), +) +m4t_text_generation_opts = SequenceGeneratorOptions( + beam_size=5, + unk_penalty=torch.inf, + soft_max_seq_len=(1, 200), + step_processor=NGramRepeatBlockProcessor( + ngram_size=10, + ), +) + +pretssel_generator = PretsselGenerator( + VOCODER_NAME, + vocab_info=unit_tokenizer.vocab_info, + device=device, + dtype=dtype, +) + +decode_audio = AudioDecoder(dtype=torch.float32, device=device) + +convert_to_fbank = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=False, + device=device, + dtype=dtype, +) + + +def normalize_fbank(data: WaveformToFbankOutput) -> WaveformToFbankOutput: + fbank = data["fbank"] + std, mean = torch.std_mean(fbank, dim=0) + data["fbank"] = fbank.subtract(mean).divide(std) + data["gcmvn_fbank"] = fbank.subtract(gcmvn_mean).divide(gcmvn_std) + return data + + +collate = Collater(pad_value=0, pad_to_multiple=1) + + +AUDIO_SAMPLE_RATE = 16000 +MAX_INPUT_AUDIO_LENGTH = 10 # in seconds + + +def remove_prosody_tokens_from_text(text): + # filter out prosody tokens, there is only emphasis '*', and pause 
'=' + text = text.replace("*", "").replace("=", "") + text = " ".join(text.split()) + return text + + +def preprocess_audio(input_audio_path: str) -> None: + arr, org_sr = torchaudio.load(input_audio_path) + new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE) + max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE) + if new_arr.shape[1] > max_length: + new_arr = new_arr[:, :max_length] + gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.") + torchaudio.save(input_audio_path, new_arr, sample_rate=AUDIO_SAMPLE_RATE) + + +def run( + input_audio_path: str, + source_language: str, + target_language: str, +) -> Tuple[str, str]: + target_language_code = LANGUAGE_NAME_TO_CODE[target_language] + source_language_code = LANGUAGE_NAME_TO_CODE[source_language] + + preprocess_audio(input_audio_path) + + with pathlib.Path(input_audio_path).open("rb") as fb: + block = MemoryBlock(fb.read()) + example = decode_audio(block) + + example = convert_to_fbank(example) + example = normalize_fbank(example) + example = collate(example) + + # get transcription for mintox + source_sentences, _ = m4t_translator.predict( + input=example["fbank"], + task_str="S2TT", # get source text + tgt_lang=source_language_code, + text_generation_opts=m4t_text_generation_opts, + ) + source_text = str(source_sentences[0]) + + prosody_encoder_input = example["gcmvn_fbank"] + text_output, unit_output = translator.predict( + example["fbank"], + "S2ST", + tgt_lang=target_language_code, + src_lang=source_language_code, + text_generation_opts=text_generation_opts, + unit_generation_ngram_filtering=False, + duration_factor=1.0, + prosody_encoder_input=prosody_encoder_input, + src_text=source_text, # for mintox check + ) + speech_output = pretssel_generator.predict( + unit_output.units, + tgt_lang=target_language_code, + prosody_encoder_input=prosody_encoder_input, + ) + + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: + torchaudio.save( + f.name, + speech_output.audio_wavs[0][0].to(torch.float32).cpu(), + sample_rate=speech_output.sample_rate, + ) + + text_out = remove_prosody_tokens_from_text(str(text_output[0])) + + return f.name, text_out + + +TARGET_LANGUAGE_NAMES = [ + "English", + "French", + "German", + "Spanish", +] + +with gr.Blocks(css="style.css") as demo: + gr.Markdown(DESCRIPTION) + gr.DuplicateButton( + value="Duplicate Space for private use", + elem_id="duplicate-button", + visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1", + ) + with gr.Row(): + with gr.Column(): + with gr.Group(): + input_audio = gr.Audio(label="Input speech", type="filepath") + source_language = gr.Dropdown( + label="Source language", + choices=TARGET_LANGUAGE_NAMES, + value="English", + ) + target_language = gr.Dropdown( + label="Target language", + choices=TARGET_LANGUAGE_NAMES, + value="French", + ) + btn = gr.Button() + with gr.Column(): + with gr.Group(): + output_audio = gr.Audio(label="Translated speech") + output_text = gr.Textbox(label="Translated text") + + gr.Examples( + examples=[], + inputs=[input_audio, source_language, target_language], + outputs=[output_audio, output_text], + fn=run, + cache_examples=CACHE_EXAMPLES, + api_name=False, + ) + + btn.click( + fn=run, + inputs=[input_audio, source_language, target_language], + outputs=[output_audio, output_text], + api_name="run", + ) + +if __name__ == "__main__": + demo.queue(max_size=50).launch() \ No newline at end of file diff --git 
a/seamless_communication/demo/expressive/requirements.txt b/seamless_communication/demo/expressive/requirements.txt new file mode 100644 index 0000000..1ccc2a1 --- /dev/null +++ b/seamless_communication/demo/expressive/requirements.txt @@ -0,0 +1,5 @@ +gradio~=4.5.0 +omegaconf~=2.3.0 +torch~=2.1.0 +torchaudio~=2.1.0 +fairseq2~=0.2.0 diff --git a/seamless_communication/demo/expressive/utils.py b/seamless_communication/demo/expressive/utils.py new file mode 100644 index 0000000..4091f7a --- /dev/null +++ b/seamless_communication/demo/expressive/utils.py @@ -0,0 +1,104 @@ +LANGUAGE_CODE_TO_NAME = { + "afr": "Afrikaans", + "amh": "Amharic", + "arb": "Modern Standard Arabic", + "ary": "Moroccan Arabic", + "arz": "Egyptian Arabic", + "asm": "Assamese", + "ast": "Asturian", + "azj": "North Azerbaijani", + "bel": "Belarusian", + "ben": "Bengali", + "bos": "Bosnian", + "bul": "Bulgarian", + "cat": "Catalan", + "ceb": "Cebuano", + "ces": "Czech", + "ckb": "Central Kurdish", + "cmn": "Mandarin Chinese", + "cym": "Welsh", + "dan": "Danish", + "deu": "German", + "ell": "Greek", + "eng": "English", + "est": "Estonian", + "eus": "Basque", + "fin": "Finnish", + "fra": "French", + "gaz": "West Central Oromo", + "gle": "Irish", + "glg": "Galician", + "guj": "Gujarati", + "heb": "Hebrew", + "hin": "Hindi", + "hrv": "Croatian", + "hun": "Hungarian", + "hye": "Armenian", + "ibo": "Igbo", + "ind": "Indonesian", + "isl": "Icelandic", + "ita": "Italian", + "jav": "Javanese", + "jpn": "Japanese", + "kam": "Kamba", + "kan": "Kannada", + "kat": "Georgian", + "kaz": "Kazakh", + "kea": "Kabuverdianu", + "khk": "Halh Mongolian", + "khm": "Khmer", + "kir": "Kyrgyz", + "kor": "Korean", + "lao": "Lao", + "lit": "Lithuanian", + "ltz": "Luxembourgish", + "lug": "Ganda", + "luo": "Luo", + "lvs": "Standard Latvian", + "mai": "Maithili", + "mal": "Malayalam", + "mar": "Marathi", + "mkd": "Macedonian", + "mlt": "Maltese", + "mni": "Meitei", + "mya": "Burmese", + "nld": "Dutch", + "nno": "Norwegian Nynorsk", + "nob": "Norwegian Bokm\u00e5l", + "npi": "Nepali", + "nya": "Nyanja", + "oci": "Occitan", + "ory": "Odia", + "pan": "Punjabi", + "pbt": "Southern Pashto", + "pes": "Western Persian", + "pol": "Polish", + "por": "Portuguese", + "ron": "Romanian", + "rus": "Russian", + "slk": "Slovak", + "slv": "Slovenian", + "sna": "Shona", + "snd": "Sindhi", + "som": "Somali", + "spa": "Spanish", + "srp": "Serbian", + "swe": "Swedish", + "swh": "Swahili", + "tam": "Tamil", + "tel": "Telugu", + "tgk": "Tajik", + "tgl": "Tagalog", + "tha": "Thai", + "tur": "Turkish", + "ukr": "Ukrainian", + "urd": "Urdu", + "uzn": "Northern Uzbek", + "vie": "Vietnamese", + "xho": "Xhosa", + "yor": "Yoruba", + "yue": "Cantonese", + "zlm": "Colloquial Malay", + "zsm": "Standard Malay", + "zul": "Zulu", +} diff --git a/seamless_communication/demo/m4tv1/app.py b/seamless_communication/demo/m4tv1/app.py new file mode 100644 index 0000000..905978b --- /dev/null +++ b/seamless_communication/demo/m4tv1/app.py @@ -0,0 +1,729 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+ +from __future__ import annotations + +import gradio as gr +import numpy as np +import torch +import torchaudio +from huggingface_hub import hf_hub_download + +from seamless_communication.models.inference.translator import Translator + +DESCRIPTION = """# SeamlessM4T + +[SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality +translation, allowing people from different linguistic communities to communicate effortlessly through speech and text. + +This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST) +translation and more, without relying on multiple separate models. +""" + +TASK_NAMES = [ + "S2ST (Speech to Speech translation)", + "S2TT (Speech to Text translation)", + "T2ST (Text to Speech translation)", + "T2TT (Text to Text translation)", + "ASR (Automatic Speech Recognition)", +] + +# Language dict +language_code_to_name = { + "afr": "Afrikaans", + "amh": "Amharic", + "arb": "Modern Standard Arabic", + "ary": "Moroccan Arabic", + "arz": "Egyptian Arabic", + "asm": "Assamese", + "ast": "Asturian", + "azj": "North Azerbaijani", + "bel": "Belarusian", + "ben": "Bengali", + "bos": "Bosnian", + "bul": "Bulgarian", + "cat": "Catalan", + "ceb": "Cebuano", + "ces": "Czech", + "ckb": "Central Kurdish", + "cmn": "Mandarin Chinese", + "cym": "Welsh", + "dan": "Danish", + "deu": "German", + "ell": "Greek", + "eng": "English", + "est": "Estonian", + "eus": "Basque", + "fin": "Finnish", + "fra": "French", + "gaz": "West Central Oromo", + "gle": "Irish", + "glg": "Galician", + "guj": "Gujarati", + "heb": "Hebrew", + "hin": "Hindi", + "hrv": "Croatian", + "hun": "Hungarian", + "hye": "Armenian", + "ibo": "Igbo", + "ind": "Indonesian", + "isl": "Icelandic", + "ita": "Italian", + "jav": "Javanese", + "jpn": "Japanese", + "kam": "Kamba", + "kan": "Kannada", + "kat": "Georgian", + "kaz": "Kazakh", + "kea": "Kabuverdianu", + "khk": "Halh Mongolian", + "khm": "Khmer", + "kir": "Kyrgyz", + "kor": "Korean", + "lao": "Lao", + "lit": "Lithuanian", + "ltz": "Luxembourgish", + "lug": "Ganda", + "luo": "Luo", + "lvs": "Standard Latvian", + "mai": "Maithili", + "mal": "Malayalam", + "mar": "Marathi", + "mkd": "Macedonian", + "mlt": "Maltese", + "mni": "Meitei", + "mya": "Burmese", + "nld": "Dutch", + "nno": "Norwegian Nynorsk", + "nob": "Norwegian Bokm\u00e5l", + "npi": "Nepali", + "nya": "Nyanja", + "oci": "Occitan", + "ory": "Odia", + "pan": "Punjabi", + "pbt": "Southern Pashto", + "pes": "Western Persian", + "pol": "Polish", + "por": "Portuguese", + "ron": "Romanian", + "rus": "Russian", + "slk": "Slovak", + "slv": "Slovenian", + "sna": "Shona", + "snd": "Sindhi", + "som": "Somali", + "spa": "Spanish", + "srp": "Serbian", + "swe": "Swedish", + "swh": "Swahili", + "tam": "Tamil", + "tel": "Telugu", + "tgk": "Tajik", + "tgl": "Tagalog", + "tha": "Thai", + "tur": "Turkish", + "ukr": "Ukrainian", + "urd": "Urdu", + "uzn": "Northern Uzbek", + "vie": "Vietnamese", + "xho": "Xhosa", + "yor": "Yoruba", + "yue": "Cantonese", + "zlm": "Colloquial Malay", + "zsm": "Standard Malay", + "zul": "Zulu", +} +LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()} + +# Source langs: S2ST / S2TT / ASR don't need source lang +# T2TT / T2ST use this +text_source_language_codes = [ + "afr", + "amh", + "arb", + "ary", + "arz", + "asm", + "azj", + "bel", + "ben", + "bos", + "bul", + "cat", + "ceb", + "ces", + "ckb", + "cmn", + "cym", + "dan", + "deu", + "ell", + "eng", + "est", + "eus", + "fin", + 
"fra", + "gaz", + "gle", + "glg", + "guj", + "heb", + "hin", + "hrv", + "hun", + "hye", + "ibo", + "ind", + "isl", + "ita", + "jav", + "jpn", + "kan", + "kat", + "kaz", + "khk", + "khm", + "kir", + "kor", + "lao", + "lit", + "lug", + "luo", + "lvs", + "mai", + "mal", + "mar", + "mkd", + "mlt", + "mni", + "mya", + "nld", + "nno", + "nob", + "npi", + "nya", + "ory", + "pan", + "pbt", + "pes", + "pol", + "por", + "ron", + "rus", + "slk", + "slv", + "sna", + "snd", + "som", + "spa", + "srp", + "swe", + "swh", + "tam", + "tel", + "tgk", + "tgl", + "tha", + "tur", + "ukr", + "urd", + "uzn", + "vie", + "yor", + "yue", + "zsm", + "zul", +] +TEXT_SOURCE_LANGUAGE_NAMES = sorted( + [language_code_to_name[code] for code in text_source_language_codes] +) + +# Target langs: +# S2ST / T2ST +s2st_target_language_codes = [ + "eng", + "arb", + "ben", + "cat", + "ces", + "cmn", + "cym", + "dan", + "deu", + "est", + "fin", + "fra", + "hin", + "ind", + "ita", + "jpn", + "kor", + "mlt", + "nld", + "pes", + "pol", + "por", + "ron", + "rus", + "slk", + "spa", + "swe", + "swh", + "tel", + "tgl", + "tha", + "tur", + "ukr", + "urd", + "uzn", + "vie", +] +S2ST_TARGET_LANGUAGE_NAMES = sorted( + [language_code_to_name[code] for code in s2st_target_language_codes] +) +# S2TT / ASR +S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES +# T2TT +T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES + +# Download sample input audio files +filenames = ["assets/sample_input.mp3", "assets/sample_input_2.mp3"] +for filename in filenames: + hf_hub_download( + repo_id="facebook/seamless_m4t", + repo_type="space", + filename=filename, + local_dir=".", + ) + +AUDIO_SAMPLE_RATE = 16000.0 +MAX_INPUT_AUDIO_LENGTH = 60 # in seconds +DEFAULT_TARGET_LANGUAGE = "French" + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +translator = Translator( + model_name_or_card="seamlessM4T_large", + vocoder_name_or_card="vocoder_36langs", + device=device, + dtype=torch.float16 if "cuda" in device.type else torch.float32, +) + + +def predict( + task_name: str, + audio_source: str, + input_audio_mic: str | None, + input_audio_file: str | None, + input_text: str | None, + source_language: str | None, + target_language: str, +) -> tuple[tuple[int, np.ndarray] | None, str]: + task_name = task_name.split()[0] + + source_language_code = ( + LANGUAGE_NAME_TO_CODE[source_language] if source_language else None + ) + target_language_code = LANGUAGE_NAME_TO_CODE[target_language] + + if task_name in ["S2ST", "S2TT", "ASR"]: + if audio_source == "microphone": + input_data = input_audio_mic + else: + input_data = input_audio_file + + arr, org_sr = torchaudio.load(input_data) + new_arr = torchaudio.functional.resample( + arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE + ) + max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE) + if new_arr.shape[1] > max_length: + new_arr = new_arr[:, :max_length] + gr.Warning( + f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used." 
+ ) + torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE)) + else: + input_data = input_text + + assert input_data is not None + text_output, speech_output = translator.predict( + input_data, + task_name, + target_language_code, + src_lang=source_language_code, + unit_generation_ngram_filtering=True, + ) + if task_name in ["S2ST", "T2ST"]: + assert speech_output is not None + + return ( + speech_output.sample_rate, + speech_output.audio_wavs[0].cpu().detach().numpy(), + ), str(text_output[0]) + else: + return None, str(text_output[0]) + + +def process_s2st_example( + input_audio_file: str, target_language: str +) -> tuple[tuple[int, np.ndarray] | None, str]: + return predict( + task_name="S2ST", + audio_source="file", + input_audio_mic=None, + input_audio_file=input_audio_file, + input_text=None, + source_language=None, + target_language=target_language, + ) + + +def process_s2tt_example( + input_audio_file: str, target_language: str +) -> tuple[tuple[int, np.ndarray] | None, str]: + return predict( + task_name="S2TT", + audio_source="file", + input_audio_mic=None, + input_audio_file=input_audio_file, + input_text=None, + source_language=None, + target_language=target_language, + ) + + +def process_t2st_example( + input_text: str, source_language: str, target_language: str +) -> tuple[tuple[int, np.ndarray] | None, str]: + return predict( + task_name="T2ST", + audio_source="", + input_audio_mic=None, + input_audio_file=None, + input_text=input_text, + source_language=source_language, + target_language=target_language, + ) + + +def process_t2tt_example( + input_text: str, source_language: str, target_language: str +) -> tuple[tuple[int, np.ndarray] | None, str]: + return predict( + task_name="T2TT", + audio_source="", + input_audio_mic=None, + input_audio_file=None, + input_text=input_text, + source_language=source_language, + target_language=target_language, + ) + + +def process_asr_example( + input_audio_file: str, target_language: str +) -> tuple[tuple[int, np.ndarray] | None, str]: + return predict( + task_name="ASR", + audio_source="file", + input_audio_mic=None, + input_audio_file=input_audio_file, + input_text=None, + source_language=None, + target_language=target_language, + ) + + +def update_audio_ui(audio_source: str) -> tuple[dict, dict]: + mic = audio_source == "microphone" + return ( + gr.update(visible=mic, value=None), # input_audio_mic + gr.update(visible=not mic, value=None), # input_audio_file + ) + + +def update_input_ui(task_name: str) -> tuple[dict, dict, dict, dict]: + task_name = task_name.split()[0] + if task_name == "S2ST": + return ( + gr.update(visible=True), # audio_box + gr.update(visible=False), # input_text + gr.update(visible=False), # source_language + gr.update( + visible=True, + choices=S2ST_TARGET_LANGUAGE_NAMES, + value=DEFAULT_TARGET_LANGUAGE, + ), # target_language + ) + elif task_name == "S2TT": + return ( + gr.update(visible=True), # audio_box + gr.update(visible=False), # input_text + gr.update(visible=False), # source_language + gr.update( + visible=True, + choices=S2TT_TARGET_LANGUAGE_NAMES, + value=DEFAULT_TARGET_LANGUAGE, + ), # target_language + ) + elif task_name == "T2ST": + return ( + gr.update(visible=False), # audio_box + gr.update(visible=True), # input_text + gr.update(visible=True), # source_language + gr.update( + visible=True, + choices=S2ST_TARGET_LANGUAGE_NAMES, + value=DEFAULT_TARGET_LANGUAGE, + ), # target_language + ) + elif task_name == "T2TT": + return ( + gr.update(visible=False), # audio_box + 
gr.update(visible=True), # input_text + gr.update(visible=True), # source_language + gr.update( + visible=True, + choices=T2TT_TARGET_LANGUAGE_NAMES, + value=DEFAULT_TARGET_LANGUAGE, + ), # target_language + ) + elif task_name == "ASR": + return ( + gr.update(visible=True), # audio_box + gr.update(visible=False), # input_text + gr.update(visible=False), # source_language + gr.update( + visible=True, + choices=S2TT_TARGET_LANGUAGE_NAMES, + value=DEFAULT_TARGET_LANGUAGE, + ), # target_language + ) + else: + raise ValueError(f"Unknown task: {task_name}") + + +def update_output_ui(task_name: str) -> tuple[dict, dict]: + task_name = task_name.split()[0] + if task_name in ["S2ST", "T2ST"]: + return ( + gr.update(visible=True, value=None), # output_audio + gr.update(value=None), # output_text + ) + elif task_name in ["S2TT", "T2TT", "ASR"]: + return ( + gr.update(visible=False, value=None), # output_audio + gr.update(value=None), # output_text + ) + else: + raise ValueError(f"Unknown task: {task_name}") + + +def update_example_ui(task_name: str) -> tuple[dict, dict, dict, dict, dict]: + task_name = task_name.split()[0] + return ( + gr.update(visible=task_name == "S2ST"), # s2st_example_row + gr.update(visible=task_name == "S2TT"), # s2tt_example_row + gr.update(visible=task_name == "T2ST"), # t2st_example_row + gr.update(visible=task_name == "T2TT"), # t2tt_example_row + gr.update(visible=task_name == "ASR"), # asr_example_row + ) + + +css = """ +h1 { + text-align: center; +} + +.contain { + max-width: 730px; + margin: auto; + padding-top: 1.5rem; +} +""" + +with gr.Blocks(css=css) as demo: + gr.Markdown(DESCRIPTION) + with gr.Group(): + task_name = gr.Dropdown( + label="Task", + choices=TASK_NAMES, + value=TASK_NAMES[0], + ) + with gr.Row(): + source_language = gr.Dropdown( + label="Source language", + choices=TEXT_SOURCE_LANGUAGE_NAMES, + value="English", + visible=False, + ) + target_language = gr.Dropdown( + label="Target language", + choices=S2ST_TARGET_LANGUAGE_NAMES, + value=DEFAULT_TARGET_LANGUAGE, + ) + with gr.Row() as audio_box: + audio_source = gr.Radio( + label="Audio source", + choices=["file", "microphone"], + value="file", + ) + input_audio_mic = gr.Audio( + label="Input speech", + type="filepath", + source="microphone", + visible=False, + ) + input_audio_file = gr.Audio( + label="Input speech", + type="filepath", + source="upload", + visible=True, + ) + input_text = gr.Textbox(label="Input text", visible=False) + btn = gr.Button("Translate") + with gr.Column(): + output_audio = gr.Audio( + label="Translated speech", + autoplay=False, + streaming=False, + type="numpy", + ) + output_text = gr.Textbox(label="Translated text") + + with gr.Row(visible=True) as s2st_example_row: + s2st_examples = gr.Examples( + examples=[ + ["assets/sample_input.mp3", "French"], + ["assets/sample_input.mp3", "Mandarin Chinese"], + ["assets/sample_input_2.mp3", "Hindi"], + ["assets/sample_input_2.mp3", "Spanish"], + ], + inputs=[input_audio_file, target_language], + outputs=[output_audio, output_text], + fn=process_s2st_example, + ) + with gr.Row(visible=False) as s2tt_example_row: + s2tt_examples = gr.Examples( + examples=[ + ["assets/sample_input.mp3", "French"], + ["assets/sample_input.mp3", "Mandarin Chinese"], + ["assets/sample_input_2.mp3", "Hindi"], + ["assets/sample_input_2.mp3", "Spanish"], + ], + inputs=[input_audio_file, target_language], + outputs=[output_audio, output_text], + fn=process_s2tt_example, + ) + with gr.Row(visible=False) as t2st_example_row: + t2st_examples = gr.Examples( + 
examples=[ + ["My favorite animal is the elephant.", "English", "French"], + ["My favorite animal is the elephant.", "English", "Mandarin Chinese"], + [ + "Meta AI's Seamless M4T model is democratising spoken communication across language barriers", + "English", + "Hindi", + ], + [ + "Meta AI's Seamless M4T model is democratising spoken communication across language barriers", + "English", + "Spanish", + ], + ], + inputs=[input_text, source_language, target_language], + outputs=[output_audio, output_text], + fn=process_t2st_example, + ) + with gr.Row(visible=False) as t2tt_example_row: + t2tt_examples = gr.Examples( + examples=[ + ["My favorite animal is the elephant.", "English", "French"], + ["My favorite animal is the elephant.", "English", "Mandarin Chinese"], + [ + "Meta AI's Seamless M4T model is democratising spoken communication across language barriers", + "English", + "Hindi", + ], + [ + "Meta AI's Seamless M4T model is democratising spoken communication across language barriers", + "English", + "Spanish", + ], + ], + inputs=[input_text, source_language, target_language], + outputs=[output_audio, output_text], + fn=process_t2tt_example, + ) + with gr.Row(visible=False) as asr_example_row: + asr_examples = gr.Examples( + examples=[ + ["assets/sample_input.mp3", "English"], + ["assets/sample_input_2.mp3", "English"], + ], + inputs=[input_audio_file, target_language], + outputs=[output_audio, output_text], + fn=process_asr_example, + ) + + audio_source.change( + fn=update_audio_ui, + inputs=audio_source, + outputs=[ + input_audio_mic, + input_audio_file, + ], + queue=False, + api_name=False, + ) + task_name.change( + fn=update_input_ui, + inputs=task_name, + outputs=[ + audio_box, + input_text, + source_language, + target_language, + ], + queue=False, + api_name=False, + ).then( + fn=update_output_ui, + inputs=task_name, + outputs=[output_audio, output_text], + queue=False, + api_name=False, + ).then( + fn=update_example_ui, + inputs=task_name, + outputs=[ + s2st_example_row, + s2tt_example_row, + t2st_example_row, + t2tt_example_row, + asr_example_row, + ], + queue=False, + api_name=False, + ) + + btn.click( + fn=predict, + inputs=[ + task_name, + audio_source, + input_audio_mic, + input_audio_file, + input_text, + source_language, + target_language, + ], + outputs=[output_audio, output_text], + api_name="run", + ) + +if __name__ == "__main__": + demo.queue().launch() diff --git a/seamless_communication/demo/m4tv1/requirements.txt b/seamless_communication/demo/m4tv1/requirements.txt new file mode 100644 index 0000000..34b2c3f --- /dev/null +++ b/seamless_communication/demo/m4tv1/requirements.txt @@ -0,0 +1,6 @@ +fairseq2 +git+https://github.com/facebookresearch/seamless_communication +gradio +huggingface_hub +torch +torchaudio diff --git a/seamless_communication/demo/m4tv2/app.py b/seamless_communication/demo/m4tv2/app.py new file mode 100644 index 0000000..dc9faed --- /dev/null +++ b/seamless_communication/demo/m4tv2/app.py @@ -0,0 +1,370 @@ +#!/usr/bin/env python +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+ +from __future__ import annotations + +import os +import pathlib +import getpass + +import gradio as gr +import numpy as np +import torch +import torchaudio +from fairseq2.assets import InProcAssetMetadataProvider, asset_store +from huggingface_hub import snapshot_download +from seamless_communication.inference import Translator + +from lang_list import ( + ASR_TARGET_LANGUAGE_NAMES, + LANGUAGE_NAME_TO_CODE, + S2ST_TARGET_LANGUAGE_NAMES, + S2TT_TARGET_LANGUAGE_NAMES, + T2ST_TARGET_LANGUAGE_NAMES, + T2TT_TARGET_LANGUAGE_NAMES, + TEXT_SOURCE_LANGUAGE_NAMES, +) + +user = getpass.getuser() # this is not portable on windows +CHECKPOINTS_PATH = pathlib.Path(os.getenv("CHECKPOINTS_PATH", f"/home/{user}/app/models")) +if not CHECKPOINTS_PATH.exists(): + snapshot_download(repo_id="facebook/seamless-m4t-v2-large", repo_type="model", local_dir=CHECKPOINTS_PATH) +asset_store.env_resolvers.clear() +asset_store.env_resolvers.append(lambda: "demo") +demo_metadata = [ + { + "name": "seamlessM4T_v2_large@demo", + "checkpoint": f"file://{CHECKPOINTS_PATH}/seamlessM4T_v2_large.pt", + "char_tokenizer": f"file://{CHECKPOINTS_PATH}/spm_char_lang38_tc.model", + }, + { + "name": "vocoder_v2@demo", + "checkpoint": f"file://{CHECKPOINTS_PATH}/vocoder_v2.pt", + }, +] +asset_store.metadata_providers.append(InProcAssetMetadataProvider(demo_metadata)) + +DESCRIPTION = """\ +# SeamlessM4T +[SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality +translation, allowing people from different linguistic communities to communicate effortlessly through speech and text. +This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST) +translation and more, without relying on multiple separate models. +""" + +CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available() + +AUDIO_SAMPLE_RATE = 16000.0 +MAX_INPUT_AUDIO_LENGTH = 60 # in seconds +DEFAULT_TARGET_LANGUAGE = "French" + +if torch.cuda.is_available(): + device = torch.device("cuda:0") + dtype = torch.float16 +else: + device = torch.device("cpu") + dtype = torch.float32 + +translator = Translator( + model_name_or_card="seamlessM4T_v2_large", + vocoder_name_or_card="vocoder_v2", + device=device, + dtype=dtype, + apply_mintox=True, +) + + +def preprocess_audio(input_audio: str) -> None: + arr, org_sr = torchaudio.load(input_audio) + new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE) + max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE) + if new_arr.shape[1] > max_length: + new_arr = new_arr[:, :max_length] + gr.Warning(f"Input audio is too long. 
Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.") + torchaudio.save(input_audio, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE)) + + +def run_s2st( + input_audio: str, source_language: str, target_language: str +) -> tuple[tuple[int, np.ndarray] | None, str]: + preprocess_audio(input_audio) + source_language_code = LANGUAGE_NAME_TO_CODE[source_language] + target_language_code = LANGUAGE_NAME_TO_CODE[target_language] + out_texts, out_audios = translator.predict( + input=input_audio, + task_str="S2ST", + src_lang=source_language_code, + tgt_lang=target_language_code, + ) + out_text = str(out_texts[0]) + out_wav = out_audios.audio_wavs[0].cpu().detach().numpy() + return (int(AUDIO_SAMPLE_RATE), out_wav), out_text + + +def run_s2tt(input_audio: str, source_language: str, target_language: str) -> str: + preprocess_audio(input_audio) + source_language_code = LANGUAGE_NAME_TO_CODE[source_language] + target_language_code = LANGUAGE_NAME_TO_CODE[target_language] + out_texts, _ = translator.predict( + input=input_audio, + task_str="S2TT", + src_lang=source_language_code, + tgt_lang=target_language_code, + ) + return str(out_texts[0]) + + +def run_t2st(input_text: str, source_language: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]: + source_language_code = LANGUAGE_NAME_TO_CODE[source_language] + target_language_code = LANGUAGE_NAME_TO_CODE[target_language] + out_texts, out_audios = translator.predict( + input=input_text, + task_str="T2ST", + src_lang=source_language_code, + tgt_lang=target_language_code, + ) + out_text = str(out_texts[0]) + out_wav = out_audios.audio_wavs[0].cpu().detach().numpy() + return (int(AUDIO_SAMPLE_RATE), out_wav), out_text + + +def run_t2tt(input_text: str, source_language: str, target_language: str) -> str: + source_language_code = LANGUAGE_NAME_TO_CODE[source_language] + target_language_code = LANGUAGE_NAME_TO_CODE[target_language] + out_texts, _ = translator.predict( + input=input_text, + task_str="T2TT", + src_lang=source_language_code, + tgt_lang=target_language_code, + ) + return str(out_texts[0]) + + +def run_asr(input_audio: str, target_language: str) -> str: + preprocess_audio(input_audio) + target_language_code = LANGUAGE_NAME_TO_CODE[target_language] + out_texts, _ = translator.predict( + input=input_audio, + task_str="ASR", + src_lang=target_language_code, + tgt_lang=target_language_code, + ) + return str(out_texts[0]) + + +with gr.Blocks() as demo_s2st: + with gr.Row(): + with gr.Column(): + with gr.Group(): + input_audio = gr.Audio(label="Input speech", type="filepath") + source_language = gr.Dropdown( + label="Source language", + choices=ASR_TARGET_LANGUAGE_NAMES, + value="English", + ) + target_language = gr.Dropdown( + label="Target language", + choices=S2ST_TARGET_LANGUAGE_NAMES, + value=DEFAULT_TARGET_LANGUAGE, + ) + btn = gr.Button("Translate") + with gr.Column(): + with gr.Group(): + output_audio = gr.Audio( + label="Translated speech", + autoplay=False, + streaming=False, + type="numpy", + ) + output_text = gr.Textbox(label="Translated text") + + gr.Examples( + examples=[], + inputs=[input_audio, source_language, target_language], + outputs=[output_audio, output_text], + fn=run_s2st, + cache_examples=CACHE_EXAMPLES, + api_name=False, + ) + + btn.click( + fn=run_s2st, + inputs=[input_audio, source_language, target_language], + outputs=[output_audio, output_text], + api_name="s2st", + ) + +with gr.Blocks() as demo_s2tt: + with gr.Row(): + with gr.Column(): + with gr.Group(): + input_audio = gr.Audio(label="Input speech", 
type="filepath") + source_language = gr.Dropdown( + label="Source language", + choices=ASR_TARGET_LANGUAGE_NAMES, + value="English", + ) + target_language = gr.Dropdown( + label="Target language", + choices=S2TT_TARGET_LANGUAGE_NAMES, + value=DEFAULT_TARGET_LANGUAGE, + ) + btn = gr.Button("Translate") + with gr.Column(): + output_text = gr.Textbox(label="Translated text") + + gr.Examples( + examples=[], + inputs=[input_audio, source_language, target_language], + outputs=output_text, + fn=run_s2tt, + cache_examples=CACHE_EXAMPLES, + api_name=False, + ) + + btn.click( + fn=run_s2tt, + inputs=[input_audio, source_language, target_language], + outputs=output_text, + api_name="s2tt", + ) + +with gr.Blocks() as demo_t2st: + with gr.Row(): + with gr.Column(): + with gr.Group(): + input_text = gr.Textbox(label="Input text") + with gr.Row(): + source_language = gr.Dropdown( + label="Source language", + choices=TEXT_SOURCE_LANGUAGE_NAMES, + value="English", + ) + target_language = gr.Dropdown( + label="Target language", + choices=T2ST_TARGET_LANGUAGE_NAMES, + value=DEFAULT_TARGET_LANGUAGE, + ) + btn = gr.Button("Translate") + with gr.Column(): + with gr.Group(): + output_audio = gr.Audio( + label="Translated speech", + autoplay=False, + streaming=False, + type="numpy", + ) + output_text = gr.Textbox(label="Translated text") + + gr.Examples( + examples=[], + inputs=[input_text, source_language, target_language], + outputs=[output_audio, output_text], + fn=run_t2st, + cache_examples=CACHE_EXAMPLES, + api_name=False, + ) + + gr.on( + triggers=[input_text.submit, btn.click], + fn=run_t2st, + inputs=[input_text, source_language, target_language], + outputs=[output_audio, output_text], + api_name="t2st", + ) + +with gr.Blocks() as demo_t2tt: + with gr.Row(): + with gr.Column(): + with gr.Group(): + input_text = gr.Textbox(label="Input text") + with gr.Row(): + source_language = gr.Dropdown( + label="Source language", + choices=TEXT_SOURCE_LANGUAGE_NAMES, + value="English", + ) + target_language = gr.Dropdown( + label="Target language", + choices=T2TT_TARGET_LANGUAGE_NAMES, + value=DEFAULT_TARGET_LANGUAGE, + ) + btn = gr.Button("Translate") + with gr.Column(): + output_text = gr.Textbox(label="Translated text") + + gr.Examples( + examples=[], + inputs=[input_text, source_language, target_language], + outputs=output_text, + fn=run_t2tt, + cache_examples=CACHE_EXAMPLES, + api_name=False, + ) + + gr.on( + triggers=[input_text.submit, btn.click], + fn=run_t2tt, + inputs=[input_text, source_language, target_language], + outputs=output_text, + api_name="t2tt", + ) + +with gr.Blocks() as demo_asr: + with gr.Row(): + with gr.Column(): + with gr.Group(): + input_audio = gr.Audio(label="Input speech", type="filepath") + target_language = gr.Dropdown( + label="Target language", + choices=ASR_TARGET_LANGUAGE_NAMES, + value=DEFAULT_TARGET_LANGUAGE, + ) + btn = gr.Button("Translate") + with gr.Column(): + output_text = gr.Textbox(label="Translated text") + + gr.Examples( + examples=[], + inputs=[input_audio, target_language], + outputs=output_text, + fn=run_asr, + cache_examples=CACHE_EXAMPLES, + api_name=False, + ) + + btn.click( + fn=run_asr, + inputs=[input_audio, target_language], + outputs=output_text, + api_name="asr", + ) + + +with gr.Blocks(css="style.css") as demo: + gr.Markdown(DESCRIPTION) + gr.DuplicateButton( + value="Duplicate Space for private use", + elem_id="duplicate-button", + visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1", + ) + + with gr.Tabs(): + with gr.Tab(label="S2ST"): + demo_s2st.render() + 
with gr.Tab(label="S2TT"): + demo_s2tt.render() + with gr.Tab(label="T2ST"): + demo_t2st.render() + with gr.Tab(label="T2TT"): + demo_t2tt.render() + with gr.Tab(label="ASR"): + demo_asr.render() + + +if __name__ == "__main__": + demo.queue(max_size=50).launch() diff --git a/seamless_communication/demo/m4tv2/lang_list.py b/seamless_communication/demo/m4tv2/lang_list.py new file mode 100644 index 0000000..75f562b --- /dev/null +++ b/seamless_communication/demo/m4tv2/lang_list.py @@ -0,0 +1,255 @@ +# Language dict +language_code_to_name = { + "afr": "Afrikaans", + "amh": "Amharic", + "arb": "Modern Standard Arabic", + "ary": "Moroccan Arabic", + "arz": "Egyptian Arabic", + "asm": "Assamese", + "ast": "Asturian", + "azj": "North Azerbaijani", + "bel": "Belarusian", + "ben": "Bengali", + "bos": "Bosnian", + "bul": "Bulgarian", + "cat": "Catalan", + "ceb": "Cebuano", + "ces": "Czech", + "ckb": "Central Kurdish", + "cmn": "Mandarin Chinese", + "cym": "Welsh", + "dan": "Danish", + "deu": "German", + "ell": "Greek", + "eng": "English", + "est": "Estonian", + "eus": "Basque", + "fin": "Finnish", + "fra": "French", + "gaz": "West Central Oromo", + "gle": "Irish", + "glg": "Galician", + "guj": "Gujarati", + "heb": "Hebrew", + "hin": "Hindi", + "hrv": "Croatian", + "hun": "Hungarian", + "hye": "Armenian", + "ibo": "Igbo", + "ind": "Indonesian", + "isl": "Icelandic", + "ita": "Italian", + "jav": "Javanese", + "jpn": "Japanese", + "kam": "Kamba", + "kan": "Kannada", + "kat": "Georgian", + "kaz": "Kazakh", + "kea": "Kabuverdianu", + "khk": "Halh Mongolian", + "khm": "Khmer", + "kir": "Kyrgyz", + "kor": "Korean", + "lao": "Lao", + "lit": "Lithuanian", + "ltz": "Luxembourgish", + "lug": "Ganda", + "luo": "Luo", + "lvs": "Standard Latvian", + "mai": "Maithili", + "mal": "Malayalam", + "mar": "Marathi", + "mkd": "Macedonian", + "mlt": "Maltese", + "mni": "Meitei", + "mya": "Burmese", + "nld": "Dutch", + "nno": "Norwegian Nynorsk", + "nob": "Norwegian Bokm\u00e5l", + "npi": "Nepali", + "nya": "Nyanja", + "oci": "Occitan", + "ory": "Odia", + "pan": "Punjabi", + "pbt": "Southern Pashto", + "pes": "Western Persian", + "pol": "Polish", + "por": "Portuguese", + "ron": "Romanian", + "rus": "Russian", + "slk": "Slovak", + "slv": "Slovenian", + "sna": "Shona", + "snd": "Sindhi", + "som": "Somali", + "spa": "Spanish", + "srp": "Serbian", + "swe": "Swedish", + "swh": "Swahili", + "tam": "Tamil", + "tel": "Telugu", + "tgk": "Tajik", + "tgl": "Tagalog", + "tha": "Thai", + "tur": "Turkish", + "ukr": "Ukrainian", + "urd": "Urdu", + "uzn": "Northern Uzbek", + "vie": "Vietnamese", + "xho": "Xhosa", + "yor": "Yoruba", + "yue": "Cantonese", + "zlm": "Colloquial Malay", + "zsm": "Standard Malay", + "zul": "Zulu", +} +LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()} + +# Source langs: S2ST / S2TT / ASR don't need source lang +# T2TT / T2ST use this +text_source_language_codes = [ + "afr", + "amh", + "arb", + "ary", + "arz", + "asm", + "azj", + "bel", + "ben", + "bos", + "bul", + "cat", + "ceb", + "ces", + "ckb", + "cmn", + "cym", + "dan", + "deu", + "ell", + "eng", + "est", + "eus", + "fin", + "fra", + "gaz", + "gle", + "glg", + "guj", + "heb", + "hin", + "hrv", + "hun", + "hye", + "ibo", + "ind", + "isl", + "ita", + "jav", + "jpn", + "kan", + "kat", + "kaz", + "khk", + "khm", + "kir", + "kor", + "lao", + "lit", + "lug", + "luo", + "lvs", + "mai", + "mal", + "mar", + "mkd", + "mlt", + "mni", + "mya", + "nld", + "nno", + "nob", + "npi", + "nya", + "ory", + "pan", + "pbt", + "pes", + "pol", + "por", + 
"ron", + "rus", + "slk", + "slv", + "sna", + "snd", + "som", + "spa", + "srp", + "swe", + "swh", + "tam", + "tel", + "tgk", + "tgl", + "tha", + "tur", + "ukr", + "urd", + "uzn", + "vie", + "yor", + "yue", + "zsm", + "zul", +] +TEXT_SOURCE_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in text_source_language_codes]) + +# Target langs: +# S2ST / T2ST +s2st_target_language_codes = [ + "eng", + "arb", + "ben", + "cat", + "ces", + "cmn", + "cym", + "dan", + "deu", + "est", + "fin", + "fra", + "hin", + "ind", + "ita", + "jpn", + "kor", + "mlt", + "nld", + "pes", + "pol", + "por", + "ron", + "rus", + "slk", + "spa", + "swe", + "swh", + "tel", + "tgl", + "tha", + "tur", + "ukr", + "urd", + "uzn", + "vie", +] +S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2st_target_language_codes]) +T2ST_TARGET_LANGUAGE_NAMES = S2ST_TARGET_LANGUAGE_NAMES + +# S2TT / T2TT / ASR +S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES +T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES +ASR_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES diff --git a/seamless_communication/demo/m4tv2/requirements.txt b/seamless_communication/demo/m4tv2/requirements.txt new file mode 100644 index 0000000..1ccc2a1 --- /dev/null +++ b/seamless_communication/demo/m4tv2/requirements.txt @@ -0,0 +1,5 @@ +gradio~=4.5.0 +omegaconf~=2.3.0 +torch~=2.1.0 +torchaudio~=2.1.0 +fairseq2~=0.2.0 diff --git a/seamless_communication/dev_requirements.txt b/seamless_communication/dev_requirements.txt new file mode 100644 index 0000000..b5d8c83 --- /dev/null +++ b/seamless_communication/dev_requirements.txt @@ -0,0 +1,7 @@ +audiocraft +black +flake8 +isort +mypy +pre-commit +pytest diff --git a/seamless_communication/docs/expressive/README.md b/seamless_communication/docs/expressive/README.md new file mode 100644 index 0000000..5b02ea2 --- /dev/null +++ b/seamless_communication/docs/expressive/README.md @@ -0,0 +1,200 @@ +# SeamlessExpressive + +SeamlessExpressive model consists of two main modules: (1) Prosody UnitY2, which is a prosody-aware speech-to-unit translation model based on UnitY2 architecture; and (2) PRETSSEL, which is a unit-to-speech model featuring cross-lingual expressivity preservation. + +![SeamlessExpressive architectures](seamlessexpressive_arch.jpg) + + +## Prosody UnitY2 + +Prosody UnitY2 is an expressive speech-to-unit translation model, injecting expressivity embedding from PRETSSEL into the unit generation. It could transfer phrase-level prosody such as speech rate or pauses. + + +## PRETSSEL + +**P**aralinguistic **RE**presentation-based +**T**extle**SS** acoustic mod**EL** (PRETSSEL) is an expressive unit-to-speech generator, and it can efficiently disentangle semantic and expressivity components from speech. It transfers utterance-level expressivity like the style of one's voice. + +# Benchmark Datasets + +## mExpresso (Multilingual Expresso) + +mExpresso is an expressive S2ST dataset that includes seven styles of read speech (i.e., default, happy, sad, confused, enunciated, whisper and laughing) between English and five other languages -- French, German, Italian, Mandarin and Spanish. We create the dataset by expanding a subset of read speech in [Expresso Dataset](https://github.com/facebookresearch/textlesslib/tree/main/examples/expresso/dataset). 
We first translate the English transcriptions into other languages, including the emphasis markers in the transcription, and then gender-matched bilingual speakers read the translations in the style suggested by the markers. + +We are currently open-sourcing the text translations for the other languages to enable evaluation of the English-to-other directions. We will open source the audio files in the near future. + +The text translations in the other languages can be [downloaded](https://dl.fbaipublicfiles.com/seamless/datasets/mexpresso_text/mexpresso_text.tar). + +### Statistics of mExpresso +| language pair | subset | # items | English duration (hr) | # speakers | +|---------------|--------|---------|-----------------------|------------| +|eng-cmn| dev | 2369 | 2.1 | 1 | +| | test | 5003 | 4.8 | 2 | +|eng-deu| dev | 4420 | 3.9 | 2 | +| | test | 5733 | 5.6 | 2 | +|eng-fra| dev | 4770 | 4.2 | 2 | +| | test | 5742 | 5.6 | 2 | +|eng-ita| dev | 4413 | 3.9 | 2 | +| | test | 5756 | 5.7 | 2 | +|eng-spa| dev | 4758 | 4.2 | 2 | +| | test | 5693 | 5.5 | 2 | + +### Create mExpresso S2T dataset by downloading and combining with English Expresso +Run the following command to create the English-to-other-languages speech-to-text dataset from scratch. It will first download the English Expresso dataset, downsample the audio to 16 kHz, and join it with the text translations to form the manifest. + +```bash +python3 -m seamless_communication.cli.expressivity.data.prepare_mexpresso \ + <output_dir> +``` + +The output manifests will be located at `<output_dir>/{dev,test}_mexpresso_eng_{spa,fra,deu,ita,cmn}.tsv` + + +# Automatic evaluation + +Python package dependencies (on top of seamless_communication, coming from stopes pipelines): +* Unidecode +* scipy +* phonemizer +* s3prl +* syllables +* ipapy +* pkuseg +* nltk +* fire +* inflect + +```bash +pip install Unidecode scipy phonemizer s3prl syllables ipapy pkuseg nltk fire inflect +``` + +As described in Section 4.3, we use the following automatic metrics: + +1. **ASR-BLEU**: refer to `/src/seamless_communication/cli/eval_utils` to see how the OpenAI Whisper ASR model is used to extract transcriptions from the generated audio. + +2. **Vocal Style Similarity**: refer to [stopes/eval/vocal_style_similarity](https://github.com/facebookresearch/stopes/tree/main/stopes/eval/vocal_style_similarity) for implementation details. + +3. **AutoPCP**: refer to [stopes/eval/auto_pcp](https://github.com/facebookresearch/stopes/tree/main/stopes/eval/auto_pcp) for implementation details. + +4. **Pause and Rate scores**: refer to [stopes/eval/local_prosody](https://github.com/facebookresearch/stopes/tree/main/stopes/eval/local_prosody) for implementation details. The Rate score is the Spearman correlation of the syllable speech rate between the source and the predicted speech. The Pause score is the weighted mean joint score produced by the `stopes/eval/local_prosody/compare_utterances.py` script from the stopes repo. + +## Evaluation results: mExpresso + +Please see the [mExpresso section](#mexpresso-multilingual-expresso) for how to download the evaluation data. + +*Important Notes*: + +* We used empirically chosen duration factors per target language for the best perceptual quality: 1.0 (default) for cmn, spa, ita; 1.1 for deu; 1.2 for fra. The same settings were used to report results in the "Seamless: Multilingual Expressive and Streaming Speech Translation" paper.
+ +* Results here slightly differ from the ones shown in the paper due to several discrepancies in the pipeline: the results reported here use a pipeline with the fairseq2 backend for model inference, and the pipeline includes watermarking. + +| Language | Partition | ASR-BLEU | Vocal Style Sim | AutoPCP | Pause | Rate | +|----------|-----------|----------|-------------|---------|-------|------| +| eng_cmn | dev | 26.080 | 0.207 | 3.168 | 0.236 | 0.538 | +| eng_deu | dev | 36.940 | 0.261 | 3.298 | 0.319 | 0.717 | +| eng_fra | dev | 37.780 | 0.231 | 3.285 | 0.331 | 0.682 | +| eng_ita | dev | 40.170 | 0.226 | 3.322 | 0.388 | 0.734 | +| eng_spa | dev | 42.400 | 0.228 | 3.379 | 0.332 | 0.702 | +| eng_cmn | test | 23.320 | 0.249 | 2.984 | 0.385 | 0.522 | +| eng_deu | test | 27.780 | 0.290 | 3.117 | 0.483 | 0.717 | +| eng_fra | test | 38.360 | 0.270 | 3.117 | 0.506 | 0.663 | +| eng_ita | test | 38.020 | 0.274 | 3.130 | 0.523 | 0.686 | +| eng_spa | test | 42.920 | 0.274 | 3.183 | 0.508 | 0.675 | + +### Step-by-step evaluation + +Pre-requisite: all steps described here assume that generation/inference has been completed following [these steps](../../README.md#seamlessexpressive-inference). + +For the stopes installation, please refer to [stopes/eval](https://github.com/facebookresearch/stopes/tree/main/stopes/eval). + +The resulting directory of generated outputs: +```bash +export SPLIT="dev_mexpresso_eng_spa" # example, change for your split +export TGT_LANG="spa" +export SRC_LANG="eng" +export GENERATED_DIR="path_to_generated_output_for_given_data_split" +export GENERATED_TSV="generate-${SPLIT}.tsv" +export STOPES_ROOT="path_to_stopes_code_repo" +export SC_ROOT="path_to_this_repo" +``` + +**ASR-BLEU evaluation** + +```bash +python ${SC_ROOT}/src/seamless_communication/cli/expressivity/evaluate/run_asr_bleu.py \ + --generation_dir_path=${GENERATED_DIR} \ + --generate_tsv_filename=generate-${SPLIT}.tsv \ + --tgt_lang=${TGT_LANG} +``` +* `generate-${SPLIT}.tsv` is the expected output of the inference step described in the pre-requisite. + +After completion, the resulting ASR-BLEU score is written to `${GENERATED_DIR}/s2st_asr_bleu_normalized.json`. + +**Vocal Style Similarity** + +Download and set the fine-tuned WavLM checkpoint path (`${SPEECH_ENCODER_MODEL_PATH}`) as described in the [stopes README](https://github.com/facebookresearch/stopes/tree/main/stopes/eval/vocal_style_similarity#pre-requisites) to reproduce our vocal style similarity eval. + +```bash +python -m stopes.modules +vocal_style_similarity=base \ + launcher.cluster=local \ + vocal_style_similarity.model_type=valle \ + +vocal_style_similarity.model_path=${SPEECH_ENCODER_MODEL_PATH} \ + +vocal_style_similarity.input_file=${GENERATED_DIR}/${GENERATED_TSV} \ + +vocal_style_similarity.output_file=${GENERATED_DIR}/vocal_style_sim_result.txt \ + vocal_style_similarity.named_columns=true \ + vocal_style_similarity.src_audio_column=src_audio \ + vocal_style_similarity.tgt_audio_column=hypo_audio +``` +* We report the average over all utterance scores written to `${GENERATED_DIR}/vocal_style_sim_result.txt`. + +**AutoPCP** + +```bash +python -m stopes.modules +compare_audios=AutoPCP_multilingual_v2 \ + launcher.cluster=local \ + +compare_audios.input_file=${GENERATED_DIR}/${GENERATED_TSV} \ + compare_audios.src_audio_column=src_audio \ + compare_audios.tgt_audio_column=hypo_audio \ + +compare_audios.named_columns=true \ + +compare_audios.output_file=${GENERATED_DIR}/autopcp_result.txt +``` +* We report the average over all utterance scores written to `${GENERATED_DIR}/autopcp_result.txt`.
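Both metric notes above report a corpus-level number that is simply the mean over the per-utterance scores. A minimal sketch of that aggregation step is shown below; the helper is hypothetical and assumes each result file holds one utterance per line with the numeric score in the last column, so adjust the parsing to the actual stopes output layout.

```python
# Hypothetical aggregation helper, not part of the stopes or seamless_communication CLIs.
# Assumes one utterance per line with the numeric score in the last whitespace-separated column.
from pathlib import Path


def mean_utterance_score(result_path: str) -> float:
    scores = [
        float(line.split()[-1])
        for line in Path(result_path).read_text().splitlines()
        if line.strip()
    ]
    return sum(scores) / len(scores)


print(mean_utterance_score("vocal_style_sim_result.txt"))
print(mean_utterance_score("autopcp_result.txt"))
```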
+ +**Pause and Rate** + +This stage includes 3 steps: (1) src lang annotation, (2) tgt lang annotation, (3) pairwise comparison + +```bash +# src lang pause&rate annotation +python ${STOPES_ROOT}/stopes/eval/local_prosody/annotate_utterances.py \ + +data_path=${GENERATED_DIR}/${GENERATED_TSV} \ + +result_path=${GENERATED_DIR}/${SRC_LANG}_speech_rate_pause_annotation.tsv \ + +audio_column=src_audio \ + +text_column=src_text \ + +speech_units=[syllable] \ + +vad=true \ + +net=true \ + +lang=$SRC_LANG \ + +forced_aligner=fairseq2_nar_t2u_aligner + +# tgt lang pause&rate annotation +python ${STOPES_ROOT}/stopes/eval/local_prosody/annotate_utterances.py \ + +data_path=${GENERATED_DIR}/${GENERATED_TSV} \ + +result_path=${GENERATED_DIR}/${TGT_LANG}_speech_rate_pause_annotation.tsv \ + +audio_column=hypo_audio \ + +text_column=s2t_out \ + +speech_units=[syllable] \ + +vad=true \ + +net=true \ + +lang=$TGT_LANG \ + +forced_aligner=fairseq2_nar_t2u_aligner + +# pair wise comparison +python ${STOPES_ROOT}/stopes/eval/local_prosody/compare_utterances.py \ + +src_path=${GENERATED_DIR}/${SRC_LANG}_speech_rate_pause_annotation.tsv \ + +tgt_path=${GENERATED_DIR}/${TGT_LANG}_speech_rate_pause_annotation.tsv \ + +result_path=${GENERATED_DIR}/${SRC_LANG}_${TGT_LANG}_pause_scores.tsv \ + +pause_min_duration=0.1 +``` + +* For Rate reporting, please see the aggregation function `get_rate` in `${SC_ROOT}/src/seamless_communication/cli/expressivity/evaluate/post_process_pauserate.py`; +* For Pause reporting, please see the aggregation function `get_pause` in `${SC_ROOT}/src/seamless_communication/cli/expressivity/evaluate/post_process_pauserate.py`. diff --git a/seamless_communication/docs/expressive/seamless_align_expressive_README.md b/seamless_communication/docs/expressive/seamless_align_expressive_README.md new file mode 100644 index 0000000..2755e34 --- /dev/null +++ b/seamless_communication/docs/expressive/seamless_align_expressive_README.md @@ -0,0 +1,27 @@ +# SeamlessAlignExpressive + +Building upon our past work with WikiMatrix, CCMatrix, NLLB, SpeechMatrix and SeamlessM4T, we’re introducing the first expressive speech alignment procedure. Starting with raw data, the expressive alignment procedure automatically discovers pairs of audio segments sharing not only the same meaning, but the same overall expressivity. To showcase this procedure, we are making metadata available to create a benchmarking dataset called SeamlessAlignExpressive, that can be used to validate the quality of our alignment method. SeamlessAlignExpressive is the first large-scale collection of multilingual audio alignments for expressive translation for benchmarking. + +## Format + +The metadata files are space separated, gzip files. Each file corresponds to one alignment direction. File naming convention: we use 2 letters with an 'A': e.g. `frA`, `enA`, `deA`. + +For example, the direction `deA-enA` corresponds to information for reconstructing German speech to English speech alignments. + +Each line has 9 columns. + +The columns correspond to: + - `direction`: direction, e.g. `enA-deA` + - `side`: side, e.g. 
`enA` or `deA` + - `line_no`: alignment number + - `cc_warc`: The public CC warc file reference containing the public audio url + - `duration`: original file duration + - `audio_speech_segment_url`: public audio reference + - `audio_speech_start_frame`: start frame when the audio is resampled at 16kHz + - `audio_speech_end_frame`: end frame when the audio is resampled at 16kHz + - `laser_score`: score of the alignment + + +## Data + +[deA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_expressive/seamless.dataset.metadata.public.deA-enA.tsv.gz) [enA-esA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_expressive/seamless.dataset.metadata.public.enA-esA.tsv.gz) [enA-frA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_expressive/seamless.dataset.metadata.public.enA-frA.tsv.gz) [enA-itA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_expressive/seamless.dataset.metadata.public.enA-itA.tsv.gz) [enA-zhA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_expressive/seamless.dataset.metadata.public.enA-zhA.tsv.gz) diff --git a/seamless_communication/docs/expressive/seamlessexpressive_arch.jpg b/seamless_communication/docs/expressive/seamlessexpressive_arch.jpg new file mode 100644 index 0000000..909fb68 Binary files /dev/null and b/seamless_communication/docs/expressive/seamlessexpressive_arch.jpg differ diff --git a/seamless_communication/docs/m4t/README.md b/seamless_communication/docs/m4t/README.md new file mode 100644 index 0000000..f98a626 --- /dev/null +++ b/seamless_communication/docs/m4t/README.md @@ -0,0 +1,382 @@ +# SeamlessM4T +SeamlessM4T is our foundational all-in-one **M**assively **M**ultilingual and **M**ultimodal **M**achine **T**ranslation model delivering high-quality translation for speech and text in nearly 100 languages. + +SeamlessM4T models support: +- :microphone: 101 languages for speech input. +- :speech_balloon: 96 Languages for text input/output. +- :speaker: 35 languages for speech output. + +This unified model enables multiple tasks without relying on multiple separate models: +- Speech-to-speech translation (S2ST) +- Speech-to-text translation (S2TT) +- Text-to-speech translation (T2ST) +- Text-to-text translation (T2TT) +- Automatic speech recognition (ASR). + +> [!NOTE] +> SeamlessM4T v2 and v1 are also supported in the 🤗 Transformers library, more on it [in the dedicated section below](#transformers-usage). + +## SeamlessM4T v1 +The v1 version of SeamlessM4T is a multitask adaptation of the *UnitY* architecture [(Inaguma et al., 2023)](https://aclanthology.org/2023.acl-long.872/). +*UnitY* is a two-pass direct S2ST architecture which first generates textual representations and subsequently predicts discrete acoustic units. + + +## SeamlessM4T v2 +The v2 version of SeamlessM4T is a multitask adaptation of our novel *UnitY2* architecture. +*Unity2* with its hierarchical character-to-unit upsampling and non-autoregressive text-to-unit decoding considerably improves over SeamlessM4T v1 in quality and inference speed. 
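The character-to-unit upsampling mentioned above can be pictured as a length-regulator step, as used in non-autoregressive speech models: each character state is expanded according to a predicted duration before the non-autoregressive unit decoder runs. The toy sketch below only illustrates that general idea and is not the actual UnitY2 implementation.

```python
import torch


def upsample_by_duration(char_states: torch.Tensor, durations: torch.Tensor) -> torch.Tensor:
    """Toy length regulator: repeat each character state by its predicted unit duration."""
    # char_states: (num_chars, dim); durations: (num_chars,) integer unit counts
    return torch.repeat_interleave(char_states, durations, dim=0)


# Three character states expanded to 1 + 3 + 2 = 6 unit-level states.
states = torch.randn(3, 8)
durations = torch.tensor([1, 3, 2])
print(upsample_by_duration(states, durations).shape)  # torch.Size([6, 8])
```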
+ +![SeamlessM4T architectures](seamlessm4t_arch.svg) + +## SeamlessM4T models +| Model Name | #params | checkpoint | metrics | +| ------------------ | ------- | --------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------ | +| SeamlessM4T-Large v2 | 2.3B | [🤗 Model card](https://huggingface.co/facebook/seamless-m4t-v2-large) - [checkpoint](https://huggingface.co/facebook/seamless-m4t-v2-large/resolve/main/seamlessM4T_v2_large.pt) | [metrics](https://dl.fbaipublicfiles.com/seamless/metrics/seamlessM4T_large_v2.zip) | +| SeamlessM4T-Large (v1) | 2.3B | [🤗 Model card](https://huggingface.co/facebook/seamless-m4t-large) - [checkpoint](https://huggingface.co/facebook/seamless-m4t-large/resolve/main/multitask_unity_large.pt) | [metrics](https://dl.fbaipublicfiles.com/seamless/metrics/seamlessM4T_large.zip) | +| SeamlessM4T-Medium (v1) | 1.2B | [🤗 Model card](https://huggingface.co/facebook/seamless-m4t-medium) - [checkpoint](https://huggingface.co/facebook/seamless-m4t-medium/resolve/main/multitask_unity_medium.pt) | [metrics](https://dl.fbaipublicfiles.com/seamless/metrics/seamlessM4T_medium.zip) | + +We provide the extensive evaluation results of seamlessM4T-Large and SeamlessM4T-Medium reported in the paper (as averages) in the `metrics` files above. + +The evaluation data ids for FLEURS, CoVoST2 and CVSS-C can be found [here](https://dl.fbaipublicfiles.com/seamless/metrics/evaluation_data_ids.zip) + + +## Using SeamlessM4T models + +### `m4t_predict` with CLI: +Inference is run with the CLI, from the root directory of the repository. + +The model can be specified with `--model_name` `seamlessM4T_v2_large`, `seamlessM4T_large` or `seamlessM4T_medium`: + +```bash +# S2ST: +m4t_predict --task s2st --tgt_lang --output_path --model_name seamlessM4T_v2_large + +# S2T: +m4t_predict --task s2tt --tgt_lang --model_name seamlessM4T_v2_large + +# T2TT: +m4t_predict --task t2tt --tgt_lang --src_lang --model_name seamlessM4T_v2_large + +# T2ST: +m4t_predict --task t2st --tgt_lang --src_lang --output_path --model_name seamlessM4T_v2_large + +# ASR: +m4t_predict --task asr --tgt_lang --model_name seamlessM4T_v2_large + +``` +### Inference with `Translator`: +Inference calls for the `Translator` object instantiated with a multitask UnitY or UnitY2 model with the options: +- [`seamlessM4T_v2_large`](https://huggingface.co/facebook/seamless-m4t-v2-large) +- [`seamlessM4T_large`](https://huggingface.co/facebook/seamless-m4t-large) +- [`seamlessM4T_medium`](https://huggingface.co/facebook/seamless-m4t-medium) + +and a vocoder: +- `vocoder_v2` for `seamlessM4T_v2_large`. +- `vocoder_36langs` for `seamlessM4T_large` or `seamlessM4T_medium`. + +```python +import torch +from seamless_communication.inference import Translator + + +# Initialize a Translator object with a multitask model, vocoder on the GPU. +translator = Translator("seamlessM4T_large", "vocoder_36langs", torch.device("cuda:0"), torch.float16) +``` + +Now `predict()` can be used to run inference as many times on any of the supported tasks. 
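The snippets below assume that `text_generation_opts` and `unit_generation_opts` have already been constructed. A minimal sketch of that step is given here, assuming `SequenceGeneratorOptions` is exposed from `seamless_communication.inference` as in the repo's evaluation CLI; the values shown are illustrative rather than prescribed defaults.

```python
from seamless_communication.inference import SequenceGeneratorOptions

# Beam-search options for the first-pass text decoder and the second-pass unit decoder.
# soft_max_seq_len bounds the generated length relative to the input length.
text_generation_opts = SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(1, 200))
unit_generation_opts = SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(25, 50))
```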
+ +Given an input audio with `` or an input text `` in ``, +we first set the `text_generation_opts`, `unit_generation_opts` and then translate into `` as follows: + +**S2ST and T2ST (speech output):** + +```python +# S2ST +text_output, speech_output = translator.predict( + input=, + task_str="S2ST", + tgt_lang=, + text_generation_opts=text_generation_opts, + unit_generation_opts=unit_generation_opts +) + +# T2ST +text_output, speech_output = translator.predict( + input=, + task_str="T2ST", + tgt_lang=, + src_lang=, + text_generation_opts=text_generation_opts, + unit_generation_opts=unit_generation_opts +) + +``` +Note that `` must be specified for T2ST. + +The generated units are synthesized and the output audio file is saved with: + +```python +# Save the translated audio output: +import torchaudio +torchaudio.save( + , + speech_output.audio_wavs[0][0].cpu(), + sample_rate=speech_output.sample_rate, +) +``` +**S2TT, T2TT and ASR (text output):** + +```python +# S2TT +text_output, _ = translator.predict( + input=, + task_str="S2TT", + tgt_lang=, + text_generation_opts=text_generation_opts, + unit_generation_opts=None +) + +# ASR +# This is equivalent to S2TT with `=`. + text_output, _ = translator.predict( + input=, + task_str="ASR", + tgt_lang=, + text_generation_opts=text_generation_opts, + unit_generation_opts=None +) + +# T2TT +text_output, _ = translator.predict( + input=, + task_str="T2TT", + tgt_lang=, + src_lang=, + text_generation_opts=text_generation_opts, + unit_generation_opts=None +) + +``` +Note that `` must be specified for T2TT + +To reproduce the seamless papers results ([v1](https://arxiv.org/abs/2308.11596) or [v2](https://arxiv.org/abs/2312.05187)), or to evaluate using the same metrics over your own test sets, please check out the [Evaluation README here](../../src/seamless_communication/cli/m4t/evaluate/README.md). + +## Inference with 🤗 `Transformers` + +SeamlessM4T is available in the 🤗 Transformers library, requiring minimal dependencies. Steps to get started: + +1. First install the 🤗 [Transformers library](https://github.com/huggingface/transformers) from main and [sentencepiece](https://github.com/google/sentencepiece): + +``` +pip install git+https://github.com/huggingface/transformers.git sentencepiece +``` + +2. Run the following Python code to generate speech samples. Here the target language is Russian: + +```py +import torchaudio +from transformers import AutoProcessor, SeamlessM4Tv2Model + +processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large") +model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large") + +# from text +text_inputs = processor(text="Hello, my dog is cute", src_lang="eng", return_tensors="pt") +audio_array_from_text = model.generate(**text_inputs, tgt_lang="rus")[0].cpu().squeeze() + +# from audio +audio, orig_freq = torchaudio.load("https://www2.cs.uic.edu/~i101/SoundFiles/preamble10.wav") +audio = torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=16_000) # must be a 16 kHz waveform array +audio_inputs = processor(audios=audio, return_tensors="pt") +audio_array_from_audio = model.generate(**audio_inputs, tgt_lang="rus")[0].cpu().squeeze() +``` + +3. Listen to the audio samples either in an ipynb notebook: + +```py +from IPython.display import Audio + +sample_rate = model.config.sampling_rate +Audio(audio_array_from_text, rate=sample_rate) +Audio(audio_array_from_audio, rate=sample_rate) +``` + +Or save them as a `.wav` file using a third-party library, e.g. 
`torchaudio`: + +```py +torchaudio.save( + <path_to_save_audio_file>, + audio_array_from_audio, # or audio_array_from_text + sample_rate=model.config.sampling_rate, +) +``` +2. (bis) To run inference for text-generating tasks (T2TT, ASR or S2TT), it is recommended to use [dedicated models](https://huggingface.co/docs/transformers/main/en/model_doc/seamless_m4t_v2#1-use-dedicated-models). With that, only the required sub-modules will be loaded. For example, text-to-text translation from English to Bulgarian is performed as follows: +```py +from transformers import AutoProcessor, SeamlessM4Tv2ForTextToText +processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large") +model = SeamlessM4Tv2ForTextToText.from_pretrained("facebook/seamless-m4t-v2-large") + +src_lang, tgt_lang = "eng", "bul" +text_inputs = processor(text='Hello, my dog is cute', src_lang=src_lang, return_tensors="pt") +decoder_input_ids = model.generate(**text_inputs, tgt_lang=tgt_lang)[0].tolist() +translated_text = processor.decode(decoder_input_ids, skip_special_tokens=True) +print(f"{tgt_lang}: {translated_text}") + +``` + +> [!NOTE] +> For more details on using the SeamlessM4T model for inference with the 🤗 Transformers library, refer to the +[SeamlessM4T v2 docs](https://huggingface.co/docs/transformers/main/en/model_doc/seamless_m4t_v2), the +[SeamlessM4T v1 docs](https://huggingface.co/docs/transformers/main/en/model_doc/seamless_m4t) or to this hands-on [Google Colab](https://colab.research.google.com/github/ylacombe/scripts_and_notebooks/blob/main/v2_seamless_m4t_hugging_face.ipynb). + + +## Finetuning SeamlessM4T models +Please check out the [Finetuning README here](../../src/seamless_communication/cli/m4t/finetune/README.md). + +## Supported Languages: + +Listed below are the languages supported by SeamlessM4T-large (v1/v2). +The `source` column specifies whether a language is supported as source speech (`Sp`) and/or source text (`Tx`). +The `target` column specifies whether a language is supported as target speech (`Sp`) and/or target text (`Tx`).
+ + +| code | language | script | Source | Target | +| ---- | ---------------------- | ---------- | ------ | ------ | +| afr | Afrikaans | Latn | Sp, Tx | Tx | +| amh | Amharic | Ethi | Sp, Tx | Tx | +| arb | Modern Standard Arabic | Arab | Sp, Tx | Sp, Tx | +| ary | Moroccan Arabic | Arab | Sp, Tx | Tx | +| arz | Egyptian Arabic | Arab | Sp, Tx | Tx | +| asm | Assamese | Beng | Sp, Tx | Tx | +| ast | Asturian | Latn | Sp | \-- | +| azj | North Azerbaijani | Latn | Sp, Tx | Tx | +| bel | Belarusian | Cyrl | Sp, Tx | Tx | +| ben | Bengali | Beng | Sp, Tx | Sp, Tx | +| bos | Bosnian | Latn | Sp, Tx | Tx | +| bul | Bulgarian | Cyrl | Sp, Tx | Tx | +| cat | Catalan | Latn | Sp, Tx | Sp, Tx | +| ceb | Cebuano | Latn | Sp, Tx | Tx | +| ces | Czech | Latn | Sp, Tx | Sp, Tx | +| ckb | Central Kurdish | Arab | Sp, Tx | Tx | +| cmn | Mandarin Chinese | Hans | Sp, Tx | Sp, Tx | +| cmn_Hant | Mandarin Chinese | Hant | Sp, Tx | Sp, Tx | +| cym | Welsh | Latn | Sp, Tx | Sp, Tx | +| dan | Danish | Latn | Sp, Tx | Sp, Tx | +| deu | German | Latn | Sp, Tx | Sp, Tx | +| ell | Greek | Grek | Sp, Tx | Tx | +| eng | English | Latn | Sp, Tx | Sp, Tx | +| est | Estonian | Latn | Sp, Tx | Sp, Tx | +| eus | Basque | Latn | Sp, Tx | Tx | +| fin | Finnish | Latn | Sp, Tx | Sp, Tx | +| fra | French | Latn | Sp, Tx | Sp, Tx | +| fuv | Nigerian Fulfulde | Latn | Sp, Tx | Tx | +| gaz | West Central Oromo | Latn | Sp, Tx | Tx | +| gle | Irish | Latn | Sp, Tx | Tx | +| glg | Galician | Latn | Sp, Tx | Tx | +| guj | Gujarati | Gujr | Sp, Tx | Tx | +| heb | Hebrew | Hebr | Sp, Tx | Tx | +| hin | Hindi | Deva | Sp, Tx | Sp, Tx | +| hrv | Croatian | Latn | Sp, Tx | Tx | +| hun | Hungarian | Latn | Sp, Tx | Tx | +| hye | Armenian | Armn | Sp, Tx | Tx | +| ibo | Igbo | Latn | Sp, Tx | Tx | +| ind | Indonesian | Latn | Sp, Tx | Sp, Tx | +| isl | Icelandic | Latn | Sp, Tx | Tx | +| ita | Italian | Latn | Sp, Tx | Sp, Tx | +| jav | Javanese | Latn | Sp, Tx | Tx | +| jpn | Japanese | Jpan | Sp, Tx | Sp, Tx | +| kam | Kamba | Latn | Sp | \-- | +| kan | Kannada | Knda | Sp, Tx | Tx | +| kat | Georgian | Geor | Sp, Tx | Tx | +| kaz | Kazakh | Cyrl | Sp, Tx | Tx | +| kea | Kabuverdianu | Latn | Sp | \-- | +| khk | Halh Mongolian | Cyrl | Sp, Tx | Tx | +| khm | Khmer | Khmr | Sp, Tx | Tx | +| kir | Kyrgyz | Cyrl | Sp, Tx | Tx | +| kor | Korean | Kore | Sp, Tx | Sp, Tx | +| lao | Lao | Laoo | Sp, Tx | Tx | +| lit | Lithuanian | Latn | Sp, Tx | Tx | +| ltz | Luxembourgish | Latn | Sp | \-- | +| lug | Ganda | Latn | Sp, Tx | Tx | +| luo | Luo | Latn | Sp, Tx | Tx | +| lvs | Standard Latvian | Latn | Sp, Tx | Tx | +| mai | Maithili | Deva | Sp, Tx | Tx | +| mal | Malayalam | Mlym | Sp, Tx | Tx | +| mar | Marathi | Deva | Sp, Tx | Tx | +| mkd | Macedonian | Cyrl | Sp, Tx | Tx | +| mlt | Maltese | Latn | Sp, Tx | Sp, Tx | +| mni | Meitei | Beng | Sp, Tx | Tx | +| mya | Burmese | Mymr | Sp, Tx | Tx | +| nld | Dutch | Latn | Sp, Tx | Sp, Tx | +| nno | Norwegian Nynorsk | Latn | Sp, Tx | Tx | +| nob | Norwegian Bokmål | Latn | Sp, Tx | Tx | +| npi | Nepali | Deva | Sp, Tx | Tx | +| nya | Nyanja | Latn | Sp, Tx | Tx | +| oci | Occitan | Latn | Sp | \-- | +| ory | Odia | Orya | Sp, Tx | Tx | +| pan | Punjabi | Guru | Sp, Tx | Tx | +| pbt | Southern Pashto | Arab | Sp, Tx | Tx | +| pes | Western Persian | Arab | Sp, Tx | Sp, Tx | +| pol | Polish | Latn | Sp, Tx | Sp, Tx | +| por | Portuguese | Latn | Sp, Tx | Sp, Tx | +| ron | Romanian | Latn | Sp, Tx | Sp, Tx | +| rus | Russian | Cyrl | Sp, Tx | Sp, Tx | +| slk | Slovak | Latn | Sp, Tx | Sp, Tx | 
+| slv | Slovenian | Latn | Sp, Tx | Tx | +| sna | Shona | Latn | Sp, Tx | Tx | +| snd | Sindhi | Arab | Sp, Tx | Tx | +| som | Somali | Latn | Sp, Tx | Tx | +| spa | Spanish | Latn | Sp, Tx | Sp, Tx | +| srp | Serbian | Cyrl | Sp, Tx | Tx | +| swe | Swedish | Latn | Sp, Tx | Sp, Tx | +| swh | Swahili | Latn | Sp, Tx | Sp, Tx | +| tam | Tamil | Taml | Sp, Tx | Tx | +| tel | Telugu | Telu | Sp, Tx | Sp, Tx | +| tgk | Tajik | Cyrl | Sp, Tx | Tx | +| tgl | Tagalog | Latn | Sp, Tx | Sp, Tx | +| tha | Thai | Thai | Sp, Tx | Sp, Tx | +| tur | Turkish | Latn | Sp, Tx | Sp, Tx | +| ukr | Ukrainian | Cyrl | Sp, Tx | Sp, Tx | +| urd | Urdu | Arab | Sp, Tx | Sp, Tx | +| uzn | Northern Uzbek | Latn | Sp, Tx | Sp, Tx | +| vie | Vietnamese | Latn | Sp, Tx | Sp, Tx | +| xho | Xhosa | Latn | Sp | \-- | +| yor | Yoruba | Latn | Sp, Tx | Tx | +| yue | Cantonese | Hant | Sp, Tx | Tx | +| zlm | Colloquial Malay | Latn | Sp | \-- | +| zsm | Standard Malay | Latn | Tx | Tx | +| zul | Zulu | Latn | Sp, Tx | Tx | + + +Note that seamlessM4T-medium supports 200 languages in the text modality, and is based on NLLB-200 (see full list in [asset card](src/seamless_communication/cards/unity_nllb-200.yaml)) + +## Citation +For *UnitY*, please cite : +```bibtex +@inproceedings{inaguma-etal-2023-unity, + title="{U}nit{Y}: Two-pass Direct Speech-to-speech Translation with Discrete Units", + author="Inaguma, Hirofumi and Popuri, Sravya and Kulikov, Ilia and Chen, Peng-Jen and Wang, Changhan and Chung, Yu-An and Tang, Yun and Lee, Ann and Watanabe, Shinji and Pino, Juan", + booktitle="Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", + year="2023", + url="https://aclanthology.org/2023.acl-long.872", +} +``` + +For SeamlessM4T v1, please cite : +```bibtex +@article{seamlessm4t2023, + title={SeamlessM4T: Massively Multilingual \& Multimodal Machine Translation}, + author={{Seamless Communication}, Lo\"{i}c Barrault, Yu-An Chung, Mariano Cora Meglioli, David Dale, Ning Dong, Paul-Ambroise Duquenne, Hady Elsahar, Hongyu Gong, Kevin Heffernan, John Hoffman, Christopher Klaiber, Pengwei Li, Daniel Licht, Jean Maillard, Alice Rakotoarison, Kaushik Ram Sadagopan, Guillaume Wenzek, Ethan Ye, Bapi Akula, Peng-Jen Chen, Naji El Hachem, Brian Ellis, Gabriel Mejia Gonzalez, Justin Haaheim, Prangthip Hansanti, Russ Howes, Bernie Huang, Min-Jae Hwang, Hirofumi Inaguma, Somya Jain, Elahe Kalbassi, Amanda Kallet, Ilia Kulikov, Janice Lam, Daniel Li, Xutai Ma, Ruslan Mavlyutov, Benjamin Peloquin, Mohamed Ramadan, Abinesh Ramakrishnan, Anna Sun, Kevin Tran, Tuan Tran, Igor Tufanov, Vish Vogeti, Carleigh Wood, Yilin Yang, Bokai Yu, Pierre Andrews, Can Balioglu, Marta R. 
Costa-juss\`{a} \footnotemark[3], Onur \,{C}elebi,Maha Elbayad,Cynthia Gao, Francisco Guzm\'an, Justine Kao, Ann Lee, Alexandre Mourachko, Juan Pino, Sravya Popuri, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Paden Tomasello, Changhan Wang, Jeff Wang, Skyler Wang}, + journal={ArXiv}, + year={2023} +} +``` + +For SeamlessM4T v2, please cite : +```bibtex +@inproceedings{seamless2023, + title="Seamless: Multilingual Expressive and Streaming Speech Translation", + author="{Seamless Communication}, Lo{\"i}c Barrault, Yu-An Chung, Mariano Coria Meglioli, David Dale, Ning Dong, Mark Duppenthaler, Paul-Ambroise Duquenne, Brian Ellis, Hady Elsahar, Justin Haaheim, John Hoffman, Min-Jae Hwang, Hirofumi Inaguma, Christopher Klaiber, Ilia Kulikov, Pengwei Li, Daniel Licht, Jean Maillard, Ruslan Mavlyutov, Alice Rakotoarison, Kaushik Ram Sadagopan, Abinesh Ramakrishnan, Tuan Tran, Guillaume Wenzek, Yilin Yang, Ethan Ye, Ivan Evtimov, Pierre Fernandez, Cynthia Gao, Prangthip Hansanti, Elahe Kalbassi, Amanda Kallet, Artyom Kozhevnikov, Gabriel Mejia, Robin San Roman, Christophe Touret, Corinne Wong, Carleigh Wood, Bokai Yu, Pierre Andrews, Can Balioglu, Peng-Jen Chen, Marta R. Costa-juss{\`a}, Maha Elbayad, Hongyu Gong, Francisco Guzm{\'a}n, Kevin Heffernan, Somya Jain, Justine Kao, Ann Lee, Xutai Ma, Alex Mourachko, Benjamin Peloquin, Juan Pino, Sravya Popuri, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Anna Sun, Paden Tomasello, Changhan Wang, Jeff Wang, Skyler Wang, Mary Williamson", + journal={ArXiv}, + year={2023} +} +``` diff --git a/seamless_communication/docs/m4t/en_alignment.png b/seamless_communication/docs/m4t/en_alignment.png new file mode 100644 index 0000000..85d866d Binary files /dev/null and b/seamless_communication/docs/m4t/en_alignment.png differ diff --git a/seamless_communication/docs/m4t/on_device_README.md b/seamless_communication/docs/m4t/on_device_README.md new file mode 100644 index 0000000..8f9c455 --- /dev/null +++ b/seamless_communication/docs/m4t/on_device_README.md @@ -0,0 +1,61 @@ +# On-device Models [Experimental] + +Apart from SeamlessM4T-LARGE (2.3B) and SeamlessM4T-MEDIUM (1.2B) models, we are also developing a small model (281M) targeting for on-device inference. +This folder contains an example to run an exported small model covering most tasks (ASR/S2TT/S2ST). The model could be executed on popular mobile devices with Pytorch Mobile (https://pytorch.org/mobile/home/). + +## Updates +[2023/8/23] Uploaded new on-device models with several fixes to reduce size and avoid OOM. Metrics should be close to what's reported below, will rerun eval and update. + +## Overview +| Model | Checkpoint | Num Params | Disk Size | Supported Tasks | Supported Languages| +|---------|------------|----------|-------------|------------|-------------------------| +| UnitY-Small|[🤗 Model card](https://huggingface.co/facebook/seamless-m4t-unity-small) - [checkpoint](https://huggingface.co/facebook/seamless-m4t-unity-small/resolve/main/unity_on_device.ptl) | 281M | 747MB | S2ST, S2TT, ASR |eng, fra, hin, por, spa| +| UnitY-Small-S2T |[🤗 Model card](https://huggingface.co/facebook/seamless-m4t-unity-small-s2t) - [checkpoint](https://huggingface.co/facebook/seamless-m4t-unity-small-s2t/resolve/main/unity_on_device_s2t.ptl) | 235M | 481MB | S2TT, ASR |eng, fra,hin, por, spa| + +UnitY-Small-S2T is a pruned version of UnitY-Small without 2nd pass unit decoding. + +## Inference +To use exported model, users don't need seamless_communication or fairseq2 dependency. 
+```python +import torchaudio +import torch +audio_input, _ = torchaudio.load(TEST_AUDIO_PATH) # Load waveform using torchaudio + +s2t_model = torch.jit.load("unity_on_device_s2t.ptl") # Load exported S2T model +with torch.no_grad(): + text = s2t_model(audio_input, tgt_lang=TGT_LANG) # Forward call with tgt_lang specified for ASR or S2TT +print(text) # Show text output + +s2st_model = torch.jit.load("unity_on_device.ptl") +with torch.no_grad(): + text, units, waveform = s2st_model(audio_input, tgt_lang=TGT_LANG) # S2ST model also returns waveform +print(text) +torchaudio.save(f"{OUTPUT_FOLDER}/result.wav", waveform.unsqueeze(0), sample_rate=16000) # Save output waveform to local file +``` + + +Running the exported models also doesn't require a Python runtime. For example, you could load the model in C++ following [this tutorial](https://pytorch.org/tutorials/advanced/cpp_export.html), or build your own on-device application similar to [this example](https://github.com/pytorch/ios-demo-app/tree/master/SpeechRecognition). + + +## Metrics +### S2TT BLEU / S2ST ASR-BLEU on FLEURS +For ASR-BLEU, we follow the same protocol as for the SeamlessM4T Large/Medium models: we used Whisper-large-v2 for Eng-X and Whisper-medium for X-Eng when evaluating ASR-BLEU. +| Direction | 1st-pass BLEU (S2TT) | 2nd-pass ASR-BLEU (S2ST) | +|---------|----------------------|----------------------| +| eng-hin|10.43|15.06| +| eng-por|21.54|17.35| +| eng-rus|7.88|5.11| +| eng-spa|12.78|11.75| +| hin-eng|12.92|10.50| +| por-eng|22.99|24.81| +| rus-eng|18.24|18.24| +| spa-eng|14.37|14.85| + +### ASR WER on FLEURS +| LANG | WER | +|---------|----------------------| +| eng|27.3| +| hin|41.5| +| por|25.2| +| rus|33.0| +| spa|18.0| diff --git a/seamless_communication/docs/m4t/ru_alignment.png b/seamless_communication/docs/m4t/ru_alignment.png new file mode 100644 index 0000000..ca42419 Binary files /dev/null and b/seamless_communication/docs/m4t/ru_alignment.png differ diff --git a/seamless_communication/docs/m4t/seamless_align_README.md b/seamless_communication/docs/m4t/seamless_align_README.md new file mode 100644 index 0000000..2f2ea5a --- /dev/null +++ b/seamless_communication/docs/m4t/seamless_align_README.md @@ -0,0 +1,111 @@ +# Seamless - Speech to Speech and Speech to Text Metadata + +This document contains metadata information for reconstructing the dataset we used for training our models. + +## Format + +The metadata format is similar to [NLLB bitext format](https://github.com/facebookresearch/LASER/tree/main/data/nllb200) with some small differences. + +The metadata files are tab separated, gzip files. Each file corresponds to one alignment direction. + +File naming convention: + +- for text, we use 3 letters: e.g. `fra`, `eng`, `tur` +- for audio, we use 2 letters and an 'A': e.g. `frA`, `enA`, `trA` + +For example, the direction `eng-trA` corresponds to information for reconstructing English text with Turkish speech alignments. Similarly, `enA-jpn` corresponds to "English speech with Japanese text", and `enA-frA` corresponds to "English speech with French speech". + +Each line has 11 columns. + +For Audio, the columns correspond to: + + - `cc_warc`: The warc file reference containing the public audio url + - `cc_sha`: not used + - `audio_speech_segment_url`: space separated audio reference. See below.
+ - `cc_lineno`: not used + - `paragraph_digest`: expected duration of the whole audio file (without start/end frame trimming) + - `sentence_digest`: not used + - `text_lid_score`: not used + - `laser_score`: score of the alignment + - `direction`: direction, e.g. `enA-jpn` + - `side`: side, e.g. `enA` or `jpn` + - `line_no`: alignment number + +`audio_speech_segment_url` is a space separated audio reference. It has the following format: +` `, where `start_frame` and `end_frame` correspond to the segment that needs to be extracted from the audio file that is referenced at ``, resampled at 16000 Hz. + +For text, the columns are similar to NLLB format (except being tab separated here): + +- If the metadata comes from Common Crawl: + + - `cc_warc`: the reference to the Common Crawl WET file + - `cc_sha`: the document sha1 in the WET file + - `cc_document_url`: the url of the document referenced in the WET file + - `cc_lineno`: the line number in the document referenced in the WET file + - `paragraph_digest`: xxhash.xxh3_64_intdigest of the paragraph + - `sentence_digest`: xxhash.xxh3_64_intdigest of the sentence + - `text_lid_score`: language identification score, when available + - `laser_score`: score of the alignment + - `direction`: direction, e.g. `enA-jpn` + - `side`: side, e.g. `enA` or `jpn` + - `line_no`: alignment number + +- If the metadata comes from other corpus: + - `corpus`: corpus name + - `cc_sha`: not used + - `cc_document_url`: not used + - `lineno`: line number in the document + - `paragraph_digest`: xxhash.xxh3_64_intdigest of the paragraph + - `sentence_digest`: xxhash.xxh3_64_intdigest of the sentence + - `text_lid_score`: language identification score, when available + - `laser_score`: score of the alignment + - `direction`: direction, e.g. `enA-jpn` + - `side`: side, e.g. `enA` or `jpn` + - `line_no`: alignment number + +## Data + +Update: 30 Nov 2023 + + +We are publishing an extension of the previous speech to speech release. 
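To make the audio reference format above concrete, the sketch below resamples a downloaded audio file to 16 kHz and slices out one aligned segment using its start/end frames; the file names are hypothetical and `torchaudio` is assumed to be installed. The per-direction download links follow.

```python
# Hypothetical example of cutting one aligned segment out of a downloaded audio file.
# start_frame / end_frame are expressed at 16 kHz, as described in the format above.
import torchaudio


def extract_segment(audio_path: str, start_frame: int, end_frame: int):
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=16_000)
    return waveform[:, start_frame:end_frame]


segment = extract_segment("downloaded_audio.wav", start_frame=16_000, end_frame=48_000)
torchaudio.save("aligned_segment.wav", segment, sample_rate=16_000)
```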
+ + +[afA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.afA-enA.tsv.gz) [amA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.amA-enA.tsv.gz) [arA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.arA-enA.tsv.gz) [asA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.asA-enA.tsv.gz) [azA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.azA-enA.tsv.gz) [beA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.beA-enA.tsv.gz) [bgA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.bgA-enA.tsv.gz) [bnA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.bnA-enA.tsv.gz) [bsA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.bsA-enA.tsv.gz) [caA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.caA-enA.tsv.gz) [csA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.csA-enA.tsv.gz) [cyA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.cyA-enA.tsv.gz) [daA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.daA-enA.tsv.gz) [deA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.deA-enA.tsv.gz) [elA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.elA-enA.tsv.gz) [enA-esA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-esA.tsv.gz) [enA-etA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-etA.tsv.gz) [enA-fiA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-fiA.tsv.gz) [enA-frA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-frA.tsv.gz) [enA-gaA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-gaA.tsv.gz) [enA-glA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-glA.tsv.gz) [enA-guA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-guA.tsv.gz) [enA-heA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-heA.tsv.gz) [enA-hiA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-hiA.tsv.gz) [enA-hrA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-hrA.tsv.gz) 
[enA-huA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-huA.tsv.gz) [enA-hyA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-hyA.tsv.gz) [enA-idA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-idA.tsv.gz) [enA-isA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-isA.tsv.gz) [enA-itA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-itA.tsv.gz) [enA-jaA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-jaA.tsv.gz) [enA-jvA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-jvA.tsv.gz) [enA-kaA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-kaA.tsv.gz) [enA-kiA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-kiA.tsv.gz) [enA-kkA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-kkA.tsv.gz) [enA-knA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-knA.tsv.gz) [enA-koA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-koA.tsv.gz) [enA-kyA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-kyA.tsv.gz) [enA-lgA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-lgA.tsv.gz) [enA-loA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-loA.tsv.gz) [enA-ltA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-ltA.tsv.gz) [enA-lvA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-lvA.tsv.gz) [enA-mkA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-mkA.tsv.gz) [enA-mlA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-mlA.tsv.gz) [enA-mnA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-mnA.tsv.gz) [enA-mrA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-mrA.tsv.gz) [enA-msA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-msA.tsv.gz) [enA-mtA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-mtA.tsv.gz) [enA-neA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-neA.tsv.gz) [enA-nlA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-nlA.tsv.gz) 
[enA-noA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-noA.tsv.gz) [enA-orA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-orA.tsv.gz) [enA-paA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-paA.tsv.gz) [enA-pbA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-pbA.tsv.gz) [enA-plA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-plA.tsv.gz) [enA-psA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-psA.tsv.gz) [enA-ptA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-ptA.tsv.gz) [enA-rnA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-rnA.tsv.gz) [enA-ruA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-ruA.tsv.gz) [enA-sdA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-sdA.tsv.gz) [enA-skA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-skA.tsv.gz) [enA-slA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-slA.tsv.gz) [enA-srA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-srA.tsv.gz) [enA-svA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-svA.tsv.gz) [enA-swA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-swA.tsv.gz) [enA-taA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-taA.tsv.gz) [enA-teA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-teA.tsv.gz) [enA-tgA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-tgA.tsv.gz) [enA-thA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-thA.tsv.gz) [enA-trA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-trA.tsv.gz) [enA-ukA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-ukA.tsv.gz) [enA-urA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-urA.tsv.gz) [enA-uzA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-uzA.tsv.gz) [enA-viA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-viA.tsv.gz) [enA-yoA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-yoA.tsv.gz) 
[enA-zhA](https://dl.fbaipublicfiles.com/seamless/data/seamless_align_nov2023_extension/seamless.dataset.metadata.public.enA-zhA.tsv.gz) + + + +-------- + + +Update: 25 Sep 2023 + +We are publishing updated metadata with the expected duration of the original audio file in the column `paragraph_digest` (originally not used for audio). + +[arb-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.arb-enA.withduration.tsv.gz) [ben-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.ben-enA.withduration.tsv.gz) [cat-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.cat-enA.withduration.tsv.gz) [dan-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.dan-enA.withduration.tsv.gz) [enA-est](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-est.withduration.tsv.gz) [enA-fin](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-fin.withduration.tsv.gz) [enA-jpn](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-jpn.withduration.tsv.gz) [enA-mlt](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-mlt.withduration.tsv.gz) [enA-nld](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-nld.withduration.tsv.gz) [enA-pol](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-pol.withduration.tsv.gz) [enA-por](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-por.withduration.tsv.gz) [enA-ron](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-ron.withduration.tsv.gz) [enA-slk](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-slk.withduration.tsv.gz) [enA-swe](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-swe.withduration.tsv.gz) [enA-swh](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-swh.withduration.tsv.gz) [enA-tur](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-tur.withduration.tsv.gz) [enA-ukr](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-ukr.withduration.tsv.gz) [enA-urd](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-urd.withduration.tsv.gz) [enA-vie](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-vie.withduration.tsv.gz) [arA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.arA-enA.withduration.tsv.gz) [arA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.arA-eng.withduration.tsv.gz) [beA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.beA-enA.withduration.tsv.gz) [caA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.caA-enA.withduration.tsv.gz) [caA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.caA-eng.withduration.tsv.gz) [csA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.csA-enA.withduration.tsv.gz) [csA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.csA-eng.withduration.tsv.gz) [cyA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.cyA-enA.withduration.tsv.gz) 
[cyA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.cyA-eng.withduration.tsv.gz) [daA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.daA-enA.withduration.tsv.gz) [daA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.daA-eng.withduration.tsv.gz) [deA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.deA-enA.withduration.tsv.gz) [deA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.deA-eng.withduration.tsv.gz) [enA-esA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-esA.withduration.tsv.gz) [enA-fiA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-fiA.withduration.tsv.gz) [enA-frA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-frA.withduration.tsv.gz) [enA-hiA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-hiA.withduration.tsv.gz) [enA-idA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-idA.withduration.tsv.gz) [enA-itA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-itA.withduration.tsv.gz) [enA-knA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-knA.withduration.tsv.gz) [enA-koA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-koA.withduration.tsv.gz) [enA-mtA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-mtA.withduration.tsv.gz) [enA-nlA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-nlA.withduration.tsv.gz) [enA-plA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-plA.withduration.tsv.gz) [enA-ptA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-ptA.withduration.tsv.gz) [enA-rnA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-rnA.withduration.tsv.gz) [enA-ruA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-ruA.withduration.tsv.gz) [enA-skA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-skA.withduration.tsv.gz) [enA-svA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-svA.withduration.tsv.gz) [enA-swA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-swA.withduration.tsv.gz) [enA-taA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-taA.withduration.tsv.gz) [enA-teA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-teA.withduration.tsv.gz) [enA-tgA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-tgA.withduration.tsv.gz) [enA-thA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-thA.withduration.tsv.gz) [enA-trA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-trA.withduration.tsv.gz) [enA-ukA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-ukA.withduration.tsv.gz) [enA-urA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-urA.withduration.tsv.gz) [enA-uzA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-uzA.withduration.tsv.gz) 
[enA-viA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-viA.withduration.tsv.gz) [enA-zhA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-zhA.withduration.tsv.gz) [eng-esA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-esA.withduration.tsv.gz) [eng-fiA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-fiA.withduration.tsv.gz) [eng-frA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-frA.withduration.tsv.gz) [eng-hiA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-hiA.withduration.tsv.gz) [eng-idA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-idA.withduration.tsv.gz) [eng-itA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-itA.withduration.tsv.gz) [eng-knA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-knA.withduration.tsv.gz) [eng-koA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-koA.withduration.tsv.gz) [eng-mtA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-mtA.withduration.tsv.gz) [eng-nlA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-nlA.withduration.tsv.gz) [eng-plA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-plA.withduration.tsv.gz) [eng-ptA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-ptA.withduration.tsv.gz) [eng-rnA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-rnA.withduration.tsv.gz) [eng-ruA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-ruA.withduration.tsv.gz) [eng-skA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-skA.withduration.tsv.gz) [eng-swA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-swA.withduration.tsv.gz) [eng-taA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-taA.withduration.tsv.gz) [eng-teA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-teA.withduration.tsv.gz) [eng-tgA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-tgA.withduration.tsv.gz) [eng-thA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-thA.withduration.tsv.gz) [eng-trA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-trA.withduration.tsv.gz) [eng-ukA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-ukA.withduration.tsv.gz) [eng-urA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-urA.withduration.tsv.gz) [eng-uzA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-uzA.withduration.tsv.gz) [eng-viA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-viA.withduration.tsv.gz) [eng-zhA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-zhA.withduration.tsv.gz) + + +-------- + + +You can find the legacy metadata (without duration information) here: + +### Legacy Data + +[arb-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.arb-enA.tsv.gz) 
[ben-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.ben-enA.tsv.gz) [cat-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.cat-enA.tsv.gz) [dan-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.dan-enA.tsv.gz) [enA-est](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-est.tsv.gz) [enA-fin](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-fin.tsv.gz) [enA-jpn](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-jpn.tsv.gz) [enA-mlt](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-mlt.tsv.gz) [enA-nld](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-nld.tsv.gz) [enA-pol](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-pol.tsv.gz) [enA-por](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-por.tsv.gz) [enA-ron](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-ron.tsv.gz) [enA-slk](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-slk.tsv.gz) [enA-swe](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-swe.tsv.gz) [enA-swh](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-swh.tsv.gz) [enA-tur](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-tur.tsv.gz) [enA-ukr](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-ukr.tsv.gz) [enA-urd](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-urd.tsv.gz) [enA-vie](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-vie.tsv.gz) [arA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.arA-enA.tsv.gz) [arA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.arA-eng.tsv.gz) [beA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.beA-enA.tsv.gz) [caA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.caA-enA.tsv.gz) [caA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.caA-eng.tsv.gz) [csA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.csA-enA.tsv.gz) [csA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.csA-eng.tsv.gz) [cyA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.cyA-enA.tsv.gz) [cyA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.cyA-eng.tsv.gz) [daA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.daA-enA.tsv.gz) [daA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.daA-eng.tsv.gz) [deA-enA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.deA-enA.tsv.gz) [deA-eng](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.deA-eng.tsv.gz) [enA-esA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-esA.tsv.gz) [enA-fiA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-fiA.tsv.gz) [enA-frA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-frA.tsv.gz) 
[enA-hiA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-hiA.tsv.gz) [enA-idA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-idA.tsv.gz) [enA-itA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-itA.tsv.gz) [enA-knA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-knA.tsv.gz) [enA-koA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-koA.tsv.gz) [enA-mtA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-mtA.tsv.gz) [enA-nlA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-nlA.tsv.gz) [enA-plA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-plA.tsv.gz) [enA-ptA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-ptA.tsv.gz) [enA-rnA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-rnA.tsv.gz) [enA-ruA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-ruA.tsv.gz) [enA-skA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-skA.tsv.gz) [enA-svA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-svA.tsv.gz) [enA-swA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-swA.tsv.gz) [enA-taA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-taA.tsv.gz) [enA-teA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-teA.tsv.gz) [enA-tgA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-tgA.tsv.gz) [enA-thA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-thA.tsv.gz) [enA-trA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-trA.tsv.gz) [enA-ukA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-ukA.tsv.gz) [enA-urA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-urA.tsv.gz) [enA-uzA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-uzA.tsv.gz) [enA-viA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-viA.tsv.gz) [enA-zhA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.enA-zhA.tsv.gz) [eng-esA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-esA.tsv.gz) [eng-fiA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-fiA.tsv.gz) [eng-frA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-frA.tsv.gz) [eng-hiA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-hiA.tsv.gz) [eng-idA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-idA.tsv.gz) [eng-itA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-itA.tsv.gz) [eng-knA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-knA.tsv.gz) [eng-koA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-koA.tsv.gz) [eng-mtA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-mtA.tsv.gz) [eng-nlA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-nlA.tsv.gz) 
[eng-plA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-plA.tsv.gz) [eng-ptA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-ptA.tsv.gz) [eng-rnA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-rnA.tsv.gz) [eng-ruA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-ruA.tsv.gz) [eng-skA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-skA.tsv.gz) [eng-swA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-swA.tsv.gz) [eng-taA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-taA.tsv.gz) [eng-teA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-teA.tsv.gz) [eng-tgA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-tgA.tsv.gz) [eng-thA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-thA.tsv.gz) [eng-trA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-trA.tsv.gz) [eng-ukA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-ukA.tsv.gz) [eng-urA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-urA.tsv.gz) [eng-uzA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-uzA.tsv.gz) [eng-viA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-viA.tsv.gz) [eng-zhA](https://dl.fbaipublicfiles.com/seamless/data/seamless.dataset.metadata.public.eng-zhA.tsv.gz) + +## Download script + +You can use the `wet_lines` script to download and gather aligned text information from the metadata. This script can be found [here](https://github.com/kpu/preprocess/blob/wet/preprocess/wet_lines_main.cc). + +### Example usage: + +`zcat seamless.dataset.metadata.public.enA-swA.tsv.gz | egrep ^crawl-data | tr '\t' ' ' | wet_lines` + +Based on metadata information it receives from stdin, wet_lines will download the corpora, find the paragraph and print the input with an additional column which corresponds to the text of the paragraph. + +In order to retrieve the sentences from these paragraphs, one can use the sentence splitter available [here](https://github.com/facebookresearch/LASER/tree/main/utils). It will print the input (metadata + paragraph) with an additional column which corresponds to the text of the sentence. + +### Reconstructing sentences from metadata: + +`xzcat metadatafile.xz | egrep ^crawl-data | wet_lines | python -c "from sentence_cleaner_splitter.cleaner_splitter import *; split_clean()"` diff --git a/seamless_communication/docs/m4t/seamlessm4t_arch.svg b/seamless_communication/docs/m4t/seamlessm4t_arch.svg new file mode 100644 index 0000000..04fe2c3 --- /dev/null +++ b/seamless_communication/docs/m4t/seamlessm4t_arch.svg @@ -0,0 +1,1088 @@ + + + + + + + + + + + + + + + + +S +EAMLESS +M +4 +T +(v1) + + + + + + + + +x +text +( + +s +) + + + + + + + + + + + + + + + + + + + + + + + + + + + +Mel-Filterbanksextractor(bins +=80 +) + + + + + + + + + + +x +speech +( + +s +) + + + + + + + + + + + + + + + + + + +Transformertextencoder +S +EAMLESS +M +4 +T +- +N +L +L +B + + + + + + + + + + + + + + + + + + + + + +Conformerspeechencoder +W +2 +V +- +B +E +R +T +2 +. 
+0 + + + + + + + + + + + + + + + + + + + + + +Lengthadaptor + + + + + + + + + + + + + + + + + + + + + +Transformertextdecoder +S +EAMLESS +M +4 +T +- +N +L +L +B + + + + + + + + + + + +y +text +( + +t +) + + + + + + + + + + + + + + + + + + +subword-lengthT2Uencoder + + + + + + + + + + + + + + + + + + + + + +ARunitdecoder + + + + + + + + + + + +u +dedup + + + + + + + + + + + + + + + + + + +HiFi-GANunit-vocoder + + + + + + + + + + + +( + +t +) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Continuousdecoderoutput + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +S +EAMLESS +M +4 +T +V +2 + + + + + + + + +x +text +( + +s +) + + + + + + + + + + + + + + + + + + + + + + + + + + + +Mel-Filterbanksextractor(bins +=80 +) + + + + + + + + + + +x +speech +( + +s +) + + + + + + + + + + + + + + + + + + +Transformertextencoder +S +EAMLESS +M +4 +T +- +N +L +L +B + + + + + + + + + + + + + + + + + + + + + +Conformerspeechencoder +W +2 +V +- +B +E +R +T +2 +. +0 + + + + + + + + + + + + + + + + + + + + + +Lengthadaptor + + + + + + + + + + + + + + + + + + + + + +Transformertextdecoder +S +EAMLESS +M +4 +T +- +N +L +L +B + + + + + + + + + + + +y +text +( + +t +) + + + + + + + + + + + + + + + + + + +subword-lengthT2Uencoder + + + + + + + + + + + + + + + + + + + + + +subword-to-characterupsampler + + + + + + + + + + + + + + + + + + + + + +Unitdurationpredictor + + + + + + + + + + + + + + + + + + + + + +character-to-unitupsampler + + + + + + + + + + + + + + + + + + + + + +Aligner + + + + + + + + + + + +u +dup + + + + + + + + +y +text-char + + + + + + + + + + + + + +Trainingsupervision + + + + + + + + + + + + + + + + + + +NARunitdecoder + + + + + + + + + + + +u +dup + + + + + + + + + + + + + + + + + + +HiFi-GANunit-vocoder + + + + + + + + + + + +( + +t +) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Continuousdecoderoutput + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/seamless_communication/docs/m4t/unity2_aligner_README.md b/seamless_communication/docs/m4t/unity2_aligner_README.md new file mode 100644 index 0000000..61ebc84 --- /dev/null +++ b/seamless_communication/docs/m4t/unity2_aligner_README.md @@ -0,0 +1,76 @@ +# UnitY2 forced alignment extractor + +Please refer to Section 3.3.2 of the "Seamless: Multilingual Expressive and Streaming Speech Translation" paper to read more details about aligner design & training. + +We provide a light-weight wrapper to extract alignments between given text and acoustic unit sequences. Unit extractor is also available from the wrapper itself. + +## Alignment extractor codebase + +The entire codebase is located in `/src/seamless_communication/models/aligner`. It is built using fairseq2 library. This time we release a mutlilingual (38 languages following SeamlessM4Tv2 target languages) checkpoint to load the alignment toolkit. This checkpoint corresponds to `nar_t2u_aligner` asset card. + +## Usage examples + +For large-scale alignment extraction offline unit extraction is preferred. 
Refer to `/src/seamless_communication/cli/m4t/audio_to_units` for more details on offline unit extraction. + +**Alignment extractor initialization:** + +```python +from seamless_communication.models.aligner.alignment_extractor import AlignmentExtractor +from fairseq2.typing import Device +import torch + +extractor = AlignmentExtractor( + aligner_model_name_or_card="nar_t2u_aligner", + unit_extractor_model_name_or_card="xlsr2_1b_v2", + unit_extractor_output_layer=35, + unit_extractor_kmeans_model_uri="https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/kmeans_10k.npy", +) +``` +* the large unit extractor checkpoint will be downloaded; this takes some time + +* by default the CPU device is used, but fp16 (`dtype=torch.float16`) and CUDA (`device=Device("cuda")`) are supported; see the class constructor for details + +**Extracting alignment** + +Ru audio example: + +* audio link: `https://models.silero.ai/denoise_models/sample0.wav` (thanks to the Silero team for the public audio samples) + +* ru_transcription: `первое что меня поразило это необыкновенно яркий солнечный свет похожий на электросварку` + +```python +alignment_durations, _, tokenized_text_tokens = extractor.extract_alignment("sample0.wav", ru_transcription, plot=True, add_trailing_silence=True) +``` +* the audio will be resampled to 16 kHz for unit extraction + +* `alignment_durations` contains the number of units (20 ms frames) aligned to each token from `tokenized_text_tokens`. + +* `add_trailing_silence` appends an extra silence token at the end of the given text sequence. This is useful when the text itself has no terminal punctuation. + +Ru alignment plot: +![Ru alignment pic](ru_alignment.png) + +En audio example: + +* audio link: `https://dl.fbaipublicfiles.com/seamlessM4T/LJ037-0171_sr16k.wav` + +* en_transcription: `the examination and testimony of the experts enabled the commision to conclude that five shots may have been fired.` + +```python +alignment_durations, _, tokenized_text_tokens = extractor.extract_alignment("LJ037-0171_sr16k.wav", en_transcription, plot=True, add_trailing_silence=False) +``` +* here we set `add_trailing_silence` to False since terminal punctuation is present, but True would also work + +En alignment plot: +![En alignment pic](en_alignment.png) + +## Integration test + +If you encounter issues with the produced alignments, please run the integration test for the alignment extraction toolkit to make sure that your environment works correctly. + +Run from the repo root: + +`pytest -vv tests/integration/models/test_unity2_aligner.py` diff --git a/seamless_communication/docs/streaming/README.md b/seamless_communication/docs/streaming/README.md new file mode 100644 index 0000000..3a1118d --- /dev/null +++ b/seamless_communication/docs/streaming/README.md @@ -0,0 +1,46 @@ +# SeamlessStreaming +SeamlessStreaming is a multilingual streaming translation model. It supports: + +- Streaming Automatic Speech Recognition for 96 languages. +- Simultaneous translation from 101 source languages for speech input. +- Simultaneous translation into 96 target languages for text output. +- Simultaneous translation into 36 target languages for speech output. + +Check out the SeamlessM4T [README](../m4t/README.md) for more details on supported languages.
+ + +![SeamlessStreaming architecture](streaming_arch.png) + +## SeamlessStreaming models +| Model Name | #params | checkpoint | metrics | +| ------------------ | ------- | --------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------ | +| SeamlessStreaming | 2.5B | [🤗 Model card](https://huggingface.co/facebook/seamless-streaming) - [monotonic decoder checkpoint](https://huggingface.co/facebook/seamless-streaming/resolve/main/seamless_streaming_monotonic_decoder.pt) - [streaming UnitY2 checkpoint](https://huggingface.co/facebook/seamless-streaming/resolve/main/seamless_streaming_unity.pt) | [metrics](https://dl.fbaipublicfiles.com/seamless/metrics/streaming/seamless_streaming.zip) | + +The evaluation data ids for FLEURS, CoVoST2 and CVSS-C can be found [here](https://dl.fbaipublicfiles.com/seamless/metrics/evaluation_data_ids.zip) + + +## Evaluating SeamlessStreaming models +To reproduce our results, or to evaluate using the same metrics over your own test sets, please check out the [Evaluation README here](../../src/seamless_communication/cli/streaming/README.md). Streaming evaluation depends on the SimulEval library. + +## Citation + +For EMMA, please cite : +```bibtex +@article{ma_efficient_2023, + author={Ma, Xutai and Sun, Anna and Ouyang, Siqi and Inaguma, Hirofumi and Tomasello, Paden}, + title={Efficient Monotonic Multihead Attention}, + year={2023}, + url={https://ai.meta.com/research/publications/efficient-monotonic-multihead-attention/}, +} +``` + +For SeamlessStreaming, please cite : +```bibtex +@inproceedings{seamless2023, + title="Seamless: Multilingual Expressive and Streaming Speech Translation", + author="{Seamless Communication}, Lo{\"i}c Barrault, Yu-An Chung, Mariano Coria Meglioli, David Dale, Ning Dong, Mark Duppenthaler, Paul-Ambroise Duquenne, Brian Ellis, Hady Elsahar, Justin Haaheim, John Hoffman, Min-Jae Hwang, Hirofumi Inaguma, Christopher Klaiber, Ilia Kulikov, Pengwei Li, Daniel Licht, Jean Maillard, Ruslan Mavlyutov, Alice Rakotoarison, Kaushik Ram Sadagopan, Abinesh Ramakrishnan, Tuan Tran, Guillaume Wenzek, Yilin Yang, Ethan Ye, Ivan Evtimov, Pierre Fernandez, Cynthia Gao, Prangthip Hansanti, Elahe Kalbassi, Amanda Kallet, Artyom Kozhevnikov, Gabriel Mejia, Robin San Roman, Christophe Touret, Corinne Wong, Carleigh Wood, Bokai Yu, Pierre Andrews, Can Balioglu, Peng-Jen Chen, Marta R. 
Costa-juss{\`a}, Maha Elbayad, Hongyu Gong, Francisco Guzm{\'a}n, Kevin Heffernan, Somya Jain, Justine Kao, Ann Lee, Xutai Ma, Alex Mourachko, Benjamin Peloquin, Juan Pino, Sravya Popuri, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Anna Sun, Paden Tomasello, Changhan Wang, Jeff Wang, Skyler Wang, Mary Williamson", + journal={ArXiv}, + year={2023} +} +``` + diff --git a/seamless_communication/docs/streaming/streaming_arch.png b/seamless_communication/docs/streaming/streaming_arch.png new file mode 100644 index 0000000..4f5bbd9 Binary files /dev/null and b/seamless_communication/docs/streaming/streaming_arch.png differ diff --git a/seamless_communication/ggml/CMakeLists.txt b/seamless_communication/ggml/CMakeLists.txt new file mode 100644 index 0000000..239ec26 --- /dev/null +++ b/seamless_communication/ggml/CMakeLists.txt @@ -0,0 +1,195 @@ +cmake_minimum_required (VERSION 3.3) +project(ggml VERSION 0.1.0) + +set(CMAKE_EXPORT_COMPILE_COMMANDS "on") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + set(GGML_STANDALONE ON) + include(cmake/GitVars.cmake) + include(cmake/BuildTypes.cmake) +else() + set(GGML_STANDALONE OFF) +endif() + +# options + +option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON) +option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF) + +option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF) +option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF) +option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF) + +option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) +option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) + +option(GGML_TEST_COVERAGE "ggml: enable test coverage" OFF) + +option(GGML_PERF "ggml: enable perf timings" OFF) +option(GGML_NO_ACCELERATE "ggml: disable Accelerate framework" OFF) +option(GGML_OPENBLAS "ggml: use OpenBLAS" OFF) +option(GGML_CLBLAST "ggml: use clBLAST" OFF) +option(GGML_CUBLAS "ggml: use cuBLAS" OFF) +option(GGML_METAL "ggml: use Metal" OFF) + +# sanitizers + +if (GGML_SANITIZE_THREAD) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread") +endif() + +if (GGML_SANITIZE_ADDRESS) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") +endif() + +if (GGML_SANITIZE_UNDEFINED) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined") +endif() + +# instruction set specific +option(GGML_AVX "ggml: enable AVX" ON) +option(GGML_AVX2 "ggml: enable AVX2" ON) +option(GGML_AVX512 "ggml: enable AVX512" OFF) +option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) +option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) +option(GGML_FMA "ggml: enable FMA" ON) +# in MSVC F16C is implied with AVX2/AVX512 +if (NOT MSVC) + option(GGML_F16C "ggml: enable F16C" ON) +endif() + +#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math") +#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") +#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native") + +# warning flags + +if (GGML_ALL_WARNINGS) + if (NOT MSVC) + set(c_flags -Wall -Wpedantic -Wformat=2 -Wno-unused -Wstrict-prototypes) + set(cxx_flags -Wall -Wpedantic -Wformat=2) + else() + # todo : windows + 
endif() + + add_compile_options( + "$<$:${c_flags}>" + "$<$:${cxx_flags}>" + ) +endif() + +if (NOT MSVC) + add_compile_options( + "$<$:-Werror=vla>" + "$<$:-Werror=vla>" + "$<$:-Xcompiler;-Werror=vla>" + ) +endif() + +# +# POSIX conformance +# + +# clock_gettime came in POSIX.1b (1993) +# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional +# posix_memalign came in POSIX.1-2001 / SUSv3 +# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) +add_compile_definitions(_XOPEN_SOURCE=600) + +# Somehow in OpenBSD whenever POSIX conformance is specified +# some string functions rely on locale_t availability, +# which was introduced in POSIX.1-2008, forcing us to go higher +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + remove_definitions(-D_XOPEN_SOURCE=600) + add_compile_definitions(_XOPEN_SOURCE=700) +endif() + +# Data types, macros and functions related to controlling CPU affinity +# are available on Linux through GNU extensions in libc +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + add_compile_definitions(_GNU_SOURCE) +endif() + +# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, +# and on macOS its availability depends on enabling Darwin extensions +# similarly on DragonFly, enabling BSD extensions is necessary +if (CMAKE_SYSTEM_NAME MATCHES "Darwin") + add_compile_definitions(_DARWIN_C_SOURCE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "DragonFly") + add_compile_definitions(_DARWIN_C_SOURCE) +endif() + +# alloca is a non-standard interface that is not visible on BSDs when +# POSIX conformance is specified, but not all of them provide a clean way +# to enable it in such cases +if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + add_compile_definitions(__BSD_VISIBLE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") + add_compile_definitions(_NETBSD_SOURCE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + add_compile_definitions(_BSD_SOURCE) +endif() + +if (WHISPER_PERF) + set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF) +endif() + +# dependencies + +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) + +find_package(Threads REQUIRED) + +# main + +file(GLOB KALDI_NATIVE_FBANK_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/examples/kaldi-native-fbank/csrc/*" +) +add_library(kaldi-native-fbank STATIC ${KALDI_NATIVE_FBANK_SOURCES}) +target_include_directories(kaldi-native-fbank PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/examples/kaldi-native-fbank/csrc +) + + +if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo") +endif () + +if (GGML_BUILD_TESTS) + if (GGML_TEST_COVERAGE) + if (CMAKE_C_COMPILER_ID MATCHES "Clang") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-instr-generate -fcoverage-mapping") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping") + else() + message(WARNING "Test coverage is only supported for Clang") + endif() + endif() +endif() + +add_subdirectory(src) + +if (GGML_BUILD_TESTS) + enable_testing() + add_subdirectory(tests) +endif () + +if (GGML_BUILD_EXAMPLES) + add_subdirectory(examples) +endif () + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in + ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc + @ONLY) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc + DESTINATION share/pkgconfig) diff --git a/seamless_communication/ggml/LICENSE b/seamless_communication/ggml/LICENSE new file mode 100644 index 0000000..fb7ff0c --- /dev/null +++ b/seamless_communication/ggml/LICENSE @@ 
-0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Georgi Gerganov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/seamless_communication/ggml/Makefile b/seamless_communication/ggml/Makefile new file mode 100644 index 0000000..a44f00f --- /dev/null +++ b/seamless_communication/ggml/Makefile @@ -0,0 +1,47 @@ +build: build/examples/unity/libfairseq2_cpp.so ggml/build/bin/unity + +build/examples/unity/libfairseq2_cpp.so: Makefile examples/unity/*.h examples/unity/*.cpp src/ggml*.c + mkdir -p build + cd build; cmake\ + -DGGML_OPENBLAS=ON \ + -DBUILD_SHARED_LIBS=On \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_FLAGS="-g2 -fno-omit-frame-pointer" \ + -DTRACY_ENABLE=ON \ + .. + cd build; make -j4 fairseq2_cpp + find build/ -iname '*.so' + + +ggml/build/bin/unity: Makefile examples/unity/*.h examples/unity/*.cpp src/ggml*.c + mkdir -p build + cd build; cmake\ + -DGGML_OPENBLAS=ON \ + -DBUILD_SHARED_LIBS=On \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_FLAGS="-g2 -fno-omit-frame-pointer" \ + -DTRACY_ENABLE=ON \ + .. + cd build; make -j4 unity + find build/ -iname '*.so' + + +tests: build/src/libggml.so + pytest ./*.py -s + +build/src/libggml_cuda.so: Makefile examples/unity/*.h examples/unity/*.cpp + mkdir -p build + cd build; cmake\ + -DGGML_CUBLAS=ON \ + -DBUILD_SHARED_LIBS=On \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_FLAGS="-g2" \ + .. + cd build; make -j4 ggml + mv build/src/libggml.so build/src/libggml_cuda.so + find build/ -iname '*.so' + +cuda_tests: build/src/libggml_cuda.so + sed -i 's/lib_base_name = "ggml"/lib_base_name = "ggml_cuda"/' third_party_ggml.py + pytest ./*.py -s + sed -i 's/lib_base_name = "ggml_cuda"/lib_base_name = "ggml"/' third_party_ggml.py diff --git a/seamless_communication/ggml/README.md b/seamless_communication/ggml/README.md new file mode 100644 index 0000000..aebf58a --- /dev/null +++ b/seamless_communication/ggml/README.md @@ -0,0 +1,55 @@ +# unity.cpp + +## Introduction +[GGML](https://github.com/ggerganov/ggml) is an open source library in C to enable large model inference on various hardware platforms. We implemented unity.cpp in ggml. Now it supports SeamlessM4T model for X2T tasks - Speech-to-text translation (S2TT), Acoustic speech recognition (ASR), Text-to-text translation (T2TT). + +The project is still active in development. Contributions are welcome! 
+ +## Build +To build the interactive console for S2TT & ASR: +``` + +cd seamless_communication/ggml +mkdir build; cd build +cmake \ + -DGGML_OPENBLAS=ON \ + -DBUILD_SHARED_LIBS=On \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_FLAGS="-g2 -fno-omit-frame-pointer" \ + .. +make -j4 unity # Interactive Console + +``` +Note that `-DGGML_OPENBLAS=ON` is not necessary on macOS. + +For more build commands see the [Makefile](Makefile). + +## CLI usage +Command to launch an interactive console for S2TT & ASR. Note that the model already includes the vocabulary needed for detokenization. +``` +OPENBLAS_NUM_THREADS=8 ./bin/unity --model seamlessM4T_medium.ggml +``` +In the console, enter the path to a local waveform file and the target language, separated by a space. Note that the first run includes some “warm-up” time, so it may be slow. + +Converted ggml models can be downloaded from: +|SeamlessM4T_large | SeamlessM4T_medium | +|-------- | -------- | +| [model](https://dl.fbaipublicfiles.com/seamless/models/seamlessM4T_large.ggml) | [model](https://dl.fbaipublicfiles.com/seamless/models/seamlessM4T_medium.ggml) | + +## Fairseq2 model conversion +Models from fairseq2 checkpoints can be converted to ggml automatically with [ggml_convert.py](ggml_convert.py): +``` +python ggml_convert.py -m MODEL_NAME +``` +where MODEL_NAME corresponds to an asset card in fairseq2 / seamless_communication, e.g. seamlessM4T_medium or seamlessM4T_large. + +## Python bindings +We also use the ggml Python bindings for a better development experience. For examples of running unity.cpp from Python, refer to the tests in [test_unity_cpp.py](test_unity_cpp.py). + +## [Optional] Dependencies +### OpenBLAS +We strongly recommend building with OpenBLAS, as we have seen an 8x speedup on our test machine. + +### libsndfile +This is needed only by the console to load waveform files; the library itself does not require it.
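As a convenience, the pre-converted checkpoints linked in the table above can also be fetched directly from Python. This is only a small sketch (not part of the repository); the local file names are suggestions, and the downloads are several gigabytes each.

```python
# Fetch the pre-converted ggml checkpoints listed above so they can be passed
# to the interactive console, e.g. ./bin/unity --model seamlessM4T_medium.ggml
import urllib.request

MODELS = {
    "seamlessM4T_medium.ggml": "https://dl.fbaipublicfiles.com/seamless/models/seamlessM4T_medium.ggml",
    "seamlessM4T_large.ggml": "https://dl.fbaipublicfiles.com/seamless/models/seamlessM4T_large.ggml",
}

for filename, url in MODELS.items():
    print(f"Downloading {filename} ...")
    urllib.request.urlretrieve(url, filename)
```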
+ diff --git a/seamless_communication/ggml/build.zig b/seamless_communication/ggml/build.zig new file mode 100644 index 0000000..5aa379d --- /dev/null +++ b/seamless_communication/ggml/build.zig @@ -0,0 +1,158 @@ +const std = @import("std"); +const builtin = @import("builtin"); + +// Zig Version: 0.11.0 +// Zig Build Command: zig build +// Zig Run Command: zig build -h +// zig build run_dolly-v2 +// zig build run_gpt-2 +// zig build run_gpt-j +// zig build run_gpt-neox +// zig build run_mnist +// zig build run_mpt +// zig build run_replit +// zig build run_starcoder +// zig build run_test-grad0 +// zig build run_test-mul-mat0 +// zig build run_test-mul-mat2 +// zig build run_test-opt +// zig build run_test-vec1 +// zig build run_test0 +// zig build run_test1 +// zig build run_test2 +// zig build run_test3 +// zig build run_zig_test0 +// zig build run_zig_test1 +// zig build run_zig_test2 +// zig build run_zig_test3 +pub fn build(b: *std.build.Builder) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + const lib = b.addStaticLibrary(.{ + .name = "ggml", + .target = target, + .optimize = optimize, + }); + lib.addIncludePath(.{ .path = "./include" }); + lib.addIncludePath(.{ .path = "./include/ggml" }); + lib.addCSourceFiles(&.{ + "src/ggml.c", + }, &.{"-std=c11"}); + lib.linkLibC(); + lib.linkLibCpp(); + b.installArtifact(lib); + + // examples + const examples = .{ + "dolly-v2", + "gpt-2", + "gpt-j", + "gpt-neox", + "mnist", + "mpt", + "replit", + "starcoder", + // "whisper", + }; + inline for (examples) |name| { + const exe = b.addExecutable(.{ + .name = name, + .target = target, + .optimize = optimize, + }); + exe.addIncludePath(.{ .path = "./include" }); + exe.addIncludePath(.{ .path = "./include/ggml" }); + exe.addIncludePath(.{ .path = "./examples" }); + // exe.addIncludePath("./examples/whisper"); + exe.addCSourceFiles(&.{ + std.fmt.comptimePrint("examples/{s}/main.cpp", .{name}), + "examples/common.cpp", + "examples/common-ggml.cpp", + // "examples/whisper/whisper.cpp", + }, &.{"-std=c++11"}); + exe.linkLibrary(lib); + b.installArtifact(exe); + const run_cmd = b.addRunArtifact(exe); + run_cmd.step.dependOn(b.getInstallStep()); + if (b.args) |args| run_cmd.addArgs(args); + const run_step = b.step("run_" ++ name, "Run examples"); + run_step.dependOn(&run_cmd.step); + } + + // tests + const tests = if (builtin.target.cpu.arch == .x86_64) .{ + // "test-blas0", + // "test-grad0", + "test-mul-mat0", + // "test-mul-mat1", + "test-mul-mat2", + // "test-opt", + // "test-svd0", + // "test-vec0", + "test-vec1", + // "test-vec2", + "test0", + "test1", + "test2", + "test3", + } else .{ + // "test-blas0", + // "test-grad0", + "test-mul-mat0", + // "test-mul-mat1", + "test-mul-mat2", + // "test-opt", + // "test-svd0", + // "test-vec0", + // "test-vec1", + // "test-vec2", + "test0", + "test1", + "test2", + "test3", + }; + inline for (tests) |name| { + const exe = b.addExecutable(.{ + .name = name, + .target = target, + .optimize = optimize, + }); + exe.addIncludePath(.{ .path = "./include" }); + exe.addIncludePath(.{ .path = "./include/ggml" }); + exe.addCSourceFiles(&.{ + std.fmt.comptimePrint("tests/{s}.c", .{name}), + }, &.{"-std=c11"}); + exe.linkLibrary(lib); + b.installArtifact(exe); + const run_cmd = b.addRunArtifact(exe); + run_cmd.step.dependOn(b.getInstallStep()); + if (b.args) |args| run_cmd.addArgs(args); + const run_step = b.step("run_" ++ name, "Run tests"); + run_step.dependOn(&run_cmd.step); + } + + // zig_tests + const zig_tests 
= .{ + "test0", + "test1", + "test2", + "test3", + }; + inline for (zig_tests) |name| { + const exe = b.addExecutable(.{ + .name = name, + .root_source_file = .{ .path = std.fmt.comptimePrint("tests/{s}.zig", .{name}) }, + .target = target, + .optimize = optimize, + }); + exe.addIncludePath(.{ .path = "./include" }); + exe.addIncludePath(.{ .path = "./include/ggml" }); + exe.linkLibrary(lib); + b.installArtifact(exe); + const run_cmd = b.addRunArtifact(exe); + run_cmd.step.dependOn(b.getInstallStep()); + if (b.args) |args| run_cmd.addArgs(args); + const run_step = b.step("run_zig_" ++ name, "Run zig_tests"); + run_step.dependOn(&run_cmd.step); + } +} diff --git a/seamless_communication/ggml/ci/run.sh b/seamless_communication/ggml/ci/run.sh new file mode 100644 index 0000000..abb43e7 --- /dev/null +++ b/seamless_communication/ggml/ci/run.sh @@ -0,0 +1,260 @@ +#/bin/bash +# +# sample usage: +# +# mkdir tmp +# +# # CPU-only build +# bash ./ci/run.sh ./tmp/results ./tmp/mnt +# +# # with CUDA support +# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# + +if [ -z "$2" ]; then + echo "usage: $0 " + exit 1 +fi + +mkdir -p "$1" +mkdir -p "$2" + +OUT=$(realpath "$1") +MNT=$(realpath "$2") + +rm -v $OUT/*.log +rm -v $OUT/*.exit +rm -v $OUT/*.md + +sd=`dirname $0` +cd $sd/../ +SRC=`pwd` + +## helpers + +# download a file if it does not exist or if it is outdated +function gg_wget { + local out=$1 + local url=$2 + + local cwd=`pwd` + + mkdir -p $out + cd $out + + # should not re-download if file is the same + wget -nv -N $url + + cd $cwd +} + +function gg_printf { + printf -- "$@" >> $OUT/README.md +} + +function gg_run { + ci=$1 + + set -o pipefail + set -x + + gg_run_$ci | tee $OUT/$ci.log + cur=$? + echo "$cur" > $OUT/$ci.exit + + set +x + set +o pipefail + + gg_sum_$ci + + ret=$((ret | cur)) +} + +## ci + +# ctest_debug + +function gg_run_ctest_debug { + cd ${SRC} + + rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + + set +e +} + +function gg_sum_ctest_debug { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest in debug mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' + gg_printf '\n' +} + +# ctest_release + +function gg_run_ctest_release { + cd ${SRC} + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release .. 
) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + if [ -z $GG_BUILD_LOW_PERF ]; then + (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log + else + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + fi + + set +e +} + +function gg_sum_ctest_release { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest in release mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' +} + +# gpt_2 + +function gg_run_gpt_2 { + cd ${SRC} + + gg_wget models-mnt/gpt-2 https://huggingface.co/ggerganov/ggml/resolve/main/ggml-model-gpt-2-117M.bin + + cd build-ci-release + + set -e + + model="../models-mnt/gpt-2/ggml-model-gpt-2-117M.bin" + prompts="../examples/prompts/gpt-2.txt" + + (time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log + (time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log + + set +e +} + +function gg_sum_gpt_2 { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs short GPT-2 text generation\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-tg.log)" + gg_printf '```\n' +} + +# mpt + +function gg_run_mpt { + cd ${SRC} + + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/config.json + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer.json + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer_config.json + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/pytorch_model.bin.index.json + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/configuration_mpt.py + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00001-of-00002.bin + gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00002-of-00002.bin + + cd build-ci-release + + set -e + + path_models="../models-mnt/mpt/7B" + model_f16="${path_models}/ggml-model-f16.bin" + model_q4_0="${path_models}/ggml-model-q4_0.bin" + + python3 ../examples/mpt/convert-h5-to-ggml.py ${path_models} 1 + ./bin/mpt-quantize ${model_f16} ${model_q4_0} q4_0 + + (time ./bin/mpt --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log + (time ./bin/mpt --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log + + set +e +} + +function gg_sum_mpt { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs short MPT text generation\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-tg.log)" + gg_printf '```\n' +} + +# mnist + +function gg_run_mnist { + cd ${SRC} + + cd build-ci-release + + set -e + + mkdir -p models/mnist + python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict + + model_f32="./models/mnist/ggml-model-f32.bin" + samples="../examples/mnist/models/mnist/t10k-images.idx3-ubyte" + + # first command runs and exports "mnist.ggml", the second command runs the exported model + + (time ./bin/mnist ${model_f32} ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log + (time ./bin/mnist-cpu ./mnist.ggml ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log + + set +e +} + +function gg_sum_mnist { + 
gg_printf '### %s\n\n' "${ci}" + + gg_printf 'MNIST\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-mnist.log)" + gg_printf '```\n' +} + +## main + +if [ -z $GG_BUILD_LOW_PERF ]; then + rm -rf ${SRC}/models-mnt + + mnt_models=${MNT}/models + mkdir -p ${mnt_models} + ln -sfn ${mnt_models} ${SRC}/models-mnt +fi + +python3 -m pip install -r ${SRC}/requirements.txt + +ret=0 + +test $ret -eq 0 && gg_run ctest_debug +test $ret -eq 0 && gg_run ctest_release +test $ret -eq 0 && gg_run gpt_2 +test $ret -eq 0 && gg_run mnist + +if [ -z $GG_BUILD_LOW_PERF ]; then + test $ret -eq 0 && gg_run mpt +fi + +exit $ret diff --git a/seamless_communication/ggml/cmake/BuildTypes.cmake b/seamless_communication/ggml/cmake/BuildTypes.cmake new file mode 100644 index 0000000..a9c7b6c --- /dev/null +++ b/seamless_communication/ggml/cmake/BuildTypes.cmake @@ -0,0 +1,54 @@ +# Add new build types + +# ReleaseGG - Release with enabled asserts + +SET(CMAKE_CXX_FLAGS_RELEASEGG + "-O3" + CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts." + FORCE ) +SET(CMAKE_C_FLAGS_RELEASEGG + "-O3" + CACHE STRING "Flags used by the compiler during release builds with enabled asserts." + FORCE ) +SET(CMAKE_EXE_LINKER_FLAGS_RELEASEGG + "" + CACHE STRING "Flags used for linking binaries during release builds with enabled asserts." + FORCE ) +SET(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG + "" + CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts." + FORCE ) +MARK_AS_ADVANCED( + CMAKE_CXX_FLAGS_RELEASEGG + CMAKE_C_FLAGS_RELEASEGG + CMAKE_EXE_LINKER_FLAGS_RELEASEGG + CMAKE_SHARED_LINKER_FLAGS_RELEASEGG ) + +# RelWithDebInfoGG - RelWithDebInfo with enabled asserts + +SET(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG + "-O2 -g" + CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts." + FORCE ) +SET(CMAKE_C_FLAGS_RELWITHDEBINFOGG + "-O2 -g" + CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts." + FORCE ) +SET(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG + "" + CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts." + FORCE ) +SET(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG + "" + CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts." 
+ FORCE ) +MARK_AS_ADVANCED( + CMAKE_CXX_FLAGS_RELWITHDEBINFOGG + CMAKE_C_FLAGS_RELWITHDEBINFOGG + CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG + CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG ) + +if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG") +endif() diff --git a/seamless_communication/ggml/cmake/GitVars.cmake b/seamless_communication/ggml/cmake/GitVars.cmake new file mode 100644 index 0000000..1a4c24e --- /dev/null +++ b/seamless_communication/ggml/cmake/GitVars.cmake @@ -0,0 +1,22 @@ +find_package(Git) + +# the commit's SHA1 +execute_process(COMMAND + "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8 + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_SHA1 + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + +# the date of the commit +execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_DATE + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + +# the subject of the commit +execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%s + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_COMMIT_SUBJECT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/seamless_communication/ggml/ctypes_utils.py b/seamless_communication/ggml/ctypes_utils.py new file mode 100644 index 0000000..0b595bb --- /dev/null +++ b/seamless_communication/ggml/ctypes_utils.py @@ -0,0 +1,98 @@ +import ctypes +import dataclasses +import functools +import inspect +import types +from typing import Any, Callable, Generic, Optional, Type, TypeVar + +T = TypeVar("T") + + +class Ptr(Generic[T], ctypes._Pointer): # type: ignore + contents: T + + def __new__(cls, x: T) -> "Ptr[T]": + return ctypes.pointer(x) # type: ignore + + +NULLPTR: Ptr[Any] = None # type: ignore[assignment] + + +def c_struct(cls: Type[T]) -> Type[T]: + struct = types.new_class(cls.__name__, bases=(ctypes.Structure,)) + struct.__module__ = cls.__module__ + struct._fields_ = [ # type: ignore + (k, _py_type_to_ctype(v)) for k, v in cls.__annotations__.items() + ] + + def nice_init(self: T, *args: Any, **kwargs: Any) -> None: + dc = cls(*args, **kwargs) + for k, _ in self._fields_: # type: ignore + setattr(self, k, getattr(dc, k)) + + setattr(struct, "__init__", nice_init) + return struct + + +@functools.lru_cache(256) +def _py_type_to_ctype(t: type) -> type: + if isinstance(t, str): + raise ValueError( + f"Type parsing of '{t}' isn't supported, you need to provide a real type annotation." 
+ ) + if t is None: + return None + if isinstance(t, type): + if t.__module__ == "ctypes": + return t + if issubclass(t, ctypes.Structure): + return t + if issubclass(t, ctypes._Pointer): + return t + if t is int: + return ctypes.c_int + if t is float: + return ctypes.c_float + if t is bool: + return ctypes.c_bool + if t is bytes: + return ctypes.c_char_p + if t is str: + raise ValueError("str type is't supported by ctypes ?") + + if getattr(t, "__origin__", None) is Ptr: + pointee = _py_type_to_ctype(t.__args__[0]) # type: ignore + return ctypes.POINTER(pointee) + + return ctypes.c_void_p + + +F = TypeVar("F", bound=Callable[..., Any]) + + +def _c_fn(module: Any, fn: F) -> F: + if callable(module): + c_fn = module + else: + c_fn = getattr(module, fn.__name__) + annotations = fn.__annotations__ + if "return" not in annotations: + raise ValueError( + "@c_fn decorator requires type annotations on the decorated function." + ) + + c_fn.argtypes = [ + _py_type_to_ctype(t) for k, t in fn.__annotations__.items() if k != "return" + ] + c_fn.restype = _py_type_to_ctype(fn.__annotations__["return"]) + + @functools.wraps(fn) + def actual_fn(*args, **kwargs): # type: ignore + raw_res = c_fn(*args, **kwargs) + return raw_res + + return actual_fn # type: ignore + + +def c_fn(module: Any) -> Callable[[F], F]: + return functools.partial(_c_fn, module) diff --git a/seamless_communication/ggml/examples/CMakeLists.txt b/seamless_communication/ggml/examples/CMakeLists.txt new file mode 100644 index 0000000..df7b1c4 --- /dev/null +++ b/seamless_communication/ggml/examples/CMakeLists.txt @@ -0,0 +1,21 @@ +if (GGML_ALL_WARNINGS) + if (NOT MSVC) + set(cxx_flags + # TODO(marella): Add other warnings. + -Wpedantic + -Wunused-variable + -Wno-unused-function + -Wno-multichar + ) + add_compile_options("$<$:${cxx_flags}>") + endif() +endif() + +add_library(common STATIC common.cpp) +target_include_directories(common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +add_library(common-ggml STATIC common-ggml.cpp) +target_link_libraries(common-ggml PRIVATE ggml) +target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +add_subdirectory(unity) diff --git a/seamless_communication/ggml/examples/common-ggml.cpp b/seamless_communication/ggml/examples/common-ggml.cpp new file mode 100644 index 0000000..33ae03a --- /dev/null +++ b/seamless_communication/ggml/examples/common-ggml.cpp @@ -0,0 +1,246 @@ +#include "common-ggml.h" + +#include +#include + +static const std::map GGML_FTYPE_MAP = { + {"q4_0", GGML_FTYPE_MOSTLY_Q4_0}, + {"q4_1", GGML_FTYPE_MOSTLY_Q4_1}, + {"q5_0", GGML_FTYPE_MOSTLY_Q5_0}, + {"q5_1", GGML_FTYPE_MOSTLY_Q5_1}, + {"q8_0", GGML_FTYPE_MOSTLY_Q8_0}, +}; + +void ggml_print_ftypes(FILE * fp) { + for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) { + fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second); + } +} + +enum ggml_ftype ggml_parse_ftype(const char * str) { + enum ggml_ftype ftype; + if (str[0] == 'q') { + const auto it = GGML_FTYPE_MAP.find(str); + if (it == GGML_FTYPE_MAP.end()) { + fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str); + return GGML_FTYPE_UNKNOWN; + } + ftype = it->second; + } else { + ftype = (enum ggml_ftype) atoi(str); + } + + return ftype; +} + +bool ggml_common_quantize_0( + std::ifstream & finp, + std::ofstream & fout, + const ggml_ftype ftype, + const std::vector & to_quant, + const std::vector & to_skip) { + + ggml_type qtype = GGML_TYPE_F32; + + switch (ftype) { + case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; + 
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_UNKNOWN: + case GGML_FTYPE_ALL_F32: + case GGML_FTYPE_MOSTLY_F16: + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: + case GGML_FTYPE_MOSTLY_Q2_K: + case GGML_FTYPE_MOSTLY_Q3_K: + case GGML_FTYPE_MOSTLY_Q4_K: + case GGML_FTYPE_MOSTLY_Q5_K: + case GGML_FTYPE_MOSTLY_Q6_K: + { + fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); + return false; + } + }; + + if (!ggml_is_quantized(qtype)) { + fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype)); + return false; + } + + size_t total_size_org = 0; + size_t total_size_new = 0; + + std::vector work; + + std::vector data_u8; + std::vector data_f16; + std::vector data_f32; + + std::vector hist_all(1 << 4, 0); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + finp.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + finp.read(reinterpret_cast(&length), sizeof(length)); + finp.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (finp.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[4] = { 1, 1, 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + finp.read (&name[0], length); + + printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); + + bool quantize = false; + + // check if we should quantize this tensor + for (const auto & s : to_quant) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // check if we should skip this tensor + for (const auto & s : to_skip) { + if (std::regex_match(name, std::regex(s))) { + quantize = false; + break; + } + } + + // quantize only 2D tensors + quantize &= (n_dims == 2); + + if (quantize) { + if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) { + fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); + return false; + } + + if (ttype == GGML_TYPE_F16) { + data_f16.resize(nelements); + finp.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); + data_f32.resize(nelements); + for (int i = 0; i < nelements; ++i) { + data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); + } + } else { + data_f32.resize(nelements); + finp.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); + } + + ttype = qtype; + } else { + const int bpe = (ttype == 0) ? 
sizeof(float) : sizeof(uint16_t); + + data_u8.resize(nelements*bpe); + finp.read(reinterpret_cast(data_u8.data()), nelements * bpe); + } + + fout.write(reinterpret_cast(&n_dims), sizeof(n_dims)); + fout.write(reinterpret_cast(&length), sizeof(length)); + fout.write(reinterpret_cast(&ttype), sizeof(ttype)); + for (int i = 0; i < n_dims; ++i) { + fout.write(reinterpret_cast(&ne[i]), sizeof(ne[i])); + } + fout.write(&name[0], length); + + if (quantize) { + work.resize(nelements); // for quantization + + size_t cur_size = 0; + std::vector hist_cur(1 << 4, 0); + + switch ((ggml_type) ttype) { + case GGML_TYPE_Q4_0: + { + cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: + { + cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_0: + { + cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_1: + { + cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q8_0: + { + cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_COUNT: + { + fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); + return false; + } + } + + fout.write(reinterpret_cast(work.data()), cur_size); + total_size_new += cur_size; + + printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); + for (int i = 0; i < (int) hist_cur.size(); ++i) { + hist_all[i] += hist_cur[i]; + } + + for (int i = 0; i < (int) hist_cur.size(); ++i) { + printf("%5.3f ", hist_cur[i] / (float)nelements); + } + printf("\n"); + } else { + printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); + fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); + total_size_new += data_u8.size(); + } + + total_size_org += nelements * sizeof(float); + } + + printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype)); + + { + int64_t sum_all = 0; + for (int i = 0; i < (int) hist_all.size(); ++i) { + sum_all += hist_all[i]; + } + + printf("%s: hist: ", __func__); + for (int i = 0; i < (int) hist_all.size(); ++i) { + printf("%5.3f ", hist_all[i] / (float)sum_all); + } + printf("\n"); + } + + return true; +} diff --git a/seamless_communication/ggml/examples/common-ggml.h b/seamless_communication/ggml/examples/common-ggml.h new file mode 100644 index 0000000..477de34 --- /dev/null +++ b/seamless_communication/ggml/examples/common-ggml.h @@ -0,0 +1,18 @@ +#pragma once + +#include "ggml.h" + +#include +#include +#include + +enum ggml_ftype ggml_parse_ftype(const char * str); + +void ggml_print_ftypes(FILE * fp = stderr); + +bool ggml_common_quantize_0( + std::ifstream & finp, + std::ofstream & fout, + const ggml_ftype ftype, + const std::vector & to_quant, + const std::vector & to_skip); diff --git a/seamless_communication/ggml/examples/common.cpp 
b/seamless_communication/ggml/examples/common.cpp new file mode 100644 index 0000000..b754d38 --- /dev/null +++ b/seamless_communication/ggml/examples/common.cpp @@ -0,0 +1,809 @@ +#define _USE_MATH_DEFINES // for M_PI + +#include "common.h" + +// third-party utilities +// use your favorite implementations +#define DR_WAV_IMPLEMENTATION +#include "dr_wav.h" + +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// Function to check if the next argument exists +std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { + if (i + 1 < argc && argv[i + 1][0] != '-') { + return argv[++i]; + } else { + fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); + gpt_print_usage(argc, argv, params); + exit(0); + } +} + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-s" || arg == "--seed") { + params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-t" || arg == "--threads") { + params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { + params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-p" || arg == "--prompt") { + params.prompt = get_next_arg(i, argc, argv, arg, params); + } else if (arg == "-n" || arg == "--n_predict") { + params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--top_k") { + params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--top_p") { + params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--temp") { + params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--repeat-last-n") { + params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--repeat-penalty") { + params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-b" || arg == "--batch_size") { + params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-m" || arg == "--model") { + params.model = get_next_arg(i, argc, argv, arg, params); + } else if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + } else if (arg == "-ip" || arg == "--interactive-port") { + params.interactive = true; + params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-h" || arg == "--help") { + gpt_print_usage(argc, argv, params); + exit(0); + } else if (arg == "-f" || arg == "--file") { + get_next_arg(i, argc, argv, arg, params); + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + break; + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } else if (arg == "-tt" || arg == "--token_test") { + params.token_test = get_next_arg(i, argc, argv, arg, params); + } + else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(0); + } + } + + return true; +} + +void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { + fprintf(stderr, "usage: %s 
[options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); + fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); + fprintf(stderr, " prompt to start generation with (default: random)\n"); + fprintf(stderr, " -f FNAME, --file FNAME\n"); + fprintf(stderr, " load prompt from a file\n"); + fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); + fprintf(stderr, " test tokenization\n"); + fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); + fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); + fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); + fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); + fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n); + fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); + fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, "\n"); +} + +std::string gpt_random_prompt(std::mt19937 & rng) { + const int r = rng() % 10; + switch (r) { + case 0: return "So"; + case 1: return "Once upon a time"; + case 2: return "When"; + case 3: return "The"; + case 4: return "After"; + case 5: return "If"; + case 6: return "import"; + case 7: return "He"; + case 8: return "She"; + case 9: return "They"; + default: return "To"; + } + + return "The"; +} + +std::string trim(const std::string & s) { + std::regex e("^\\s+|\\s+$"); + return std::regex_replace(s, e, ""); +} + +std::string replace(const std::string & s, const std::string & from, const std::string & to) { + std::string result = s; + size_t pos = 0; + while ((pos = result.find(from, pos)) != std::string::npos) { + result.replace(pos, from.length(), to); + pos += to.length(); + } + return result; +} + +void gpt_vocab::add_special_token(const std::string & token) { + special_tokens.push_back(token); +} + +std::map json_parse(const std::string & fname) { + std::map result; + + // read file into string + std::string json; + { + std::ifstream ifs(fname); + if (!ifs) { + fprintf(stderr, "Failed to open %s\n", fname.c_str()); + exit(1); + } + + json = std::string((std::istreambuf_iterator(ifs)), + (std::istreambuf_iterator())); + } + + if (json[0] != '{') { + return result; + } + + // parse json + { + bool has_key = false; + bool in_token = false; + + std::string str_key = ""; + std::string str_val = ""; + + int n = json.size(); + for (int i = 1; i < n; ++i) { + if (!in_token) { + if (json[i] == ' ') continue; + if (json[i] == '"') { + in_token = true; + continue; + } + } else { + if (json[i] == '\\' && i+1 < n) { + if (has_key == false) { + str_key += json[i]; + } else { + str_val += json[i]; + } + ++i; + } else if (json[i] == '"') { + if (has_key == false) { + has_key = true; + ++i; + while (json[i] == ' ') ++i; + ++i; // : + while (json[i] == 
' ') ++i; + if (json[i] != '\"') { + while (json[i] != ',' && json[i] != '}') { + str_val += json[i++]; + } + has_key = false; + } else { + in_token = true; + continue; + } + } else { + has_key = false; + } + + str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space + str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line + str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> " + + try { + result[str_key] = std::stoi(str_val); + } catch (...) { + //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); + + } + str_key = ""; + str_val = ""; + in_token = false; + continue; + } + if (has_key == false) { + str_key += json[i]; + } else { + str_val += json[i]; + } + } + } + } + + return result; +} + +std::string convert_to_utf8(const std::wstring & input) { + std::wstring_convert> converter; + return converter.to_bytes(input); +} + + +std::wstring convert_to_wstring(const std::string & input) { + std::wstring_convert> converter; + return converter.from_bytes(input); +} + +void gpt_split_words(std::string str, std::vector& words) { + const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; + const std::regex re(pattern); + std::smatch m; + + while (std::regex_search(str, m, re)) { + for (auto x : m) { + words.push_back(x); + } + str = m.suffix(); + } +} + +std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { + std::vector words; + + // first split the text into words + { + std::string str = text; + + // Generate the subpattern from the special_tokens vector if it's not empty + if (!vocab.special_tokens.empty()) { + const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])"); + std::string special_tokens_subpattern; + for (const auto & token : vocab.special_tokens) { + if (!special_tokens_subpattern.empty()) { + special_tokens_subpattern += "|"; + } + special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)"); + } + + std::regex re(special_tokens_subpattern); + std::smatch m; + // Split the text by special tokens. + while (std::regex_search(str, m, re)) { + // Split the substrings in-between special tokens into words. + gpt_split_words(m.prefix(), words); + // Add matched special tokens as words. + for (auto x : m) { + words.push_back(x); + } + str = m.suffix(); + } + // Remaining text without special tokens will be handled below. 
+ } + + gpt_split_words(str, words); + } + + // find the longest token that forms each word in words: + std::vector tokens; + for (const auto & word : words) { + for (int i = 0; i < (int) word.size(); ){ + for (int j = word.size() - 1; j >= i; j--){ + auto cand = word.substr(i, j-i+1); + auto it = vocab.token_to_id.find(cand); + if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab + tokens.push_back(it->second); + i = j + 1; + break; + } + else if (j == i){ // word.substr(i, 1) has no matching + fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data()); + i++; + } + } + } + } + + return tokens; +} + +std::vector parse_tokens_from_string(const std::string& input, char delimiter) { + std::vector output; + std::stringstream ss(input); + std::string token; + + while (std::getline(ss, token, delimiter)) { + output.push_back(std::stoi(token)); + } + + return output; +} + +std::map> extract_tests_from_file(const std::string & fpath_test){ + if (fpath_test.empty()){ + fprintf(stderr, "%s : No test file found.\n", __func__); + return std::map>(); + } + + std::map> tests; + + auto fin = std::ifstream(fpath_test, std::ios_base::in); + const char * delimeter = " => "; + const char del_tok = ','; + std::string line; + while (std::getline(fin, line)) { + size_t delimiterPos = line.find(delimeter); + if (delimiterPos != std::string::npos) { + std::string text = line.substr(0, delimiterPos); + std::string s_tokens = line.substr(delimiterPos + std::strlen(delimeter)); + tests[text] = parse_tokens_from_string(s_tokens, del_tok); + } + } + return tests; +} + +void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){ + std::map> tests = extract_tests_from_file(fpath_test); + + size_t n_fails = 0; + + for (const auto & test : tests) { + std::vector tokens = gpt_tokenize(vocab, test.first); + + if (tokens != test.second){ + n_fails++; + + // print out failure cases + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str()); + fprintf(stderr, "%s : tokens in hf: ", __func__); + for (const auto & t : test.second) { + fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : tokens in ggml: ", __func__); + for (const auto & t : tokens) { + fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t); + } + fprintf(stderr, "\n"); + } + } + + fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size()); +} + +bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { + printf("%s: loading vocab from '%s'\n", __func__, fname.c_str()); + + vocab.token_to_id = ::json_parse(fname); + + for (const auto & kv : vocab.token_to_id) { + vocab.id_to_token[kv.second] = kv.first; + } + + printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size()); + + // print the vocabulary + //for (auto kv : vocab.token_to_id) { + // printf("'%s' -> %d\n", kv.first.data(), kv.second); + //} + + return true; +} + +gpt_vocab::id gpt_sample_top_k_top_p( + const gpt_vocab & vocab, + const float * logits, + int top_k, + double top_p, + double temp, + std::mt19937 & rng) { + int n_logits = vocab.id_to_token.size(); + + std::vector> logits_id; + logits_id.reserve(n_logits); + + { + const double scale = 1.0/temp; + for (int i = 0; i < n_logits; ++i) { + logits_id.push_back(std::make_pair(logits[i]*scale, i)); + } + } + + // find the top K tokens + std::partial_sort( + logits_id.begin(), + logits_id.begin() + top_k, logits_id.end(), + [](const std::pair & a, const 
std::pair & b) { + return a.first > b.first; + }); + + logits_id.resize(top_k); + + double maxl = -INFINITY; + for (const auto & kv : logits_id) { + maxl = std::max(maxl, kv.first); + } + + // compute probs for the top K tokens + std::vector probs; + probs.reserve(logits_id.size()); + + double sum = 0.0; + for (const auto & kv : logits_id) { + double p = exp(kv.first - maxl); + probs.push_back(p); + sum += p; + } + + // normalize the probs + for (auto & p : probs) { + p /= sum; + } + + if (top_p < 1.0f) { + double cumsum = 0.0f; + for (int i = 0; i < top_k; i++) { + cumsum += probs[i]; + if (cumsum >= top_p) { + top_k = i + 1; + probs.resize(top_k); + logits_id.resize(top_k); + break; + } + } + + cumsum = 1.0/cumsum; + for (int i = 0; i < (int) probs.size(); i++) { + probs[i] *= cumsum; + } + } + + //printf("\n"); + //for (int i = 0; i < (int) probs.size(); i++) { + // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); + //} + //exit(0); + + std::discrete_distribution<> dist(probs.begin(), probs.end()); + int idx = dist(rng); + + return logits_id[idx].second; +} + +gpt_vocab::id gpt_sample_top_k_top_p_repeat( + const gpt_vocab & vocab, + const float * logits, + const int32_t * last_n_tokens_data, + size_t last_n_tokens_data_size, + int top_k, + double top_p, + double temp, + int repeat_last_n, + float repeat_penalty, + std::mt19937 & rng) { + + int n_logits = vocab.id_to_token.size(); + + const auto * plogits = logits; + + const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size); + + if (temp <= 0) { + // select the token with the highest logit directly + float max_logit = plogits[0]; + gpt_vocab::id max_id = 0; + + for (int i = 1; i < n_logits; ++i) { + if (plogits[i] > max_logit) { + max_logit = plogits[i]; + max_id = i; + } + } + return max_id; + } + + + std::vector> logits_id; + logits_id.reserve(n_logits); + + { + const float scale = 1.0f/temp; + for (int i = 0; i < n_logits; ++i) { + // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) + // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main + if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) { + // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability + if (plogits[i] < 0.0f) { + logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); + } else { + logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); + } + } else { + logits_id.push_back(std::make_pair(plogits[i]*scale, i)); + } + } + } + + // find the top K tokens + std::partial_sort( + logits_id.begin(), + logits_id.begin() + top_k, logits_id.end(), + [](const std::pair & a, const std::pair & b) { + return a.first > b.first; + }); + + logits_id.resize(top_k); + + double maxl = -INFINITY; + for (const auto & kv : logits_id) { + maxl = std::max(maxl, kv.first); + } + + // compute probs for the top K tokens + std::vector probs; + probs.reserve(logits_id.size()); + + double sum = 0.0; + for (const auto & kv : logits_id) { + double p = exp(kv.first - maxl); + probs.push_back(p); + sum += p; + } + + // normalize the probs + for (auto & p : probs) { + p /= sum; + } + + if (top_p < 1.0f) { + double cumsum = 0.0f; + for (int i = 0; i < top_k; i++) { + cumsum += probs[i]; + if (cumsum >= top_p) { + top_k = i + 1; + probs.resize(top_k); + logits_id.resize(top_k); + break; + } + } + + cumsum = 1.0/cumsum; + for (int 
i = 0; i < (int) probs.size(); i++) { + probs[i] *= cumsum; + } + } + +// printf("\n"); +// for (int i = 0; i < (int) probs.size(); i++) { +// for (int i = 0; i < 10; i++) { +// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); +// } + + std::discrete_distribution<> dist(probs.begin(), probs.end()); + int idx = dist(rng); + + return logits_id[idx].second; + +} + +bool read_wav(const std::string & fname, std::vector& pcmf32, std::vector>& pcmf32s, bool stereo) { + drwav wav; + std::vector wav_data; // used for pipe input from stdin + + if (fname == "-") { + { + uint8_t buf[1024]; + while (true) + { + const size_t n = fread(buf, 1, sizeof(buf), stdin); + if (n == 0) { + break; + } + wav_data.insert(wav_data.end(), buf, buf + n); + } + } + + if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { + fprintf(stderr, "error: failed to open WAV file from stdin\n"); + return false; + } + + fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); + } + else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) { + fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str()); + return false; + } + + if (wav.channels != 1 && wav.channels != 2) { + fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str()); + return false; + } + + if (stereo && wav.channels != 2) { + fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str()); + return false; + } + + if (wav.sampleRate != COMMON_SAMPLE_RATE) { + fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000); + return false; + } + + if (wav.bitsPerSample != 16) { + fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str()); + return false; + } + + const uint64_t n = wav_data.empty() ? 
wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8); + + std::vector pcm16; + pcm16.resize(n*wav.channels); + drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); + drwav_uninit(&wav); + + // convert to mono, float + pcmf32.resize(n); + if (wav.channels == 1) { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[i])/32768.0f; + } + } else { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f; + } + } + + if (stereo) { + // convert to stereo, float + pcmf32s.resize(2); + + pcmf32s[0].resize(n); + pcmf32s[1].resize(n); + for (uint64_t i = 0; i < n; i++) { + pcmf32s[0][i] = float(pcm16[2*i])/32768.0f; + pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f; + } + } + + return true; +} + +void high_pass_filter(std::vector & data, float cutoff, float sample_rate) { + const float rc = 1.0f / (2.0f * M_PI * cutoff); + const float dt = 1.0f / sample_rate; + const float alpha = dt / (rc + dt); + + float y = data[0]; + + for (size_t i = 1; i < data.size(); i++) { + y = alpha * (y + data[i] - data[i - 1]); + data[i] = y; + } +} + +bool vad_simple(std::vector & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) { + const int n_samples = pcmf32.size(); + const int n_samples_last = (sample_rate * last_ms) / 1000; + + if (n_samples_last >= n_samples) { + // not enough samples - assume no speech + return false; + } + + if (freq_thold > 0.0f) { + high_pass_filter(pcmf32, freq_thold, sample_rate); + } + + float energy_all = 0.0f; + float energy_last = 0.0f; + + for (int i = 0; i < n_samples; i++) { + energy_all += fabsf(pcmf32[i]); + + if (i >= n_samples - n_samples_last) { + energy_last += fabsf(pcmf32[i]); + } + } + + energy_all /= n_samples; + energy_last /= n_samples_last; + + if (verbose) { + fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); + } + + if (energy_last > vad_thold*energy_all) { + return false; + } + + return true; +} + +float similarity(const std::string & s0, const std::string & s1) { + const size_t len0 = s0.size() + 1; + const size_t len1 = s1.size() + 1; + + std::vector col(len1, 0); + std::vector prevCol(len1, 0); + + for (size_t i = 0; i < len1; i++) { + prevCol[i] = i; + } + + for (size_t i = 0; i < len0; i++) { + col[0] = i; + for (size_t j = 1; j < len1; j++) { + col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 
0 : 1)); + } + col.swap(prevCol); + } + + const float dist = prevCol[len1 - 1]; + + return 1.0f - (dist / std::max(s0.size(), s1.size())); +} + +bool sam_params_parse(int argc, char ** argv, sam_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-s" || arg == "--seed") { + params.seed = std::stoi(argv[++i]); + } else if (arg == "-t" || arg == "--threads") { + params.n_threads = std::stoi(argv[++i]); + } else if (arg == "-m" || arg == "--model") { + params.model = argv[++i]; + } else if (arg == "-i" || arg == "--inp") { + params.fname_inp = argv[++i]; + } else if (arg == "-o" || arg == "--out") { + params.fname_out = argv[++i]; + } else if (arg == "-h" || arg == "--help") { + sam_print_usage(argc, argv, params); + exit(0); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + sam_print_usage(argc, argv, params); + exit(0); + } + } + + return true; +} + +void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " -i FNAME, --inp FNAME\n"); + fprintf(stderr, " input file (default: %s)\n", params.fname_inp.c_str()); + fprintf(stderr, " -o FNAME, --out FNAME\n"); + fprintf(stderr, " output file (default: %s)\n", params.fname_out.c_str()); + fprintf(stderr, "\n"); +} \ No newline at end of file diff --git a/seamless_communication/ggml/examples/common.h b/seamless_communication/ggml/examples/common.h new file mode 100644 index 0000000..0462ae3 --- /dev/null +++ b/seamless_communication/ggml/examples/common.h @@ -0,0 +1,178 @@ +// Various helper functions and utilities + +#pragma once + +#include +#include +#include +#include +#include + +#define COMMON_SAMPLE_RATE 16000 + +// +// GPT CLI argument parsing +// + +struct gpt_params { + int32_t seed = -1; // RNG seed + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + int32_t n_predict = 200; // new tokens to predict + int32_t n_batch = 8; // batch size for prompt processing + + // sampling parameters + int32_t top_k = 40; + float top_p = 0.9f; + float temp = 0.9f; + int32_t repeat_last_n = 64; + float repeat_penalty = 1.00f; + + std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path + std::string prompt = ""; + std::string token_test = ""; + + bool interactive = false; + int32_t interactive_port = -1; + + int32_t n_gpu_layers = 0; +}; + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params); + +void gpt_print_usage(int argc, char ** argv, const gpt_params & params); + + + +std::string gpt_random_prompt(std::mt19937 & rng); + +// +// Vocab utils +// + +std::string trim(const std::string & s); + +std::string replace( + const std::string & s, + const std::string & from, + const std::string & to); + +struct gpt_vocab { + using id = int32_t; + using token = std::string; + + std::map token_to_id; + std::map id_to_token; + std::vector special_tokens; + + void add_special_token(const std::string & token); +}; + +// poor-man's JSON parsing +std::map json_parse(const std::string & fname); + +std::string 
convert_to_utf8(const std::wstring & input); + +std::wstring convert_to_wstring(const std::string & input); + +void gpt_split_words(std::string str, std::vector& words); + +// split text into tokens +// +// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 +// +// Regex (Python): +// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" +// +// Regex (C++): +// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" +// +std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text); + +// test outputs of gpt_tokenize +// +// - compare with tokens generated by the huggingface tokenizer +// - test cases are chosen based on the model's main language (under 'prompt' directory) +// - if all sentences are tokenized identically, print 'All tests passed.' +// - otherwise, print sentence, huggingface tokens, ggml tokens +// +void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test); + +// load the tokens from encoder.json +bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); + +// sample next token given probabilities for each embedding +// +// - consider only the top K tokens +// - from them, consider only the top tokens with cumulative probability > P +// +// TODO: not sure if this implementation is correct +// TODO: temperature is not implemented +// +gpt_vocab::id gpt_sample_top_k_top_p( + const gpt_vocab & vocab, + const float * logits, + int top_k, + double top_p, + double temp, + std::mt19937 & rng); + +gpt_vocab::id gpt_sample_top_k_top_p_repeat( + const gpt_vocab & vocab, + const float * logits, + const int32_t * last_n_tokens_data, + size_t last_n_tokens_data_size, + int top_k, + double top_p, + double temp, + int repeat_last_n, + float repeat_penalty, + std::mt19937 & rng); + +// +// Audio utils +// + +// Read WAV audio file and store the PCM data into pcmf32 +// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE +// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM +bool read_wav( + const std::string & fname, + std::vector & pcmf32, + std::vector> & pcmf32s, + bool stereo); + +// Apply a high-pass frequency filter to PCM audio +// Suppresses frequencies below cutoff Hz +void high_pass_filter( + std::vector & data, + float cutoff, + float sample_rate); + +// Basic voice activity detection (VAD) using audio energy adaptive threshold +bool vad_simple( + std::vector & pcmf32, + int sample_rate, + int last_ms, + float vad_thold, + float freq_thold, + bool verbose); + +// compute similarity between two strings using Levenshtein distance +float similarity(const std::string & s0, const std::string & s1); + +// +// SAM argument parsing +// + +struct sam_params { + int32_t seed = -1; // RNG seed + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + + std::string model = "models/sam-vit-b/ggml-model-f16.bin"; // model path + std::string fname_inp = "img.jpg"; + std::string fname_out = "img.out"; +}; + +bool sam_params_parse(int argc, char ** argv, sam_params & params); + +void sam_print_usage(int argc, char ** argv, const sam_params & params); diff --git a/seamless_communication/ggml/examples/dr_wav.h b/seamless_communication/ggml/examples/dr_wav.h new file mode 100644 index 0000000..fd3e95b --- /dev/null +++ b/seamless_communication/ggml/examples/dr_wav.h @@ -0,0 +1,6434 @@ +/* +WAV audio loader and writer. Choice of public domain or MIT-0. 
See license statements at the end of this file. +dr_wav - v0.12.16 - 2020-12-02 + +David Reid - mackron@gmail.com + +GitHub: https://github.com/mackron/dr_libs +*/ + +/* +RELEASE NOTES - VERSION 0.12 +============================ +Version 0.12 includes breaking changes to custom chunk handling. + + +Changes to Chunk Callback +------------------------- +dr_wav supports the ability to fire a callback when a chunk is encounted (except for WAVE and FMT chunks). The callback has been updated to include both the +container (RIFF or Wave64) and the FMT chunk which contains information about the format of the data in the wave file. + +Previously, there was no direct way to determine the container, and therefore no way to discriminate against the different IDs in the chunk header (RIFF and +Wave64 containers encode chunk ID's differently). The `container` parameter can be used to know which ID to use. + +Sometimes it can be useful to know the data format at the time the chunk callback is fired. A pointer to a `drwav_fmt` object is now passed into the chunk +callback which will give you information about the data format. To determine the sample format, use `drwav_fmt_get_format()`. This will return one of the +`DR_WAVE_FORMAT_*` tokens. +*/ + +/* +Introduction +============ +This is a single file library. To use it, do something like the following in one .c file. + + ```c + #define DR_WAV_IMPLEMENTATION + #include "dr_wav.h" + ``` + +You can then #include this file in other parts of the program as you would with any other header file. Do something like the following to read audio data: + + ```c + drwav wav; + if (!drwav_init_file(&wav, "my_song.wav", NULL)) { + // Error opening WAV file. + } + + drwav_int32* pDecodedInterleavedPCMFrames = malloc(wav.totalPCMFrameCount * wav.channels * sizeof(drwav_int32)); + size_t numberOfSamplesActuallyDecoded = drwav_read_pcm_frames_s32(&wav, wav.totalPCMFrameCount, pDecodedInterleavedPCMFrames); + + ... + + drwav_uninit(&wav); + ``` + +If you just want to quickly open and read the audio data in a single operation you can do something like this: + + ```c + unsigned int channels; + unsigned int sampleRate; + drwav_uint64 totalPCMFrameCount; + float* pSampleData = drwav_open_file_and_read_pcm_frames_f32("my_song.wav", &channels, &sampleRate, &totalPCMFrameCount, NULL); + if (pSampleData == NULL) { + // Error opening and reading WAV file. + } + + ... + + drwav_free(pSampleData); + ``` + +The examples above use versions of the API that convert the audio data to a consistent format (32-bit signed PCM, in this case), but you can still output the +audio data in its internal format (see notes below for supported formats): + + ```c + size_t framesRead = drwav_read_pcm_frames(&wav, wav.totalPCMFrameCount, pDecodedInterleavedPCMFrames); + ``` + +You can also read the raw bytes of audio data, which could be useful if dr_wav does not have native support for a particular data format: + + ```c + size_t bytesRead = drwav_read_raw(&wav, bytesToRead, pRawDataBuffer); + ``` + +dr_wav can also be used to output WAV files. This does not currently support compressed formats. To use this, look at `drwav_init_write()`, +`drwav_init_file_write()`, etc. Use `drwav_write_pcm_frames()` to write samples, or `drwav_write_raw()` to write raw data in the "data" chunk. + + ```c + drwav_data_format format; + format.container = drwav_container_riff; // <-- drwav_container_riff = normal WAV files, drwav_container_w64 = Sony Wave64. 
+ format.format = DR_WAVE_FORMAT_PCM; // <-- Any of the DR_WAVE_FORMAT_* codes. + format.channels = 2; + format.sampleRate = 44100; + format.bitsPerSample = 16; + drwav_init_file_write(&wav, "data/recording.wav", &format, NULL); + + ... + + drwav_uint64 framesWritten = drwav_write_pcm_frames(pWav, frameCount, pSamples); + ``` + +dr_wav has seamless support the Sony Wave64 format. The decoder will automatically detect it and it should Just Work without any manual intervention. + + +Build Options +============= +#define these options before including this file. + +#define DR_WAV_NO_CONVERSION_API + Disables conversion APIs such as `drwav_read_pcm_frames_f32()` and `drwav_s16_to_f32()`. + +#define DR_WAV_NO_STDIO + Disables APIs that initialize a decoder from a file such as `drwav_init_file()`, `drwav_init_file_write()`, etc. + + + +Notes +===== +- Samples are always interleaved. +- The default read function does not do any data conversion. Use `drwav_read_pcm_frames_f32()`, `drwav_read_pcm_frames_s32()` and `drwav_read_pcm_frames_s16()` + to read and convert audio data to 32-bit floating point, signed 32-bit integer and signed 16-bit integer samples respectively. Tested and supported internal + formats include the following: + - Unsigned 8-bit PCM + - Signed 12-bit PCM + - Signed 16-bit PCM + - Signed 24-bit PCM + - Signed 32-bit PCM + - IEEE 32-bit floating point + - IEEE 64-bit floating point + - A-law and u-law + - Microsoft ADPCM + - IMA ADPCM (DVI, format code 0x11) +- dr_wav will try to read the WAV file as best it can, even if it's not strictly conformant to the WAV format. +*/ + +#ifndef dr_wav_h +#define dr_wav_h + +#ifdef __cplusplus +extern "C" { +#endif + +#define DRWAV_STRINGIFY(x) #x +#define DRWAV_XSTRINGIFY(x) DRWAV_STRINGIFY(x) + +#define DRWAV_VERSION_MAJOR 0 +#define DRWAV_VERSION_MINOR 12 +#define DRWAV_VERSION_REVISION 16 +#define DRWAV_VERSION_STRING DRWAV_XSTRINGIFY(DRWAV_VERSION_MAJOR) "." DRWAV_XSTRINGIFY(DRWAV_VERSION_MINOR) "." DRWAV_XSTRINGIFY(DRWAV_VERSION_REVISION) + +#include /* For size_t. */ + +/* Sized types. 
*/ +typedef signed char drwav_int8; +typedef unsigned char drwav_uint8; +typedef signed short drwav_int16; +typedef unsigned short drwav_uint16; +typedef signed int drwav_int32; +typedef unsigned int drwav_uint32; +#if defined(_MSC_VER) + typedef signed __int64 drwav_int64; + typedef unsigned __int64 drwav_uint64; +#else + #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wlong-long" + #if defined(__clang__) + #pragma GCC diagnostic ignored "-Wc++11-long-long" + #endif + #endif + typedef signed long long drwav_int64; + typedef unsigned long long drwav_uint64; + #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) + #pragma GCC diagnostic pop + #endif +#endif +#if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(__powerpc64__) + typedef drwav_uint64 drwav_uintptr; +#else + typedef drwav_uint32 drwav_uintptr; +#endif +typedef drwav_uint8 drwav_bool8; +typedef drwav_uint32 drwav_bool32; +#define DRWAV_TRUE 1 +#define DRWAV_FALSE 0 + +#if !defined(DRWAV_API) + #if defined(DRWAV_DLL) + #if defined(_WIN32) + #define DRWAV_DLL_IMPORT __declspec(dllimport) + #define DRWAV_DLL_EXPORT __declspec(dllexport) + #define DRWAV_DLL_PRIVATE static + #else + #if defined(__GNUC__) && __GNUC__ >= 4 + #define DRWAV_DLL_IMPORT __attribute__((visibility("default"))) + #define DRWAV_DLL_EXPORT __attribute__((visibility("default"))) + #define DRWAV_DLL_PRIVATE __attribute__((visibility("hidden"))) + #else + #define DRWAV_DLL_IMPORT + #define DRWAV_DLL_EXPORT + #define DRWAV_DLL_PRIVATE static + #endif + #endif + + #if defined(DR_WAV_IMPLEMENTATION) || defined(DRWAV_IMPLEMENTATION) + #define DRWAV_API DRWAV_DLL_EXPORT + #else + #define DRWAV_API DRWAV_DLL_IMPORT + #endif + #define DRWAV_PRIVATE DRWAV_DLL_PRIVATE + #else + #define DRWAV_API extern + #define DRWAV_PRIVATE static + #endif +#endif + +typedef drwav_int32 drwav_result; +#define DRWAV_SUCCESS 0 +#define DRWAV_ERROR -1 /* A generic error. 
*/ +#define DRWAV_INVALID_ARGS -2 +#define DRWAV_INVALID_OPERATION -3 +#define DRWAV_OUT_OF_MEMORY -4 +#define DRWAV_OUT_OF_RANGE -5 +#define DRWAV_ACCESS_DENIED -6 +#define DRWAV_DOES_NOT_EXIST -7 +#define DRWAV_ALREADY_EXISTS -8 +#define DRWAV_TOO_MANY_OPEN_FILES -9 +#define DRWAV_INVALID_FILE -10 +#define DRWAV_TOO_BIG -11 +#define DRWAV_PATH_TOO_LONG -12 +#define DRWAV_NAME_TOO_LONG -13 +#define DRWAV_NOT_DIRECTORY -14 +#define DRWAV_IS_DIRECTORY -15 +#define DRWAV_DIRECTORY_NOT_EMPTY -16 +#define DRWAV_END_OF_FILE -17 +#define DRWAV_NO_SPACE -18 +#define DRWAV_BUSY -19 +#define DRWAV_IO_ERROR -20 +#define DRWAV_INTERRUPT -21 +#define DRWAV_UNAVAILABLE -22 +#define DRWAV_ALREADY_IN_USE -23 +#define DRWAV_BAD_ADDRESS -24 +#define DRWAV_BAD_SEEK -25 +#define DRWAV_BAD_PIPE -26 +#define DRWAV_DEADLOCK -27 +#define DRWAV_TOO_MANY_LINKS -28 +#define DRWAV_NOT_IMPLEMENTED -29 +#define DRWAV_NO_MESSAGE -30 +#define DRWAV_BAD_MESSAGE -31 +#define DRWAV_NO_DATA_AVAILABLE -32 +#define DRWAV_INVALID_DATA -33 +#define DRWAV_TIMEOUT -34 +#define DRWAV_NO_NETWORK -35 +#define DRWAV_NOT_UNIQUE -36 +#define DRWAV_NOT_SOCKET -37 +#define DRWAV_NO_ADDRESS -38 +#define DRWAV_BAD_PROTOCOL -39 +#define DRWAV_PROTOCOL_UNAVAILABLE -40 +#define DRWAV_PROTOCOL_NOT_SUPPORTED -41 +#define DRWAV_PROTOCOL_FAMILY_NOT_SUPPORTED -42 +#define DRWAV_ADDRESS_FAMILY_NOT_SUPPORTED -43 +#define DRWAV_SOCKET_NOT_SUPPORTED -44 +#define DRWAV_CONNECTION_RESET -45 +#define DRWAV_ALREADY_CONNECTED -46 +#define DRWAV_NOT_CONNECTED -47 +#define DRWAV_CONNECTION_REFUSED -48 +#define DRWAV_NO_HOST -49 +#define DRWAV_IN_PROGRESS -50 +#define DRWAV_CANCELLED -51 +#define DRWAV_MEMORY_ALREADY_MAPPED -52 +#define DRWAV_AT_END -53 + +/* Common data formats. */ +#define DR_WAVE_FORMAT_PCM 0x1 +#define DR_WAVE_FORMAT_ADPCM 0x2 +#define DR_WAVE_FORMAT_IEEE_FLOAT 0x3 +#define DR_WAVE_FORMAT_ALAW 0x6 +#define DR_WAVE_FORMAT_MULAW 0x7 +#define DR_WAVE_FORMAT_DVI_ADPCM 0x11 +#define DR_WAVE_FORMAT_EXTENSIBLE 0xFFFE + +/* Constants. */ +#ifndef DRWAV_MAX_SMPL_LOOPS +#define DRWAV_MAX_SMPL_LOOPS 1 +#endif + +/* Flags to pass into drwav_init_ex(), etc. */ +#define DRWAV_SEQUENTIAL 0x00000001 + +DRWAV_API void drwav_version(drwav_uint32* pMajor, drwav_uint32* pMinor, drwav_uint32* pRevision); +DRWAV_API const char* drwav_version_string(void); + +typedef enum +{ + drwav_seek_origin_start, + drwav_seek_origin_current +} drwav_seek_origin; + +typedef enum +{ + drwav_container_riff, + drwav_container_w64, + drwav_container_rf64 +} drwav_container; + +typedef struct +{ + union + { + drwav_uint8 fourcc[4]; + drwav_uint8 guid[16]; + } id; + + /* The size in bytes of the chunk. */ + drwav_uint64 sizeInBytes; + + /* + RIFF = 2 byte alignment. + W64 = 8 byte alignment. + */ + unsigned int paddingSize; +} drwav_chunk_header; + +typedef struct +{ + /* + The format tag exactly as specified in the wave file's "fmt" chunk. This can be used by applications + that require support for data formats not natively supported by dr_wav. + */ + drwav_uint16 formatTag; + + /* The number of channels making up the audio data. When this is set to 1 it is mono, 2 is stereo, etc. */ + drwav_uint16 channels; + + /* The sample rate. Usually set to something like 44100. */ + drwav_uint32 sampleRate; + + /* Average bytes per second. You probably don't need this, but it's left here for informational purposes. */ + drwav_uint32 avgBytesPerSec; + + /* Block align. This is equal to the number of channels * bytes per sample. */ + drwav_uint16 blockAlign; + + /* Bits per sample. 
*/ + drwav_uint16 bitsPerSample; + + /* The size of the extended data. Only used internally for validation, but left here for informational purposes. */ + drwav_uint16 extendedSize; + + /* + The number of valid bits per sample. When is equal to WAVE_FORMAT_EXTENSIBLE, + is always rounded up to the nearest multiple of 8. This variable contains information about exactly how + many bits are valid per sample. Mainly used for informational purposes. + */ + drwav_uint16 validBitsPerSample; + + /* The channel mask. Not used at the moment. */ + drwav_uint32 channelMask; + + /* The sub-format, exactly as specified by the wave file. */ + drwav_uint8 subFormat[16]; +} drwav_fmt; + +DRWAV_API drwav_uint16 drwav_fmt_get_format(const drwav_fmt* pFMT); + + +/* +Callback for when data is read. Return value is the number of bytes actually read. + +pUserData [in] The user data that was passed to drwav_init() and family. +pBufferOut [out] The output buffer. +bytesToRead [in] The number of bytes to read. + +Returns the number of bytes actually read. + +A return value of less than bytesToRead indicates the end of the stream. Do _not_ return from this callback until +either the entire bytesToRead is filled or you have reached the end of the stream. +*/ +typedef size_t (* drwav_read_proc)(void* pUserData, void* pBufferOut, size_t bytesToRead); + +/* +Callback for when data is written. Returns value is the number of bytes actually written. + +pUserData [in] The user data that was passed to drwav_init_write() and family. +pData [out] A pointer to the data to write. +bytesToWrite [in] The number of bytes to write. + +Returns the number of bytes actually written. + +If the return value differs from bytesToWrite, it indicates an error. +*/ +typedef size_t (* drwav_write_proc)(void* pUserData, const void* pData, size_t bytesToWrite); + +/* +Callback for when data needs to be seeked. + +pUserData [in] The user data that was passed to drwav_init() and family. +offset [in] The number of bytes to move, relative to the origin. Will never be negative. +origin [in] The origin of the seek - the current position or the start of the stream. + +Returns whether or not the seek was successful. + +Whether or not it is relative to the beginning or current position is determined by the "origin" parameter which will be either drwav_seek_origin_start or +drwav_seek_origin_current. +*/ +typedef drwav_bool32 (* drwav_seek_proc)(void* pUserData, int offset, drwav_seek_origin origin); + +/* +Callback for when drwav_init_ex() finds a chunk. + +pChunkUserData [in] The user data that was passed to the pChunkUserData parameter of drwav_init_ex() and family. +onRead [in] A pointer to the function to call when reading. +onSeek [in] A pointer to the function to call when seeking. +pReadSeekUserData [in] The user data that was passed to the pReadSeekUserData parameter of drwav_init_ex() and family. +pChunkHeader [in] A pointer to an object containing basic header information about the chunk. Use this to identify the chunk. +container [in] Whether or not the WAV file is a RIFF or Wave64 container. If you're unsure of the difference, assume RIFF. +pFMT [in] A pointer to the object containing the contents of the "fmt" chunk. + +Returns the number of bytes read + seeked. + +To read data from the chunk, call onRead(), passing in pReadSeekUserData as the first parameter. Do the same for seeking with onSeek(). The return value must +be the total number of bytes you have read _plus_ seeked. 
+ +Use the `container` argument to discriminate the fields in `pChunkHeader->id`. If the container is `drwav_container_riff` or `drwav_container_rf64` you should +use `id.fourcc`, otherwise you should use `id.guid`. + +The `pFMT` parameter can be used to determine the data format of the wave file. Use `drwav_fmt_get_format()` to get the sample format, which will be one of the +`DR_WAVE_FORMAT_*` identifiers. + +The read pointer will be sitting on the first byte after the chunk's header. You must not attempt to read beyond the boundary of the chunk. +*/ +typedef drwav_uint64 (* drwav_chunk_proc)(void* pChunkUserData, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pReadSeekUserData, const drwav_chunk_header* pChunkHeader, drwav_container container, const drwav_fmt* pFMT); + +typedef struct +{ + void* pUserData; + void* (* onMalloc)(size_t sz, void* pUserData); + void* (* onRealloc)(void* p, size_t sz, void* pUserData); + void (* onFree)(void* p, void* pUserData); +} drwav_allocation_callbacks; + +/* Structure for internal use. Only used for loaders opened with drwav_init_memory(). */ +typedef struct +{ + const drwav_uint8* data; + size_t dataSize; + size_t currentReadPos; +} drwav__memory_stream; + +/* Structure for internal use. Only used for writers opened with drwav_init_memory_write(). */ +typedef struct +{ + void** ppData; + size_t* pDataSize; + size_t dataSize; + size_t dataCapacity; + size_t currentWritePos; +} drwav__memory_stream_write; + +typedef struct +{ + drwav_container container; /* RIFF, W64. */ + drwav_uint32 format; /* DR_WAVE_FORMAT_* */ + drwav_uint32 channels; + drwav_uint32 sampleRate; + drwav_uint32 bitsPerSample; +} drwav_data_format; + + +/* See the following for details on the 'smpl' chunk: https://sites.google.com/site/musicgapi/technical-documents/wav-file-format#smpl */ +typedef struct +{ + drwav_uint32 cuePointId; + drwav_uint32 type; + drwav_uint32 start; + drwav_uint32 end; + drwav_uint32 fraction; + drwav_uint32 playCount; +} drwav_smpl_loop; + + typedef struct +{ + drwav_uint32 manufacturer; + drwav_uint32 product; + drwav_uint32 samplePeriod; + drwav_uint32 midiUnityNotes; + drwav_uint32 midiPitchFraction; + drwav_uint32 smpteFormat; + drwav_uint32 smpteOffset; + drwav_uint32 numSampleLoops; + drwav_uint32 samplerData; + drwav_smpl_loop loops[DRWAV_MAX_SMPL_LOOPS]; +} drwav_smpl; + +typedef struct +{ + /* A pointer to the function to call when more data is needed. */ + drwav_read_proc onRead; + + /* A pointer to the function to call when data needs to be written. Only used when the drwav object is opened in write mode. */ + drwav_write_proc onWrite; + + /* A pointer to the function to call when the wav file needs to be seeked. */ + drwav_seek_proc onSeek; + + /* The user data to pass to callbacks. */ + void* pUserData; + + /* Allocation callbacks. */ + drwav_allocation_callbacks allocationCallbacks; + + + /* Whether or not the WAV file is formatted as a standard RIFF file or W64. */ + drwav_container container; + + + /* Structure containing format information exactly as specified by the wav file. */ + drwav_fmt fmt; + + /* The sample rate. Will be set to something like 44100. */ + drwav_uint32 sampleRate; + + /* The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. */ + drwav_uint16 channels; + + /* The bits per sample. Will be set to something like 16, 24, etc. 
*/ + drwav_uint16 bitsPerSample; + + /* Equal to fmt.formatTag, or the value specified by fmt.subFormat if fmt.formatTag is equal to 65534 (WAVE_FORMAT_EXTENSIBLE). */ + drwav_uint16 translatedFormatTag; + + /* The total number of PCM frames making up the audio data. */ + drwav_uint64 totalPCMFrameCount; + + + /* The size in bytes of the data chunk. */ + drwav_uint64 dataChunkDataSize; + + /* The position in the stream of the first byte of the data chunk. This is used for seeking. */ + drwav_uint64 dataChunkDataPos; + + /* The number of bytes remaining in the data chunk. */ + drwav_uint64 bytesRemaining; + + + /* + Only used in sequential write mode. Keeps track of the desired size of the "data" chunk at the point of initialization time. Always + set to 0 for non-sequential writes and when the drwav object is opened in read mode. Used for validation. + */ + drwav_uint64 dataChunkDataSizeTargetWrite; + + /* Keeps track of whether or not the wav writer was initialized in sequential mode. */ + drwav_bool32 isSequentialWrite; + + + /* smpl chunk. */ + drwav_smpl smpl; + + + /* A hack to avoid a DRWAV_MALLOC() when opening a decoder with drwav_init_memory(). */ + drwav__memory_stream memoryStream; + drwav__memory_stream_write memoryStreamWrite; + + /* Generic data for compressed formats. This data is shared across all block-compressed formats. */ + struct + { + drwav_uint64 iCurrentPCMFrame; /* The index of the next PCM frame that will be read by drwav_read_*(). This is used with "totalPCMFrameCount" to ensure we don't read excess samples at the end of the last block. */ + } compressed; + + /* Microsoft ADPCM specific data. */ + struct + { + drwav_uint32 bytesRemainingInBlock; + drwav_uint16 predictor[2]; + drwav_int32 delta[2]; + drwav_int32 cachedFrames[4]; /* Samples are stored in this cache during decoding. */ + drwav_uint32 cachedFrameCount; + drwav_int32 prevFrames[2][2]; /* The previous 2 samples for each channel (2 channels at most). */ + } msadpcm; + + /* IMA ADPCM specific data. */ + struct + { + drwav_uint32 bytesRemainingInBlock; + drwav_int32 predictor[2]; + drwav_int32 stepIndex[2]; + drwav_int32 cachedFrames[16]; /* Samples are stored in this cache during decoding. */ + drwav_uint32 cachedFrameCount; + } ima; +} drwav; + + +/* +Initializes a pre-allocated drwav object for reading. + +pWav [out] A pointer to the drwav object being initialized. +onRead [in] The function to call when data needs to be read from the client. +onSeek [in] The function to call when the read position of the client data needs to move. +onChunk [in, optional] The function to call when a chunk is enumerated at initialized time. +pUserData, pReadSeekUserData [in, optional] A pointer to application defined data that will be passed to onRead and onSeek. +pChunkUserData [in, optional] A pointer to application defined data that will be passed to onChunk. +flags [in, optional] A set of flags for controlling how things are loaded. + +Returns true if successful; false otherwise. + +Close the loader with drwav_uninit(). + +This is the lowest level function for initializing a WAV file. You can also use drwav_init_file() and drwav_init_memory() +to open the stream from a file or from a block of memory respectively. + +Possible values for flags: + DRWAV_SEQUENTIAL: Never perform a backwards seek while loading. This disables the chunk callback and will cause this function + to return as soon as the data chunk is found. Any chunks after the data chunk will be ignored. 
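To make the callback-based initialization concrete, here is a minimal, hedged sketch that backs onRead/onSeek with stdio and logs each chunk from an onChunk callback. The helper names (my_on_read, my_on_seek, my_on_chunk) and the file name "input.wav" are purely illustrative; <stdio.h> and dr_wav.h are assumed to be included.

static size_t my_on_read(void* pUserData, void* pBufferOut, size_t bytesToRead)
{
    return fread(pBufferOut, 1, bytesToRead, (FILE*)pUserData);
}

static drwav_bool32 my_on_seek(void* pUserData, int offset, drwav_seek_origin origin)
{
    int whence = (origin == drwav_seek_origin_start) ? SEEK_SET : SEEK_CUR;
    return fseek((FILE*)pUserData, offset, whence) == 0 ? DRWAV_TRUE : DRWAV_FALSE;
}

static drwav_uint64 my_on_chunk(void* pChunkUserData, drwav_read_proc onRead, drwav_seek_proc onSeek,
                                void* pReadSeekUserData, const drwav_chunk_header* pChunkHeader,
                                drwav_container container, const drwav_fmt* pFMT)
{
    (void)pChunkUserData; (void)onRead; (void)onSeek; (void)pReadSeekUserData; (void)pFMT;
    if (container == drwav_container_riff || container == drwav_container_rf64) {
        printf("chunk %.4s, %llu bytes\n", (const char*)pChunkHeader->id.fourcc, (unsigned long long)pChunkHeader->sizeInBytes);
    }
    return 0;   // nothing was read or seeked inside the chunk
}

void example_init_ex(void)
{
    FILE* pFile = fopen("input.wav", "rb");    // hypothetical input file
    drwav wav;
    if (pFile != NULL && drwav_init_ex(&wav, my_on_read, my_on_seek, my_on_chunk, pFile, NULL, 0, NULL)) {
        // ... read PCM frames here ...
        drwav_uninit(&wav);
    }
    if (pFile != NULL) {
        fclose(pFile);
    }
}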
+ +drwav_init() is equivalent to "drwav_init_ex(pWav, onRead, onSeek, NULL, pUserData, NULL, 0);". + +The onChunk callback is not called for the WAVE or FMT chunks. The contents of the FMT chunk can be read from pWav->fmt +after the function returns. + +See also: drwav_init_file(), drwav_init_memory(), drwav_uninit() +*/ +DRWAV_API drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_ex(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks); + +/* +Initializes a pre-allocated drwav object for writing. + +onWrite [in] The function to call when data needs to be written. +onSeek [in] The function to call when the write position needs to move. +pUserData [in, optional] A pointer to application defined data that will be passed to onWrite and onSeek. + +Returns true if successful; false otherwise. + +Close the writer with drwav_uninit(). + +This is the lowest level function for initializing a WAV file. You can also use drwav_init_file_write() and drwav_init_memory_write() +to open the stream from a file or from a block of memory respectively. + +If the total sample count is known, you can use drwav_init_write_sequential(). This avoids the need for dr_wav to perform +a post-processing step for storing the total sample count and the size of the data chunk which requires a backwards seek. + +See also: drwav_init_file_write(), drwav_init_memory_write(), drwav_uninit() +*/ +DRWAV_API drwav_bool32 drwav_init_write(drwav* pWav, const drwav_data_format* pFormat, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_write_sequential(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_write_sequential_pcm_frames(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks); + +/* +Utility function to determine the target size of the entire data to be written (including all headers and chunks). + +Returns the target size in bytes. + +Useful if the application needs to know the size to allocate. + +Only writing to the RIFF chunk and one data chunk is currently supported. + +See also: drwav_init_write(), drwav_init_file_write(), drwav_init_memory_write() +*/ +DRWAV_API drwav_uint64 drwav_target_write_size_bytes(const drwav_data_format* pFormat, drwav_uint64 totalSampleCount); + +/* +Uninitializes the given drwav object. + +Use this only for objects initialized with drwav_init*() functions (drwav_init(), drwav_init_ex(), drwav_init_write(), drwav_init_write_sequential()). +*/ +DRWAV_API drwav_result drwav_uninit(drwav* pWav); + + +/* +Reads raw audio data. + +This is the lowest level function for reading audio data. It simply reads the given number of +bytes of the raw internal sample data. + +Consider using drwav_read_pcm_frames_s16(), drwav_read_pcm_frames_s32() or drwav_read_pcm_frames_f32() for +reading sample data in a consistent format. + +pBufferOut can be NULL in which case a seek will be performed. 
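Stepping back to the write-side functions declared above, the following is a rough sketch of the sequential write path. The onWrite helper, the output file name, and the all-zero audio are illustrative placeholders; drwav_write_pcm_frames() is documented a little further down.

static size_t my_on_write(void* pUserData, const void* pData, size_t bytesToWrite)
{
    return fwrite(pData, 1, bytesToWrite, (FILE*)pUserData);
}

void example_write_sequential(void)
{
    drwav_data_format format;
    format.container     = drwav_container_riff;
    format.format        = DR_WAVE_FORMAT_PCM;
    format.channels      = 1;
    format.sampleRate    = 16000;
    format.bitsPerSample = 16;

    drwav_uint64 totalFrames  = 16000;   // one second of (silent) mono audio
    drwav_uint64 expectedSize = drwav_target_write_size_bytes(&format, totalFrames * format.channels);  // takes the total sample count
    (void)expectedSize;                  // could be used to pre-allocate the destination

    FILE* pFile = fopen("out.wav", "wb");   // hypothetical output file
    drwav wav;
    if (pFile != NULL && drwav_init_write_sequential_pcm_frames(&wav, &format, totalFrames, my_on_write, pFile, NULL)) {
        drwav_int16 block[512] = {0};
        drwav_uint64 framesRemaining = totalFrames;
        while (framesRemaining > 0) {
            drwav_uint64 framesToWrite = (framesRemaining < 512) ? framesRemaining : 512;
            if (drwav_write_pcm_frames(&wav, framesToWrite, block) != framesToWrite) {
                break;  // write error
            }
            framesRemaining -= framesToWrite;
        }
        drwav_uninit(&wav);
    }
    if (pFile != NULL) {
        fclose(pFile);
    }
}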
+ +Returns the number of bytes actually read. +*/ +DRWAV_API size_t drwav_read_raw(drwav* pWav, size_t bytesToRead, void* pBufferOut); + +/* +Reads up to the specified number of PCM frames from the WAV file. + +The output data will be in the file's internal format, converted to native-endian byte order. Use +drwav_read_pcm_frames_s16/f32/s32() to read data in a specific format. + +If the return value is less than it means the end of the file has been reached or +you have requested more PCM frames than can possibly fit in the output buffer. + +This function will only work when sample data is of a fixed size and uncompressed. If you are +using a compressed format consider using drwav_read_raw() or drwav_read_pcm_frames_s16/s32/f32(). + +pBufferOut can be NULL in which case a seek will be performed. +*/ +DRWAV_API drwav_uint64 drwav_read_pcm_frames(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut); +DRWAV_API drwav_uint64 drwav_read_pcm_frames_le(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut); +DRWAV_API drwav_uint64 drwav_read_pcm_frames_be(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut); + +/* +Seeks to the given PCM frame. + +Returns true if successful; false otherwise. +*/ +DRWAV_API drwav_bool32 drwav_seek_to_pcm_frame(drwav* pWav, drwav_uint64 targetFrameIndex); + + +/* +Writes raw audio data. + +Returns the number of bytes actually written. If this differs from bytesToWrite, it indicates an error. +*/ +DRWAV_API size_t drwav_write_raw(drwav* pWav, size_t bytesToWrite, const void* pData); + +/* +Writes PCM frames. + +Returns the number of PCM frames written. + +Input samples need to be in native-endian byte order. On big-endian architectures the input data will be converted to +little-endian. Use drwav_write_raw() to write raw audio data without performing any conversion. +*/ +DRWAV_API drwav_uint64 drwav_write_pcm_frames(drwav* pWav, drwav_uint64 framesToWrite, const void* pData); +DRWAV_API drwav_uint64 drwav_write_pcm_frames_le(drwav* pWav, drwav_uint64 framesToWrite, const void* pData); +DRWAV_API drwav_uint64 drwav_write_pcm_frames_be(drwav* pWav, drwav_uint64 framesToWrite, const void* pData); + + +/* Conversion Utilities */ +#ifndef DR_WAV_NO_CONVERSION_API + +/* +Reads a chunk of audio data and converts it to signed 16-bit PCM samples. + +pBufferOut can be NULL in which case a seek will be performed. + +Returns the number of PCM frames actually read. + +If the return value is less than it means the end of the file has been reached. +*/ +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut); +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16le(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut); +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16be(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut); + +/* Low-level function for converting unsigned 8-bit PCM samples to signed 16-bit PCM samples. */ +DRWAV_API void drwav_u8_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting signed 24-bit PCM samples to signed 16-bit PCM samples. */ +DRWAV_API void drwav_s24_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting signed 32-bit PCM samples to signed 16-bit PCM samples. 
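Tying the read and seek functions above together, here is a minimal sketch of a streaming read loop over an already-initialized decoder `wav`. The 1024-frame chunk size is arbitrary, and <stdlib.h> is assumed for malloc/free.

drwav_uint64 framesPerChunk = 1024;
drwav_int16* pChunk = (drwav_int16*)malloc((size_t)(framesPerChunk * wav.channels) * sizeof(drwav_int16));
if (pChunk != NULL) {
    drwav_seek_to_pcm_frame(&wav, 0);   // start from the first PCM frame
    for (;;) {
        drwav_uint64 framesRead = drwav_read_pcm_frames_s16(&wav, framesPerChunk, pChunk);
        if (framesRead == 0) {
            break;  // reached the end of the data chunk
        }
        // process framesRead * wav.channels interleaved int16 samples here
    }
    free(pChunk);
}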
*/ +DRWAV_API void drwav_s32_to_s16(drwav_int16* pOut, const drwav_int32* pIn, size_t sampleCount); + +/* Low-level function for converting IEEE 32-bit floating point samples to signed 16-bit PCM samples. */ +DRWAV_API void drwav_f32_to_s16(drwav_int16* pOut, const float* pIn, size_t sampleCount); + +/* Low-level function for converting IEEE 64-bit floating point samples to signed 16-bit PCM samples. */ +DRWAV_API void drwav_f64_to_s16(drwav_int16* pOut, const double* pIn, size_t sampleCount); + +/* Low-level function for converting A-law samples to signed 16-bit PCM samples. */ +DRWAV_API void drwav_alaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting u-law samples to signed 16-bit PCM samples. */ +DRWAV_API void drwav_mulaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount); + + +/* +Reads a chunk of audio data and converts it to IEEE 32-bit floating point samples. + +pBufferOut can be NULL in which case a seek will be performed. + +Returns the number of PCM frames actually read. + +If the return value is less than it means the end of the file has been reached. +*/ +DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut); +DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32le(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut); +DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32be(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut); + +/* Low-level function for converting unsigned 8-bit PCM samples to IEEE 32-bit floating point samples. */ +DRWAV_API void drwav_u8_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting signed 16-bit PCM samples to IEEE 32-bit floating point samples. */ +DRWAV_API void drwav_s16_to_f32(float* pOut, const drwav_int16* pIn, size_t sampleCount); + +/* Low-level function for converting signed 24-bit PCM samples to IEEE 32-bit floating point samples. */ +DRWAV_API void drwav_s24_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting signed 32-bit PCM samples to IEEE 32-bit floating point samples. */ +DRWAV_API void drwav_s32_to_f32(float* pOut, const drwav_int32* pIn, size_t sampleCount); + +/* Low-level function for converting IEEE 64-bit floating point samples to IEEE 32-bit floating point samples. */ +DRWAV_API void drwav_f64_to_f32(float* pOut, const double* pIn, size_t sampleCount); + +/* Low-level function for converting A-law samples to IEEE 32-bit floating point samples. */ +DRWAV_API void drwav_alaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting u-law samples to IEEE 32-bit floating point samples. */ +DRWAV_API void drwav_mulaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount); + + +/* +Reads a chunk of audio data and converts it to signed 32-bit PCM samples. + +pBufferOut can be NULL in which case a seek will be performed. + +Returns the number of PCM frames actually read. + +If the return value is less than it means the end of the file has been reached. 
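The f32 and s32 conversion readers follow the same pattern as the s16 reader; as an example, this hedged sketch scans an already-initialized decoder `wav` for its peak amplitude using the f32 path (converted samples are nominally in the range [-1, 1] for integer PCM sources).

float buffer[4096];
float peak = 0.0f;
drwav_seek_to_pcm_frame(&wav, 0);
for (;;) {
    drwav_uint64 framesRead = drwav_read_pcm_frames_f32(&wav, 4096 / wav.channels, buffer);
    drwav_uint64 i;
    if (framesRead == 0) {
        break;
    }
    for (i = 0; i < framesRead * wav.channels; i += 1) {
        float a = (buffer[i] < 0) ? -buffer[i] : buffer[i];
        if (a > peak) {
            peak = a;
        }
    }
}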
+*/ +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut); +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32le(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut); +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32be(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut); + +/* Low-level function for converting unsigned 8-bit PCM samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_u8_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting signed 16-bit PCM samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_s16_to_s32(drwav_int32* pOut, const drwav_int16* pIn, size_t sampleCount); + +/* Low-level function for converting signed 24-bit PCM samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_s24_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting IEEE 32-bit floating point samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_f32_to_s32(drwav_int32* pOut, const float* pIn, size_t sampleCount); + +/* Low-level function for converting IEEE 64-bit floating point samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_f64_to_s32(drwav_int32* pOut, const double* pIn, size_t sampleCount); + +/* Low-level function for converting A-law samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_alaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount); + +/* Low-level function for converting u-law samples to signed 32-bit PCM samples. */ +DRWAV_API void drwav_mulaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount); + +#endif /* DR_WAV_NO_CONVERSION_API */ + + +/* High-Level Convenience Helpers */ + +#ifndef DR_WAV_NO_STDIO +/* +Helper for initializing a wave file for reading using stdio. + +This holds the internal FILE object until drwav_uninit() is called. Keep this in mind if you're caching drwav +objects because the operating system may restrict the number of file handles an application can have open at +any given time. +*/ +DRWAV_API drwav_bool32 drwav_init_file(drwav* pWav, const char* filename, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_ex(drwav* pWav, const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_w(drwav* pWav, const wchar_t* filename, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_ex_w(drwav* pWav, const wchar_t* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks); + +/* +Helper for initializing a wave file for writing using stdio. + +This holds the internal FILE object until drwav_uninit() is called. Keep this in mind if you're caching drwav +objects because the operating system may restrict the number of file handles an application can have open at +any given time. 
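Returning to the read-side stdio helper declared above, a short sketch of opening a file and inspecting its format ("recording.wav" is a hypothetical path; <stdio.h> is assumed):

drwav wav;
if (drwav_init_file(&wav, "recording.wav", NULL)) {
    printf("channels:    %u\n", wav.channels);
    printf("sample rate: %u Hz\n", wav.sampleRate);
    printf("bits/sample: %u\n", wav.bitsPerSample);
    printf("PCM frames:  %llu\n", (unsigned long long)wav.totalPCMFrameCount);
    drwav_uninit(&wav);   // also closes the internal FILE object
}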
+*/ +DRWAV_API drwav_bool32 drwav_init_file_write(drwav* pWav, const char* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_write_sequential(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_write_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_write_sequential_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks); +#endif /* DR_WAV_NO_STDIO */ + +/* +Helper for initializing a loader from a pre-allocated memory buffer. + +This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for +the lifetime of the drwav object. + +The buffer should contain the contents of the entire wave file, not just the sample data. +*/ +DRWAV_API drwav_bool32 drwav_init_memory(drwav* pWav, const void* data, size_t dataSize, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_memory_ex(drwav* pWav, const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks); + +/* +Helper for initializing a writer which outputs data to a memory buffer. + +dr_wav will manage the memory allocations, however it is up to the caller to free the data with drwav_free(). + +The buffer will remain allocated even after drwav_uninit() is called. The buffer should not be considered valid +until after drwav_uninit() has been called. +*/ +DRWAV_API drwav_bool32 drwav_init_memory_write(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_memory_write_sequential(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_bool32 drwav_init_memory_write_sequential_pcm_frames(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks); + + +#ifndef DR_WAV_NO_CONVERSION_API +/* +Opens and reads an entire wav file in a single operation. + +The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer. 
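Before the single-call helpers, note the memory writer declared above; the following is a rough sketch of producing a complete WAV file entirely in memory. The 8 kHz mono format and the all-zero samples are arbitrary placeholders.

void* pWavData = NULL;
size_t wavDataSize = 0;

drwav_data_format format;
format.container     = drwav_container_riff;
format.format        = DR_WAVE_FORMAT_PCM;
format.channels      = 1;
format.sampleRate    = 8000;
format.bitsPerSample = 16;

drwav wav;
if (drwav_init_memory_write(&wav, &pWavData, &wavDataSize, &format, NULL)) {
    drwav_int16 samples[256] = {0};                 // placeholder audio
    drwav_write_pcm_frames(&wav, 256, samples);     // 256 mono frames
    drwav_uninit(&wav);                             // pWavData/wavDataSize are only valid after this point
    // ... use the in-memory WAV (pWavData, wavDataSize) here ...
    drwav_free(pWavData, NULL);
}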
+*/ +DRWAV_API drwav_int16* drwav_open_and_read_pcm_frames_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API float* drwav_open_and_read_pcm_frames_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_int32* drwav_open_and_read_pcm_frames_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +#ifndef DR_WAV_NO_STDIO +/* +Opens and decodes an entire wav file in a single operation. + +The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer. +*/ +DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +#endif +/* +Opens and decodes an entire wav file from a block of memory in a single operation. + +The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer. +*/ +DRWAV_API drwav_int16* drwav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API float* drwav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +DRWAV_API drwav_int32* drwav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks); +#endif + +/* Frees data that was allocated internally by dr_wav. 
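To illustrate the single-call decoders above, a minimal sketch using the stdio variant and releasing the buffer with drwav_free() (the path is hypothetical):

unsigned int channels;
unsigned int sampleRate;
drwav_uint64 totalPCMFrameCount;
drwav_int16* pSampleData = drwav_open_file_and_read_pcm_frames_s16(
    "recording.wav", &channels, &sampleRate, &totalPCMFrameCount, NULL);
if (pSampleData != NULL) {
    // pSampleData holds totalPCMFrameCount * channels interleaved 16-bit samples.
    drwav_free(pSampleData, NULL);
}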
*/ +DRWAV_API void drwav_free(void* p, const drwav_allocation_callbacks* pAllocationCallbacks); + +/* Converts bytes from a wav stream to a sized type of native endian. */ +DRWAV_API drwav_uint16 drwav_bytes_to_u16(const drwav_uint8* data); +DRWAV_API drwav_int16 drwav_bytes_to_s16(const drwav_uint8* data); +DRWAV_API drwav_uint32 drwav_bytes_to_u32(const drwav_uint8* data); +DRWAV_API drwav_int32 drwav_bytes_to_s32(const drwav_uint8* data); +DRWAV_API drwav_uint64 drwav_bytes_to_u64(const drwav_uint8* data); +DRWAV_API drwav_int64 drwav_bytes_to_s64(const drwav_uint8* data); + +/* Compares a GUID for the purpose of checking the type of a Wave64 chunk. */ +DRWAV_API drwav_bool32 drwav_guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16]); + +/* Compares a four-character-code for the purpose of checking the type of a RIFF chunk. */ +DRWAV_API drwav_bool32 drwav_fourcc_equal(const drwav_uint8* a, const char* b); + +#ifdef __cplusplus +} +#endif +#endif /* dr_wav_h */ + + +/************************************************************************************************************************************************************ + ************************************************************************************************************************************************************ + + IMPLEMENTATION + + ************************************************************************************************************************************************************ + ************************************************************************************************************************************************************/ +#if defined(DR_WAV_IMPLEMENTATION) || defined(DRWAV_IMPLEMENTATION) +#ifndef dr_wav_c +#define dr_wav_c + +#include +#include /* For memcpy(), memset() */ +#include /* For INT_MAX */ + +#ifndef DR_WAV_NO_STDIO +#include +#include +#endif + +/* Standard library stuff. */ +#ifndef DRWAV_ASSERT +#include +#define DRWAV_ASSERT(expression) assert(expression) +#endif +#ifndef DRWAV_MALLOC +#define DRWAV_MALLOC(sz) malloc((sz)) +#endif +#ifndef DRWAV_REALLOC +#define DRWAV_REALLOC(p, sz) realloc((p), (sz)) +#endif +#ifndef DRWAV_FREE +#define DRWAV_FREE(p) free((p)) +#endif +#ifndef DRWAV_COPY_MEMORY +#define DRWAV_COPY_MEMORY(dst, src, sz) memcpy((dst), (src), (sz)) +#endif +#ifndef DRWAV_ZERO_MEMORY +#define DRWAV_ZERO_MEMORY(p, sz) memset((p), 0, (sz)) +#endif +#ifndef DRWAV_ZERO_OBJECT +#define DRWAV_ZERO_OBJECT(p) DRWAV_ZERO_MEMORY((p), sizeof(*p)) +#endif + +#define drwav_countof(x) (sizeof(x) / sizeof(x[0])) +#define drwav_align(x, a) ((((x) + (a) - 1) / (a)) * (a)) +#define drwav_min(a, b) (((a) < (b)) ? (a) : (b)) +#define drwav_max(a, b) (((a) > (b)) ? (a) : (b)) +#define drwav_clamp(x, lo, hi) (drwav_max((lo), drwav_min((hi), (x)))) + +#define DRWAV_MAX_SIMD_VECTOR_SIZE 64 /* 64 for AVX-512 in the future. */ + +/* CPU architecture. */ +#if defined(__x86_64__) || defined(_M_X64) + #define DRWAV_X64 +#elif defined(__i386) || defined(_M_IX86) + #define DRWAV_X86 +#elif defined(__arm__) || defined(_M_ARM) + #define DRWAV_ARM +#endif + +#ifdef _MSC_VER + #define DRWAV_INLINE __forceinline +#elif defined(__GNUC__) + /* + I've had a bug report where GCC is emitting warnings about functions possibly not being inlineable. This warning happens when + the __attribute__((always_inline)) attribute is defined without an "inline" statement. I think therefore there must be some + case where "__inline__" is not always defined, thus the compiler emitting these warnings. 
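Since the DRWAV_MALLOC/DRWAV_REALLOC/DRWAV_FREE macros above are only defaults, a compile-time sketch of routing dr_wav's internal allocations through a custom allocator might look like the following. The my_alloc/my_realloc/my_free functions are hypothetical; the #defines must appear before the implementation is compiled.

// In the single translation unit that compiles the dr_wav implementation:
#define DRWAV_MALLOC(sz)        my_alloc(sz)          // hypothetical allocator
#define DRWAV_REALLOC(p, sz)    my_realloc((p), (sz))
#define DRWAV_FREE(p)           my_free(p)
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"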
When using -std=c89 or -ansi on the + command line, we cannot use the "inline" keyword and instead need to use "__inline__". In an attempt to work around this issue + I am using "__inline__" only when we're compiling in strict ANSI mode. + */ + #if defined(__STRICT_ANSI__) + #define DRWAV_INLINE __inline__ __attribute__((always_inline)) + #else + #define DRWAV_INLINE inline __attribute__((always_inline)) + #endif +#elif defined(__WATCOMC__) + #define DRWAV_INLINE __inline +#else + #define DRWAV_INLINE +#endif + +#if defined(SIZE_MAX) + #define DRWAV_SIZE_MAX SIZE_MAX +#else + #if defined(_WIN64) || defined(_LP64) || defined(__LP64__) + #define DRWAV_SIZE_MAX ((drwav_uint64)0xFFFFFFFFFFFFFFFF) + #else + #define DRWAV_SIZE_MAX 0xFFFFFFFF + #endif +#endif + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + #define DRWAV_HAS_BYTESWAP16_INTRINSIC + #define DRWAV_HAS_BYTESWAP32_INTRINSIC + #define DRWAV_HAS_BYTESWAP64_INTRINSIC +#elif defined(__clang__) + #if defined(__has_builtin) + #if __has_builtin(__builtin_bswap16) + #define DRWAV_HAS_BYTESWAP16_INTRINSIC + #endif + #if __has_builtin(__builtin_bswap32) + #define DRWAV_HAS_BYTESWAP32_INTRINSIC + #endif + #if __has_builtin(__builtin_bswap64) + #define DRWAV_HAS_BYTESWAP64_INTRINSIC + #endif + #endif +#elif defined(__GNUC__) + #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define DRWAV_HAS_BYTESWAP32_INTRINSIC + #define DRWAV_HAS_BYTESWAP64_INTRINSIC + #endif + #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) + #define DRWAV_HAS_BYTESWAP16_INTRINSIC + #endif +#endif + +DRWAV_API void drwav_version(drwav_uint32* pMajor, drwav_uint32* pMinor, drwav_uint32* pRevision) +{ + if (pMajor) { + *pMajor = DRWAV_VERSION_MAJOR; + } + + if (pMinor) { + *pMinor = DRWAV_VERSION_MINOR; + } + + if (pRevision) { + *pRevision = DRWAV_VERSION_REVISION; + } +} + +DRWAV_API const char* drwav_version_string(void) +{ + return DRWAV_VERSION_STRING; +} + +/* +These limits are used for basic validation when initializing the decoder. If you exceed these limits, first of all: what on Earth are +you doing?! (Let me know, I'd be curious!) Second, you can adjust these by #define-ing them before the dr_wav implementation. 
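For instance, an application that legitimately needs unusual streams could adjust these validation limits in the implementation translation unit; the values below are arbitrary and only meant as a sketch.

#define DRWAV_MAX_SAMPLE_RATE   768000   // accept very high-rate files
#define DRWAV_MAX_CHANNELS      32       // reject streams claiming more than 32 channels
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"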
+*/ +#ifndef DRWAV_MAX_SAMPLE_RATE +#define DRWAV_MAX_SAMPLE_RATE 384000 +#endif +#ifndef DRWAV_MAX_CHANNELS +#define DRWAV_MAX_CHANNELS 256 +#endif +#ifndef DRWAV_MAX_BITS_PER_SAMPLE +#define DRWAV_MAX_BITS_PER_SAMPLE 64 +#endif + +static const drwav_uint8 drwavGUID_W64_RIFF[16] = {0x72,0x69,0x66,0x66, 0x2E,0x91, 0xCF,0x11, 0xA5,0xD6, 0x28,0xDB,0x04,0xC1,0x00,0x00}; /* 66666972-912E-11CF-A5D6-28DB04C10000 */ +static const drwav_uint8 drwavGUID_W64_WAVE[16] = {0x77,0x61,0x76,0x65, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A}; /* 65766177-ACF3-11D3-8CD1-00C04F8EDB8A */ +/*static const drwav_uint8 drwavGUID_W64_JUNK[16] = {0x6A,0x75,0x6E,0x6B, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};*/ /* 6B6E756A-ACF3-11D3-8CD1-00C04F8EDB8A */ +static const drwav_uint8 drwavGUID_W64_FMT [16] = {0x66,0x6D,0x74,0x20, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A}; /* 20746D66-ACF3-11D3-8CD1-00C04F8EDB8A */ +static const drwav_uint8 drwavGUID_W64_FACT[16] = {0x66,0x61,0x63,0x74, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A}; /* 74636166-ACF3-11D3-8CD1-00C04F8EDB8A */ +static const drwav_uint8 drwavGUID_W64_DATA[16] = {0x64,0x61,0x74,0x61, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A}; /* 61746164-ACF3-11D3-8CD1-00C04F8EDB8A */ +static const drwav_uint8 drwavGUID_W64_SMPL[16] = {0x73,0x6D,0x70,0x6C, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A}; /* 6C706D73-ACF3-11D3-8CD1-00C04F8EDB8A */ + +static DRWAV_INLINE drwav_bool32 drwav__guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16]) +{ + int i; + for (i = 0; i < 16; i += 1) { + if (a[i] != b[i]) { + return DRWAV_FALSE; + } + } + + return DRWAV_TRUE; +} + +static DRWAV_INLINE drwav_bool32 drwav__fourcc_equal(const drwav_uint8* a, const char* b) +{ + return + a[0] == b[0] && + a[1] == b[1] && + a[2] == b[2] && + a[3] == b[3]; +} + + + +static DRWAV_INLINE int drwav__is_little_endian(void) +{ +#if defined(DRWAV_X86) || defined(DRWAV_X64) + return DRWAV_TRUE; +#elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN + return DRWAV_TRUE; +#else + int n = 1; + return (*(char*)&n) == 1; +#endif +} + +static DRWAV_INLINE drwav_uint16 drwav__bytes_to_u16(const drwav_uint8* data) +{ + return (data[0] << 0) | (data[1] << 8); +} + +static DRWAV_INLINE drwav_int16 drwav__bytes_to_s16(const drwav_uint8* data) +{ + return (short)drwav__bytes_to_u16(data); +} + +static DRWAV_INLINE drwav_uint32 drwav__bytes_to_u32(const drwav_uint8* data) +{ + return (data[0] << 0) | (data[1] << 8) | (data[2] << 16) | (data[3] << 24); +} + +static DRWAV_INLINE drwav_int32 drwav__bytes_to_s32(const drwav_uint8* data) +{ + return (drwav_int32)drwav__bytes_to_u32(data); +} + +static DRWAV_INLINE drwav_uint64 drwav__bytes_to_u64(const drwav_uint8* data) +{ + return + ((drwav_uint64)data[0] << 0) | ((drwav_uint64)data[1] << 8) | ((drwav_uint64)data[2] << 16) | ((drwav_uint64)data[3] << 24) | + ((drwav_uint64)data[4] << 32) | ((drwav_uint64)data[5] << 40) | ((drwav_uint64)data[6] << 48) | ((drwav_uint64)data[7] << 56); +} + +static DRWAV_INLINE drwav_int64 drwav__bytes_to_s64(const drwav_uint8* data) +{ + return (drwav_int64)drwav__bytes_to_u64(data); +} + +static DRWAV_INLINE void drwav__bytes_to_guid(const drwav_uint8* data, drwav_uint8* guid) +{ + int i; + for (i = 0; i < 16; ++i) { + guid[i] = data[i]; + } +} + + +static DRWAV_INLINE drwav_uint16 drwav__bswap16(drwav_uint16 n) +{ +#ifdef DRWAV_HAS_BYTESWAP16_INTRINSIC + #if defined(_MSC_VER) + 
return _byteswap_ushort(n); + #elif defined(__GNUC__) || defined(__clang__) + return __builtin_bswap16(n); + #else + #error "This compiler does not support the byte swap intrinsic." + #endif +#else + return ((n & 0xFF00) >> 8) | + ((n & 0x00FF) << 8); +#endif +} + +static DRWAV_INLINE drwav_uint32 drwav__bswap32(drwav_uint32 n) +{ +#ifdef DRWAV_HAS_BYTESWAP32_INTRINSIC + #if defined(_MSC_VER) + return _byteswap_ulong(n); + #elif defined(__GNUC__) || defined(__clang__) + #if defined(DRWAV_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 6) && !defined(DRWAV_64BIT) /* <-- 64-bit inline assembly has not been tested, so disabling for now. */ + /* Inline assembly optimized implementation for ARM. In my testing, GCC does not generate optimized code with __builtin_bswap32(). */ + drwav_uint32 r; + __asm__ __volatile__ ( + #if defined(DRWAV_64BIT) + "rev %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(n) /* <-- This is untested. If someone in the community could test this, that would be appreciated! */ + #else + "rev %[out], %[in]" : [out]"=r"(r) : [in]"r"(n) + #endif + ); + return r; + #else + return __builtin_bswap32(n); + #endif + #else + #error "This compiler does not support the byte swap intrinsic." + #endif +#else + return ((n & 0xFF000000) >> 24) | + ((n & 0x00FF0000) >> 8) | + ((n & 0x0000FF00) << 8) | + ((n & 0x000000FF) << 24); +#endif +} + +static DRWAV_INLINE drwav_uint64 drwav__bswap64(drwav_uint64 n) +{ +#ifdef DRWAV_HAS_BYTESWAP64_INTRINSIC + #if defined(_MSC_VER) + return _byteswap_uint64(n); + #elif defined(__GNUC__) || defined(__clang__) + return __builtin_bswap64(n); + #else + #error "This compiler does not support the byte swap intrinsic." + #endif +#else + /* Weird "<< 32" bitshift is required for C89 because it doesn't support 64-bit constants. Should be optimized out by a good compiler. 
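As a worked illustration of the masking below (assuming the plain C fallback path), swapping n = 0x1122334455667788 proceeds byte by byte:

    ((drwav_uint64)0xFF000000 << 32)             == 0xFF00000000000000   // built without a 64-bit literal
    (n & ((drwav_uint64)0xFF000000 << 32)) >> 56 == 0x0000000000000011   // top byte moves to the bottom
    ... the remaining seven bytes are masked and shifted the same way ...
    OR-ing all eight shifted bytes together      == 0x8877665544332211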
*/ + return ((n & ((drwav_uint64)0xFF000000 << 32)) >> 56) | + ((n & ((drwav_uint64)0x00FF0000 << 32)) >> 40) | + ((n & ((drwav_uint64)0x0000FF00 << 32)) >> 24) | + ((n & ((drwav_uint64)0x000000FF << 32)) >> 8) | + ((n & ((drwav_uint64)0xFF000000 )) << 8) | + ((n & ((drwav_uint64)0x00FF0000 )) << 24) | + ((n & ((drwav_uint64)0x0000FF00 )) << 40) | + ((n & ((drwav_uint64)0x000000FF )) << 56); +#endif +} + + +static DRWAV_INLINE drwav_int16 drwav__bswap_s16(drwav_int16 n) +{ + return (drwav_int16)drwav__bswap16((drwav_uint16)n); +} + +static DRWAV_INLINE void drwav__bswap_samples_s16(drwav_int16* pSamples, drwav_uint64 sampleCount) +{ + drwav_uint64 iSample; + for (iSample = 0; iSample < sampleCount; iSample += 1) { + pSamples[iSample] = drwav__bswap_s16(pSamples[iSample]); + } +} + + +static DRWAV_INLINE void drwav__bswap_s24(drwav_uint8* p) +{ + drwav_uint8 t; + t = p[0]; + p[0] = p[2]; + p[2] = t; +} + +static DRWAV_INLINE void drwav__bswap_samples_s24(drwav_uint8* pSamples, drwav_uint64 sampleCount) +{ + drwav_uint64 iSample; + for (iSample = 0; iSample < sampleCount; iSample += 1) { + drwav_uint8* pSample = pSamples + (iSample*3); + drwav__bswap_s24(pSample); + } +} + + +static DRWAV_INLINE drwav_int32 drwav__bswap_s32(drwav_int32 n) +{ + return (drwav_int32)drwav__bswap32((drwav_uint32)n); +} + +static DRWAV_INLINE void drwav__bswap_samples_s32(drwav_int32* pSamples, drwav_uint64 sampleCount) +{ + drwav_uint64 iSample; + for (iSample = 0; iSample < sampleCount; iSample += 1) { + pSamples[iSample] = drwav__bswap_s32(pSamples[iSample]); + } +} + + +static DRWAV_INLINE float drwav__bswap_f32(float n) +{ + union { + drwav_uint32 i; + float f; + } x; + x.f = n; + x.i = drwav__bswap32(x.i); + + return x.f; +} + +static DRWAV_INLINE void drwav__bswap_samples_f32(float* pSamples, drwav_uint64 sampleCount) +{ + drwav_uint64 iSample; + for (iSample = 0; iSample < sampleCount; iSample += 1) { + pSamples[iSample] = drwav__bswap_f32(pSamples[iSample]); + } +} + + +static DRWAV_INLINE double drwav__bswap_f64(double n) +{ + union { + drwav_uint64 i; + double f; + } x; + x.f = n; + x.i = drwav__bswap64(x.i); + + return x.f; +} + +static DRWAV_INLINE void drwav__bswap_samples_f64(double* pSamples, drwav_uint64 sampleCount) +{ + drwav_uint64 iSample; + for (iSample = 0; iSample < sampleCount; iSample += 1) { + pSamples[iSample] = drwav__bswap_f64(pSamples[iSample]); + } +} + + +static DRWAV_INLINE void drwav__bswap_samples_pcm(void* pSamples, drwav_uint64 sampleCount, drwav_uint32 bytesPerSample) +{ + /* Assumes integer PCM. Floating point PCM is done in drwav__bswap_samples_ieee(). */ + switch (bytesPerSample) + { + case 2: /* s16, s12 (loosely packed) */ + { + drwav__bswap_samples_s16((drwav_int16*)pSamples, sampleCount); + } break; + case 3: /* s24 */ + { + drwav__bswap_samples_s24((drwav_uint8*)pSamples, sampleCount); + } break; + case 4: /* s32 */ + { + drwav__bswap_samples_s32((drwav_int32*)pSamples, sampleCount); + } break; + default: + { + /* Unsupported format. */ + DRWAV_ASSERT(DRWAV_FALSE); + } break; + } +} + +static DRWAV_INLINE void drwav__bswap_samples_ieee(void* pSamples, drwav_uint64 sampleCount, drwav_uint32 bytesPerSample) +{ + switch (bytesPerSample) + { + #if 0 /* Contributions welcome for f16 support. 
*/ + case 2: /* f16 */ + { + drwav__bswap_samples_f16((drwav_float16*)pSamples, sampleCount); + } break; + #endif + case 4: /* f32 */ + { + drwav__bswap_samples_f32((float*)pSamples, sampleCount); + } break; + case 8: /* f64 */ + { + drwav__bswap_samples_f64((double*)pSamples, sampleCount); + } break; + default: + { + /* Unsupported format. */ + DRWAV_ASSERT(DRWAV_FALSE); + } break; + } +} + +static DRWAV_INLINE void drwav__bswap_samples(void* pSamples, drwav_uint64 sampleCount, drwav_uint32 bytesPerSample, drwav_uint16 format) +{ + switch (format) + { + case DR_WAVE_FORMAT_PCM: + { + drwav__bswap_samples_pcm(pSamples, sampleCount, bytesPerSample); + } break; + + case DR_WAVE_FORMAT_IEEE_FLOAT: + { + drwav__bswap_samples_ieee(pSamples, sampleCount, bytesPerSample); + } break; + + case DR_WAVE_FORMAT_ALAW: + case DR_WAVE_FORMAT_MULAW: + { + drwav__bswap_samples_s16((drwav_int16*)pSamples, sampleCount); + } break; + + case DR_WAVE_FORMAT_ADPCM: + case DR_WAVE_FORMAT_DVI_ADPCM: + default: + { + /* Unsupported format. */ + DRWAV_ASSERT(DRWAV_FALSE); + } break; + } +} + + +static void* drwav__malloc_default(size_t sz, void* pUserData) +{ + (void)pUserData; + return DRWAV_MALLOC(sz); +} + +static void* drwav__realloc_default(void* p, size_t sz, void* pUserData) +{ + (void)pUserData; + return DRWAV_REALLOC(p, sz); +} + +static void drwav__free_default(void* p, void* pUserData) +{ + (void)pUserData; + DRWAV_FREE(p); +} + + +static void* drwav__malloc_from_callbacks(size_t sz, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pAllocationCallbacks == NULL) { + return NULL; + } + + if (pAllocationCallbacks->onMalloc != NULL) { + return pAllocationCallbacks->onMalloc(sz, pAllocationCallbacks->pUserData); + } + + /* Try using realloc(). */ + if (pAllocationCallbacks->onRealloc != NULL) { + return pAllocationCallbacks->onRealloc(NULL, sz, pAllocationCallbacks->pUserData); + } + + return NULL; +} + +static void* drwav__realloc_from_callbacks(void* p, size_t szNew, size_t szOld, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pAllocationCallbacks == NULL) { + return NULL; + } + + if (pAllocationCallbacks->onRealloc != NULL) { + return pAllocationCallbacks->onRealloc(p, szNew, pAllocationCallbacks->pUserData); + } + + /* Try emulating realloc() in terms of malloc()/free(). */ + if (pAllocationCallbacks->onMalloc != NULL && pAllocationCallbacks->onFree != NULL) { + void* p2; + + p2 = pAllocationCallbacks->onMalloc(szNew, pAllocationCallbacks->pUserData); + if (p2 == NULL) { + return NULL; + } + + if (p != NULL) { + DRWAV_COPY_MEMORY(p2, p, szOld); + pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData); + } + + return p2; + } + + return NULL; +} + +static void drwav__free_from_callbacks(void* p, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (p == NULL || pAllocationCallbacks == NULL) { + return; + } + + if (pAllocationCallbacks->onFree != NULL) { + pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData); + } +} + + +static drwav_allocation_callbacks drwav_copy_allocation_callbacks_or_defaults(const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pAllocationCallbacks != NULL) { + /* Copy. */ + return *pAllocationCallbacks; + } else { + /* Defaults. 
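To show how an application typically supplies its own callbacks rather than relying on these defaults, here is a hedged sketch of passing drwav_allocation_callbacks at initialization time. The my_* wrappers and the file name are illustrative, <stdlib.h> is assumed, and onRealloc may be left NULL, in which case dr_wav emulates it with onMalloc/onFree as shown above.

static void* my_malloc_cb(size_t sz, void* pUserData)           { (void)pUserData; return malloc(sz); }
static void* my_realloc_cb(void* p, size_t sz, void* pUserData) { (void)pUserData; return realloc(p, sz); }
static void  my_free_cb(void* p, void* pUserData)               { (void)pUserData; free(p); }

void example_custom_allocators(void)
{
    drwav_allocation_callbacks cb;
    cb.pUserData = NULL;            // passed back to every callback
    cb.onMalloc  = my_malloc_cb;
    cb.onRealloc = my_realloc_cb;
    cb.onFree    = my_free_cb;

    drwav wav;
    if (drwav_init_file(&wav, "recording.wav", &cb)) {   // hypothetical path
        // ...
        drwav_uninit(&wav);
    }
}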
*/ + drwav_allocation_callbacks allocationCallbacks; + allocationCallbacks.pUserData = NULL; + allocationCallbacks.onMalloc = drwav__malloc_default; + allocationCallbacks.onRealloc = drwav__realloc_default; + allocationCallbacks.onFree = drwav__free_default; + return allocationCallbacks; + } +} + + +static DRWAV_INLINE drwav_bool32 drwav__is_compressed_format_tag(drwav_uint16 formatTag) +{ + return + formatTag == DR_WAVE_FORMAT_ADPCM || + formatTag == DR_WAVE_FORMAT_DVI_ADPCM; +} + +static unsigned int drwav__chunk_padding_size_riff(drwav_uint64 chunkSize) +{ + return (unsigned int)(chunkSize % 2); +} + +static unsigned int drwav__chunk_padding_size_w64(drwav_uint64 chunkSize) +{ + return (unsigned int)(chunkSize % 8); +} + +static drwav_uint64 drwav_read_pcm_frames_s16__msadpcm(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut); +static drwav_uint64 drwav_read_pcm_frames_s16__ima(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut); +static drwav_bool32 drwav_init_write__internal(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount); + +static drwav_result drwav__read_chunk_header(drwav_read_proc onRead, void* pUserData, drwav_container container, drwav_uint64* pRunningBytesReadOut, drwav_chunk_header* pHeaderOut) +{ + if (container == drwav_container_riff || container == drwav_container_rf64) { + drwav_uint8 sizeInBytes[4]; + + if (onRead(pUserData, pHeaderOut->id.fourcc, 4) != 4) { + return DRWAV_AT_END; + } + + if (onRead(pUserData, sizeInBytes, 4) != 4) { + return DRWAV_INVALID_FILE; + } + + pHeaderOut->sizeInBytes = drwav__bytes_to_u32(sizeInBytes); + pHeaderOut->paddingSize = drwav__chunk_padding_size_riff(pHeaderOut->sizeInBytes); + *pRunningBytesReadOut += 8; + } else { + drwav_uint8 sizeInBytes[8]; + + if (onRead(pUserData, pHeaderOut->id.guid, 16) != 16) { + return DRWAV_AT_END; + } + + if (onRead(pUserData, sizeInBytes, 8) != 8) { + return DRWAV_INVALID_FILE; + } + + pHeaderOut->sizeInBytes = drwav__bytes_to_u64(sizeInBytes) - 24; /* <-- Subtract 24 because w64 includes the size of the header. */ + pHeaderOut->paddingSize = drwav__chunk_padding_size_w64(pHeaderOut->sizeInBytes); + *pRunningBytesReadOut += 24; + } + + return DRWAV_SUCCESS; +} + +static drwav_bool32 drwav__seek_forward(drwav_seek_proc onSeek, drwav_uint64 offset, void* pUserData) +{ + drwav_uint64 bytesRemainingToSeek = offset; + while (bytesRemainingToSeek > 0) { + if (bytesRemainingToSeek > 0x7FFFFFFF) { + if (!onSeek(pUserData, 0x7FFFFFFF, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + bytesRemainingToSeek -= 0x7FFFFFFF; + } else { + if (!onSeek(pUserData, (int)bytesRemainingToSeek, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + bytesRemainingToSeek = 0; + } + } + + return DRWAV_TRUE; +} + +static drwav_bool32 drwav__seek_from_start(drwav_seek_proc onSeek, drwav_uint64 offset, void* pUserData) +{ + if (offset <= 0x7FFFFFFF) { + return onSeek(pUserData, (int)offset, drwav_seek_origin_start); + } + + /* Larger than 32-bit seek. */ + if (!onSeek(pUserData, 0x7FFFFFFF, drwav_seek_origin_start)) { + return DRWAV_FALSE; + } + offset -= 0x7FFFFFFF; + + for (;;) { + if (offset <= 0x7FFFFFFF) { + return onSeek(pUserData, (int)offset, drwav_seek_origin_current); + } + + if (!onSeek(pUserData, 0x7FFFFFFF, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + offset -= 0x7FFFFFFF; + } + + /* Should never get here. 
*/ + /*return DRWAV_TRUE; */ +} + + +static drwav_bool32 drwav__read_fmt(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, drwav_container container, drwav_uint64* pRunningBytesReadOut, drwav_fmt* fmtOut) +{ + drwav_chunk_header header; + drwav_uint8 fmt[16]; + + if (drwav__read_chunk_header(onRead, pUserData, container, pRunningBytesReadOut, &header) != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + + /* Skip non-fmt chunks. */ + while (((container == drwav_container_riff || container == drwav_container_rf64) && !drwav__fourcc_equal(header.id.fourcc, "fmt ")) || (container == drwav_container_w64 && !drwav__guid_equal(header.id.guid, drwavGUID_W64_FMT))) { + if (!drwav__seek_forward(onSeek, header.sizeInBytes + header.paddingSize, pUserData)) { + return DRWAV_FALSE; + } + *pRunningBytesReadOut += header.sizeInBytes + header.paddingSize; + + /* Try the next header. */ + if (drwav__read_chunk_header(onRead, pUserData, container, pRunningBytesReadOut, &header) != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + } + + + /* Validation. */ + if (container == drwav_container_riff || container == drwav_container_rf64) { + if (!drwav__fourcc_equal(header.id.fourcc, "fmt ")) { + return DRWAV_FALSE; + } + } else { + if (!drwav__guid_equal(header.id.guid, drwavGUID_W64_FMT)) { + return DRWAV_FALSE; + } + } + + + if (onRead(pUserData, fmt, sizeof(fmt)) != sizeof(fmt)) { + return DRWAV_FALSE; + } + *pRunningBytesReadOut += sizeof(fmt); + + fmtOut->formatTag = drwav__bytes_to_u16(fmt + 0); + fmtOut->channels = drwav__bytes_to_u16(fmt + 2); + fmtOut->sampleRate = drwav__bytes_to_u32(fmt + 4); + fmtOut->avgBytesPerSec = drwav__bytes_to_u32(fmt + 8); + fmtOut->blockAlign = drwav__bytes_to_u16(fmt + 12); + fmtOut->bitsPerSample = drwav__bytes_to_u16(fmt + 14); + + fmtOut->extendedSize = 0; + fmtOut->validBitsPerSample = 0; + fmtOut->channelMask = 0; + memset(fmtOut->subFormat, 0, sizeof(fmtOut->subFormat)); + + if (header.sizeInBytes > 16) { + drwav_uint8 fmt_cbSize[2]; + int bytesReadSoFar = 0; + + if (onRead(pUserData, fmt_cbSize, sizeof(fmt_cbSize)) != sizeof(fmt_cbSize)) { + return DRWAV_FALSE; /* Expecting more data. */ + } + *pRunningBytesReadOut += sizeof(fmt_cbSize); + + bytesReadSoFar = 18; + + fmtOut->extendedSize = drwav__bytes_to_u16(fmt_cbSize); + if (fmtOut->extendedSize > 0) { + /* Simple validation. */ + if (fmtOut->formatTag == DR_WAVE_FORMAT_EXTENSIBLE) { + if (fmtOut->extendedSize != 22) { + return DRWAV_FALSE; + } + } + + if (fmtOut->formatTag == DR_WAVE_FORMAT_EXTENSIBLE) { + drwav_uint8 fmtext[22]; + if (onRead(pUserData, fmtext, fmtOut->extendedSize) != fmtOut->extendedSize) { + return DRWAV_FALSE; /* Expecting more data. */ + } + + fmtOut->validBitsPerSample = drwav__bytes_to_u16(fmtext + 0); + fmtOut->channelMask = drwav__bytes_to_u32(fmtext + 2); + drwav__bytes_to_guid(fmtext + 6, fmtOut->subFormat); + } else { + if (!onSeek(pUserData, fmtOut->extendedSize, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + } + *pRunningBytesReadOut += fmtOut->extendedSize; + + bytesReadSoFar += fmtOut->extendedSize; + } + + /* Seek past any leftover bytes. For w64 the leftover will be defined based on the chunk size. 
*/ + if (!onSeek(pUserData, (int)(header.sizeInBytes - bytesReadSoFar), drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + *pRunningBytesReadOut += (header.sizeInBytes - bytesReadSoFar); + } + + if (header.paddingSize > 0) { + if (!onSeek(pUserData, header.paddingSize, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + *pRunningBytesReadOut += header.paddingSize; + } + + return DRWAV_TRUE; +} + + +static size_t drwav__on_read(drwav_read_proc onRead, void* pUserData, void* pBufferOut, size_t bytesToRead, drwav_uint64* pCursor) +{ + size_t bytesRead; + + DRWAV_ASSERT(onRead != NULL); + DRWAV_ASSERT(pCursor != NULL); + + bytesRead = onRead(pUserData, pBufferOut, bytesToRead); + *pCursor += bytesRead; + return bytesRead; +} + +#if 0 +static drwav_bool32 drwav__on_seek(drwav_seek_proc onSeek, void* pUserData, int offset, drwav_seek_origin origin, drwav_uint64* pCursor) +{ + DRWAV_ASSERT(onSeek != NULL); + DRWAV_ASSERT(pCursor != NULL); + + if (!onSeek(pUserData, offset, origin)) { + return DRWAV_FALSE; + } + + if (origin == drwav_seek_origin_start) { + *pCursor = offset; + } else { + *pCursor += offset; + } + + return DRWAV_TRUE; +} +#endif + + + +static drwav_uint32 drwav_get_bytes_per_pcm_frame(drwav* pWav) +{ + /* + The bytes per frame is a bit ambiguous. It can be either be based on the bits per sample, or the block align. The way I'm doing it here + is that if the bits per sample is a multiple of 8, use floor(bitsPerSample*channels/8), otherwise fall back to the block align. + */ + if ((pWav->bitsPerSample & 0x7) == 0) { + /* Bits per sample is a multiple of 8. */ + return (pWav->bitsPerSample * pWav->fmt.channels) >> 3; + } else { + return pWav->fmt.blockAlign; + } +} + +DRWAV_API drwav_uint16 drwav_fmt_get_format(const drwav_fmt* pFMT) +{ + if (pFMT == NULL) { + return 0; + } + + if (pFMT->formatTag != DR_WAVE_FORMAT_EXTENSIBLE) { + return pFMT->formatTag; + } else { + return drwav__bytes_to_u16(pFMT->subFormat); /* Only the first two bytes are required. */ + } +} + +static drwav_bool32 drwav_preinit(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pReadSeekUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pWav == NULL || onRead == NULL || onSeek == NULL) { + return DRWAV_FALSE; + } + + DRWAV_ZERO_MEMORY(pWav, sizeof(*pWav)); + pWav->onRead = onRead; + pWav->onSeek = onSeek; + pWav->pUserData = pReadSeekUserData; + pWav->allocationCallbacks = drwav_copy_allocation_callbacks_or_defaults(pAllocationCallbacks); + + if (pWav->allocationCallbacks.onFree == NULL || (pWav->allocationCallbacks.onMalloc == NULL && pWav->allocationCallbacks.onRealloc == NULL)) { + return DRWAV_FALSE; /* Invalid allocation callbacks. */ + } + + return DRWAV_TRUE; +} + +static drwav_bool32 drwav_init__internal(drwav* pWav, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags) +{ + /* This function assumes drwav_preinit() has been called beforehand. */ + + drwav_uint64 cursor; /* <-- Keeps track of the byte position so we can seek to specific locations. */ + drwav_bool32 sequential; + drwav_uint8 riff[4]; + drwav_fmt fmt; + unsigned short translatedFormatTag; + drwav_bool32 foundDataChunk; + drwav_uint64 dataChunkSize = 0; /* <-- Important! Don't explicitly set this to 0 anywhere else. Calculation of the size of the data chunk is performed in different paths depending on the container. */ + drwav_uint64 sampleCountFromFactChunk = 0; /* Same as dataChunkSize - make sure this is the only place this is initialized to 0. 
*/ + drwav_uint64 chunkSize; + + cursor = 0; + sequential = (flags & DRWAV_SEQUENTIAL) != 0; + + /* The first 4 bytes should be the RIFF identifier. */ + if (drwav__on_read(pWav->onRead, pWav->pUserData, riff, sizeof(riff), &cursor) != sizeof(riff)) { + return DRWAV_FALSE; + } + + /* + The first 4 bytes can be used to identify the container. For RIFF files it will start with "RIFF" and for + w64 it will start with "riff". + */ + if (drwav__fourcc_equal(riff, "RIFF")) { + pWav->container = drwav_container_riff; + } else if (drwav__fourcc_equal(riff, "riff")) { + int i; + drwav_uint8 riff2[12]; + + pWav->container = drwav_container_w64; + + /* Check the rest of the GUID for validity. */ + if (drwav__on_read(pWav->onRead, pWav->pUserData, riff2, sizeof(riff2), &cursor) != sizeof(riff2)) { + return DRWAV_FALSE; + } + + for (i = 0; i < 12; ++i) { + if (riff2[i] != drwavGUID_W64_RIFF[i+4]) { + return DRWAV_FALSE; + } + } + } else if (drwav__fourcc_equal(riff, "RF64")) { + pWav->container = drwav_container_rf64; + } else { + return DRWAV_FALSE; /* Unknown or unsupported container. */ + } + + + if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) { + drwav_uint8 chunkSizeBytes[4]; + drwav_uint8 wave[4]; + + /* RIFF/WAVE */ + if (drwav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) { + return DRWAV_FALSE; + } + + if (pWav->container == drwav_container_riff) { + if (drwav__bytes_to_u32(chunkSizeBytes) < 36) { + return DRWAV_FALSE; /* Chunk size should always be at least 36 bytes. */ + } + } else { + if (drwav__bytes_to_u32(chunkSizeBytes) != 0xFFFFFFFF) { + return DRWAV_FALSE; /* Chunk size should always be set to -1/0xFFFFFFFF for RF64. The actual size is retrieved later. */ + } + } + + if (drwav__on_read(pWav->onRead, pWav->pUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) { + return DRWAV_FALSE; + } + + if (!drwav__fourcc_equal(wave, "WAVE")) { + return DRWAV_FALSE; /* Expecting "WAVE". */ + } + } else { + drwav_uint8 chunkSizeBytes[8]; + drwav_uint8 wave[16]; + + /* W64 */ + if (drwav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) { + return DRWAV_FALSE; + } + + if (drwav__bytes_to_u64(chunkSizeBytes) < 80) { + return DRWAV_FALSE; + } + + if (drwav__on_read(pWav->onRead, pWav->pUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) { + return DRWAV_FALSE; + } + + if (!drwav__guid_equal(wave, drwavGUID_W64_WAVE)) { + return DRWAV_FALSE; + } + } + + + /* For RF64, the "ds64" chunk must come next, before the "fmt " chunk. */ + if (pWav->container == drwav_container_rf64) { + drwav_uint8 sizeBytes[8]; + drwav_uint64 bytesRemainingInChunk; + drwav_chunk_header header; + drwav_result result = drwav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header); + if (result != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + if (!drwav__fourcc_equal(header.id.fourcc, "ds64")) { + return DRWAV_FALSE; /* Expecting "ds64". */ + } + + bytesRemainingInChunk = header.sizeInBytes + header.paddingSize; + + /* We don't care about the size of the RIFF chunk - skip it. */ + if (!drwav__seek_forward(pWav->onSeek, 8, pWav->pUserData)) { + return DRWAV_FALSE; + } + bytesRemainingInChunk -= 8; + cursor += 8; + + + /* Next 8 bytes is the size of the "data" chunk. 
*/ + if (drwav__on_read(pWav->onRead, pWav->pUserData, sizeBytes, sizeof(sizeBytes), &cursor) != sizeof(sizeBytes)) { + return DRWAV_FALSE; + } + bytesRemainingInChunk -= 8; + dataChunkSize = drwav__bytes_to_u64(sizeBytes); + + + /* Next 8 bytes is the same count which we would usually derived from the FACT chunk if it was available. */ + if (drwav__on_read(pWav->onRead, pWav->pUserData, sizeBytes, sizeof(sizeBytes), &cursor) != sizeof(sizeBytes)) { + return DRWAV_FALSE; + } + bytesRemainingInChunk -= 8; + sampleCountFromFactChunk = drwav__bytes_to_u64(sizeBytes); + + + /* Skip over everything else. */ + if (!drwav__seek_forward(pWav->onSeek, bytesRemainingInChunk, pWav->pUserData)) { + return DRWAV_FALSE; + } + cursor += bytesRemainingInChunk; + } + + + /* The next bytes should be the "fmt " chunk. */ + if (!drwav__read_fmt(pWav->onRead, pWav->onSeek, pWav->pUserData, pWav->container, &cursor, &fmt)) { + return DRWAV_FALSE; /* Failed to read the "fmt " chunk. */ + } + + /* Basic validation. */ + if ((fmt.sampleRate == 0 || fmt.sampleRate > DRWAV_MAX_SAMPLE_RATE) || + (fmt.channels == 0 || fmt.channels > DRWAV_MAX_CHANNELS) || + (fmt.bitsPerSample == 0 || fmt.bitsPerSample > DRWAV_MAX_BITS_PER_SAMPLE) || + fmt.blockAlign == 0) { + return DRWAV_FALSE; /* Probably an invalid WAV file. */ + } + + + /* Translate the internal format. */ + translatedFormatTag = fmt.formatTag; + if (translatedFormatTag == DR_WAVE_FORMAT_EXTENSIBLE) { + translatedFormatTag = drwav__bytes_to_u16(fmt.subFormat + 0); + } + + + /* + We need to enumerate over each chunk for two reasons: + 1) The "data" chunk may not be the next one + 2) We may want to report each chunk back to the client + + In order to correctly report each chunk back to the client we will need to keep looping until the end of the file. + */ + foundDataChunk = DRWAV_FALSE; + + /* The next chunk we care about is the "data" chunk. This is not necessarily the next chunk so we'll need to loop. */ + for (;;) + { + drwav_chunk_header header; + drwav_result result = drwav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header); + if (result != DRWAV_SUCCESS) { + if (!foundDataChunk) { + return DRWAV_FALSE; + } else { + break; /* Probably at the end of the file. Get out of the loop. */ + } + } + + /* Tell the client about this chunk. */ + if (!sequential && onChunk != NULL) { + drwav_uint64 callbackBytesRead = onChunk(pChunkUserData, pWav->onRead, pWav->onSeek, pWav->pUserData, &header, pWav->container, &fmt); + + /* + dr_wav may need to read the contents of the chunk, so we now need to seek back to the position before + we called the callback. + */ + if (callbackBytesRead > 0) { + if (!drwav__seek_from_start(pWav->onSeek, cursor, pWav->pUserData)) { + return DRWAV_FALSE; + } + } + } + + + if (!foundDataChunk) { + pWav->dataChunkDataPos = cursor; + } + + chunkSize = header.sizeInBytes; + if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) { + if (drwav__fourcc_equal(header.id.fourcc, "data")) { + foundDataChunk = DRWAV_TRUE; + if (pWav->container != drwav_container_rf64) { /* The data chunk size for RF64 will always be set to 0xFFFFFFFF here. It was set to it's true value earlier. */ + dataChunkSize = chunkSize; + } + } + } else { + if (drwav__guid_equal(header.id.guid, drwavGUID_W64_DATA)) { + foundDataChunk = DRWAV_TRUE; + dataChunkSize = chunkSize; + } + } + + /* + If at this point we have found the data chunk and we're running in sequential mode, we need to break out of this loop. 
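For callers that want to observe non-"data" chunks, the onChunk callback invoked above can be supplied through drwav_init_ex(). The sketch below assumes the drwav_chunk_proc signature matches the call made in the loop (chunk user data, the read/seek procedures and their user data, the chunk header, the container, and the parsed fmt) and that the return value is the number of bytes the callback consumed; the function name and the logging behaviour are illustrative only.

#include <stdio.h>

/* Illustrative chunk callback: prints the FOURCC of each RIFF/RF64 chunk and consumes
   nothing, so the return value of 0 means dr_wav never has to seek back afterwards. */
static drwav_uint64 example_on_chunk(void* pChunkUserData,
                                     drwav_read_proc onRead, drwav_seek_proc onSeek, void* pReadSeekUserData,
                                     const drwav_chunk_header* pHeader,
                                     drwav_container container, const drwav_fmt* pFMT)
{
    (void)pChunkUserData; (void)onRead; (void)onSeek; (void)pReadSeekUserData; (void)pFMT;

    if (container == drwav_container_riff || container == drwav_container_rf64) {
        printf("chunk %.4s, %llu bytes\n", (const char*)pHeader->id.fourcc, (unsigned long long)pHeader->sizeInBytes);
    }

    return 0;   /* no bytes read by the callback */
}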
The reason for + this is that we would otherwise require a backwards seek which sequential mode forbids. + */ + if (foundDataChunk && sequential) { + break; + } + + /* Optional. Get the total sample count from the FACT chunk. This is useful for compressed formats. */ + if (pWav->container == drwav_container_riff) { + if (drwav__fourcc_equal(header.id.fourcc, "fact")) { + drwav_uint32 sampleCount; + if (drwav__on_read(pWav->onRead, pWav->pUserData, &sampleCount, 4, &cursor) != 4) { + return DRWAV_FALSE; + } + chunkSize -= 4; + + if (!foundDataChunk) { + pWav->dataChunkDataPos = cursor; + } + + /* + The sample count in the "fact" chunk is either unreliable, or I'm not understanding it properly. For now I am only enabling this + for Microsoft ADPCM formats. + */ + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + sampleCountFromFactChunk = sampleCount; + } else { + sampleCountFromFactChunk = 0; + } + } + } else if (pWav->container == drwav_container_w64) { + if (drwav__guid_equal(header.id.guid, drwavGUID_W64_FACT)) { + if (drwav__on_read(pWav->onRead, pWav->pUserData, &sampleCountFromFactChunk, 8, &cursor) != 8) { + return DRWAV_FALSE; + } + chunkSize -= 8; + + if (!foundDataChunk) { + pWav->dataChunkDataPos = cursor; + } + } + } else if (pWav->container == drwav_container_rf64) { + /* We retrieved the sample count from the ds64 chunk earlier so no need to do that here. */ + } + + /* "smpl" chunk. */ + if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) { + if (drwav__fourcc_equal(header.id.fourcc, "smpl")) { + drwav_uint8 smplHeaderData[36]; /* 36 = size of the smpl header section, not including the loop data. */ + if (chunkSize >= sizeof(smplHeaderData)) { + drwav_uint64 bytesJustRead = drwav__on_read(pWav->onRead, pWav->pUserData, smplHeaderData, sizeof(smplHeaderData), &cursor); + chunkSize -= bytesJustRead; + + if (bytesJustRead == sizeof(smplHeaderData)) { + drwav_uint32 iLoop; + + pWav->smpl.manufacturer = drwav__bytes_to_u32(smplHeaderData+0); + pWav->smpl.product = drwav__bytes_to_u32(smplHeaderData+4); + pWav->smpl.samplePeriod = drwav__bytes_to_u32(smplHeaderData+8); + pWav->smpl.midiUnityNotes = drwav__bytes_to_u32(smplHeaderData+12); + pWav->smpl.midiPitchFraction = drwav__bytes_to_u32(smplHeaderData+16); + pWav->smpl.smpteFormat = drwav__bytes_to_u32(smplHeaderData+20); + pWav->smpl.smpteOffset = drwav__bytes_to_u32(smplHeaderData+24); + pWav->smpl.numSampleLoops = drwav__bytes_to_u32(smplHeaderData+28); + pWav->smpl.samplerData = drwav__bytes_to_u32(smplHeaderData+32); + + for (iLoop = 0; iLoop < pWav->smpl.numSampleLoops && iLoop < drwav_countof(pWav->smpl.loops); ++iLoop) { + drwav_uint8 smplLoopData[24]; /* 24 = size of a loop section in the smpl chunk. */ + bytesJustRead = drwav__on_read(pWav->onRead, pWav->pUserData, smplLoopData, sizeof(smplLoopData), &cursor); + chunkSize -= bytesJustRead; + + if (bytesJustRead == sizeof(smplLoopData)) { + pWav->smpl.loops[iLoop].cuePointId = drwav__bytes_to_u32(smplLoopData+0); + pWav->smpl.loops[iLoop].type = drwav__bytes_to_u32(smplLoopData+4); + pWav->smpl.loops[iLoop].start = drwav__bytes_to_u32(smplLoopData+8); + pWav->smpl.loops[iLoop].end = drwav__bytes_to_u32(smplLoopData+12); + pWav->smpl.loops[iLoop].fraction = drwav__bytes_to_u32(smplLoopData+16); + pWav->smpl.loops[iLoop].playCount = drwav__bytes_to_u32(smplLoopData+20); + } else { + break; /* Break from the smpl loop for loop. */ + } + } + } + } else { + /* Looks like invalid data. Ignore the chunk. 
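For reference, the byte offsets consumed from the "smpl" payload above, restated as a sketch (the helper name is hypothetical; dr_wav itself stores the values straight into pWav->smpl):

/* "smpl" payload layout as read above, all fields little-endian u32:
     0 manufacturer      4 product          8 samplePeriod    12 midiUnityNotes
    16 midiPitchFraction 20 smpteFormat    24 smpteOffset     28 numSampleLoops
    32 samplerData      36.. one 24-byte record per loop:
       +0 cuePointId  +4 type  +8 start  +12 end  +16 fraction  +20 playCount */
static unsigned long example_smpl_loop_offset(unsigned long loopIndex)
{
    return 36ul + loopIndex * 24ul;   /* 36-byte header followed by 24-byte loop records */
}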
*/ + } + } + } else { + if (drwav__guid_equal(header.id.guid, drwavGUID_W64_SMPL)) { + /* + This path will be hit when a W64 WAV file contains a smpl chunk. I don't have a sample file to test this path, so a contribution + is welcome to add support for this. + */ + } + } + + /* Make sure we seek past the padding. */ + chunkSize += header.paddingSize; + if (!drwav__seek_forward(pWav->onSeek, chunkSize, pWav->pUserData)) { + break; + } + cursor += chunkSize; + + if (!foundDataChunk) { + pWav->dataChunkDataPos = cursor; + } + } + + /* If we haven't found a data chunk, return an error. */ + if (!foundDataChunk) { + return DRWAV_FALSE; + } + + /* We may have moved passed the data chunk. If so we need to move back. If running in sequential mode we can assume we are already sitting on the data chunk. */ + if (!sequential) { + if (!drwav__seek_from_start(pWav->onSeek, pWav->dataChunkDataPos, pWav->pUserData)) { + return DRWAV_FALSE; + } + cursor = pWav->dataChunkDataPos; + } + + + /* At this point we should be sitting on the first byte of the raw audio data. */ + + pWav->fmt = fmt; + pWav->sampleRate = fmt.sampleRate; + pWav->channels = fmt.channels; + pWav->bitsPerSample = fmt.bitsPerSample; + pWav->bytesRemaining = dataChunkSize; + pWav->translatedFormatTag = translatedFormatTag; + pWav->dataChunkDataSize = dataChunkSize; + + if (sampleCountFromFactChunk != 0) { + pWav->totalPCMFrameCount = sampleCountFromFactChunk; + } else { + pWav->totalPCMFrameCount = dataChunkSize / drwav_get_bytes_per_pcm_frame(pWav); + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + drwav_uint64 totalBlockHeaderSizeInBytes; + drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign; + + /* Make sure any trailing partial block is accounted for. */ + if ((blockCount * fmt.blockAlign) < dataChunkSize) { + blockCount += 1; + } + + /* We decode two samples per byte. There will be blockCount headers in the data chunk. This is enough to know how to calculate the total PCM frame count. */ + totalBlockHeaderSizeInBytes = blockCount * (6*fmt.channels); + pWav->totalPCMFrameCount = ((dataChunkSize - totalBlockHeaderSizeInBytes) * 2) / fmt.channels; + } + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + drwav_uint64 totalBlockHeaderSizeInBytes; + drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign; + + /* Make sure any trailing partial block is accounted for. */ + if ((blockCount * fmt.blockAlign) < dataChunkSize) { + blockCount += 1; + } + + /* We decode two samples per byte. There will be blockCount headers in the data chunk. This is enough to know how to calculate the total PCM frame count. */ + totalBlockHeaderSizeInBytes = blockCount * (4*fmt.channels); + pWav->totalPCMFrameCount = ((dataChunkSize - totalBlockHeaderSizeInBytes) * 2) / fmt.channels; + + /* The header includes a decoded sample for each channel which acts as the initial predictor sample. */ + pWav->totalPCMFrameCount += blockCount; + } + } + + /* Some formats only support a certain number of channels. */ + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM || pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + if (pWav->channels > 2) { + return DRWAV_FALSE; + } + } + +#ifdef DR_WAV_LIBSNDFILE_COMPAT + /* + I use libsndfile as a benchmark for testing, however in the version I'm using (from the Windows installer on the libsndfile website), + it appears the total sample count libsndfile uses for MS-ADPCM is incorrect. 
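A worked instance of the ADPCM frame-count formulas above may help. The inputs are assumptions chosen purely for illustration: a stereo file with a blockAlign of 2048 and a data chunk of exactly 1 MiB, so there is no trailing partial block.

#include <stdio.h>

int main(void)
{
    unsigned long long dataChunkSize = 1048576;                   /* assumed: 1 MiB of ADPCM payload */
    unsigned long long channels = 2, blockAlign = 2048;           /* assumed */
    unsigned long long blockCount = dataChunkSize / blockAlign;   /* 512 */

    /* MS-ADPCM: 6 header bytes per channel per block, then 2 samples per byte. */
    unsigned long long msHeaders = blockCount * (6 * channels);                                 /* 6144 */
    unsigned long long msFrames  = ((dataChunkSize - msHeaders) * 2) / channels;                /* 1042432 */

    /* IMA/DVI ADPCM: 4 header bytes per channel per block, plus one predictor frame per block. */
    unsigned long long imaHeaders = blockCount * (4 * channels);                                /* 4096 */
    unsigned long long imaFrames  = ((dataChunkSize - imaHeaders) * 2) / channels + blockCount; /* 1044992 */

    printf("MS-ADPCM: %llu frames, IMA-ADPCM: %llu frames\n", msFrames, imaFrames);
    return 0;
}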
It would seem they are computing the total sample count + from the number of blocks, however this results in the inclusion of extra silent samples at the end of the last block. The correct + way to know the total sample count is to inspect the "fact" chunk, which should always be present for compressed formats, and should + always include the sample count. This little block of code below is only used to emulate the libsndfile logic so I can properly run my + correctness tests against libsndfile, and is disabled by default. + */ + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign; + pWav->totalPCMFrameCount = (((blockCount * (fmt.blockAlign - (6*pWav->channels))) * 2)) / fmt.channels; /* x2 because two samples per byte. */ + } + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign; + pWav->totalPCMFrameCount = (((blockCount * (fmt.blockAlign - (4*pWav->channels))) * 2) + (blockCount * pWav->channels)) / fmt.channels; + } +#endif + + return DRWAV_TRUE; +} + +DRWAV_API drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_ex(pWav, onRead, onSeek, NULL, pUserData, NULL, 0, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_ex(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (!drwav_preinit(pWav, onRead, onSeek, pReadSeekUserData, pAllocationCallbacks)) { + return DRWAV_FALSE; + } + + return drwav_init__internal(pWav, onChunk, pChunkUserData, flags); +} + + +static drwav_uint32 drwav__riff_chunk_size_riff(drwav_uint64 dataChunkSize) +{ + drwav_uint64 chunkSize = 4 + 24 + dataChunkSize + drwav__chunk_padding_size_riff(dataChunkSize); /* 4 = "WAVE". 24 = "fmt " chunk. */ + if (chunkSize > 0xFFFFFFFFUL) { + chunkSize = 0xFFFFFFFFUL; + } + + return (drwav_uint32)chunkSize; /* Safe cast due to the clamp above. */ +} + +static drwav_uint32 drwav__data_chunk_size_riff(drwav_uint64 dataChunkSize) +{ + if (dataChunkSize <= 0xFFFFFFFFUL) { + return (drwav_uint32)dataChunkSize; + } else { + return 0xFFFFFFFFUL; + } +} + +static drwav_uint64 drwav__riff_chunk_size_w64(drwav_uint64 dataChunkSize) +{ + drwav_uint64 dataSubchunkPaddingSize = drwav__chunk_padding_size_w64(dataChunkSize); + + return 80 + 24 + dataChunkSize + dataSubchunkPaddingSize; /* +24 because W64 includes the size of the GUID and size fields. */ +} + +static drwav_uint64 drwav__data_chunk_size_w64(drwav_uint64 dataChunkSize) +{ + return 24 + dataChunkSize; /* +24 because W64 includes the size of the GUID and size fields. */ +} + +static drwav_uint64 drwav__riff_chunk_size_rf64(drwav_uint64 dataChunkSize) +{ + drwav_uint64 chunkSize = 4 + 36 + 24 + dataChunkSize + drwav__chunk_padding_size_riff(dataChunkSize); /* 4 = "WAVE". 36 = "ds64" chunk. 24 = "fmt " chunk. */ + if (chunkSize > 0xFFFFFFFFUL) { + chunkSize = 0xFFFFFFFFUL; + } + + return chunkSize; +} + +static drwav_uint64 drwav__data_chunk_size_rf64(drwav_uint64 dataChunkSize) +{ + return dataChunkSize; +} + + +static size_t drwav__write(drwav* pWav, const void* pData, size_t dataSize) +{ + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->onWrite != NULL); + + /* Generic write. Assumes no byte reordering required. 
*/ + return pWav->onWrite(pWav->pUserData, pData, dataSize); +} + +static size_t drwav__write_u16ne_to_le(drwav* pWav, drwav_uint16 value) +{ + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->onWrite != NULL); + + if (!drwav__is_little_endian()) { + value = drwav__bswap16(value); + } + + return drwav__write(pWav, &value, 2); +} + +static size_t drwav__write_u32ne_to_le(drwav* pWav, drwav_uint32 value) +{ + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->onWrite != NULL); + + if (!drwav__is_little_endian()) { + value = drwav__bswap32(value); + } + + return drwav__write(pWav, &value, 4); +} + +static size_t drwav__write_u64ne_to_le(drwav* pWav, drwav_uint64 value) +{ + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->onWrite != NULL); + + if (!drwav__is_little_endian()) { + value = drwav__bswap64(value); + } + + return drwav__write(pWav, &value, 8); +} + + +static drwav_bool32 drwav_preinit_write(drwav* pWav, const drwav_data_format* pFormat, drwav_bool32 isSequential, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pWav == NULL || onWrite == NULL) { + return DRWAV_FALSE; + } + + if (!isSequential && onSeek == NULL) { + return DRWAV_FALSE; /* <-- onSeek is required when in non-sequential mode. */ + } + + /* Not currently supporting compressed formats. Will need to add support for the "fact" chunk before we enable this. */ + if (pFormat->format == DR_WAVE_FORMAT_EXTENSIBLE) { + return DRWAV_FALSE; + } + if (pFormat->format == DR_WAVE_FORMAT_ADPCM || pFormat->format == DR_WAVE_FORMAT_DVI_ADPCM) { + return DRWAV_FALSE; + } + + DRWAV_ZERO_MEMORY(pWav, sizeof(*pWav)); + pWav->onWrite = onWrite; + pWav->onSeek = onSeek; + pWav->pUserData = pUserData; + pWav->allocationCallbacks = drwav_copy_allocation_callbacks_or_defaults(pAllocationCallbacks); + + if (pWav->allocationCallbacks.onFree == NULL || (pWav->allocationCallbacks.onMalloc == NULL && pWav->allocationCallbacks.onRealloc == NULL)) { + return DRWAV_FALSE; /* Invalid allocation callbacks. */ + } + + pWav->fmt.formatTag = (drwav_uint16)pFormat->format; + pWav->fmt.channels = (drwav_uint16)pFormat->channels; + pWav->fmt.sampleRate = pFormat->sampleRate; + pWav->fmt.avgBytesPerSec = (drwav_uint32)((pFormat->bitsPerSample * pFormat->sampleRate * pFormat->channels) / 8); + pWav->fmt.blockAlign = (drwav_uint16)((pFormat->channels * pFormat->bitsPerSample) / 8); + pWav->fmt.bitsPerSample = (drwav_uint16)pFormat->bitsPerSample; + pWav->fmt.extendedSize = 0; + pWav->isSequentialWrite = isSequential; + + return DRWAV_TRUE; +} + +static drwav_bool32 drwav_init_write__internal(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount) +{ + /* The function assumes drwav_preinit_write() was called beforehand. */ + + size_t runningPos = 0; + drwav_uint64 initialDataChunkSize = 0; + drwav_uint64 chunkSizeFMT; + + /* + The initial values for the "RIFF" and "data" chunks depends on whether or not we are initializing in sequential mode or not. In + sequential mode we set this to its final values straight away since they can be calculated from the total sample count. In non- + sequential mode we initialize it all to zero and fill it out in drwav_uninit() using a backwards seek. + */ + if (pWav->isSequentialWrite) { + initialDataChunkSize = (totalSampleCount * pWav->fmt.bitsPerSample) / 8; + + /* + The RIFF container has a limit on the number of samples. drwav is not allowing this. 
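The derived fmt fields computed by drwav_preinit_write() above, in concrete numbers. The figures assume plain 16-bit stereo PCM at 44100 Hz; the helper is purely illustrative.

/* Hypothetical illustration of the blockAlign / avgBytesPerSec derivation above. */
static void example_fmt_derivation(void)
{
    unsigned int sampleRate = 44100, channels = 2, bitsPerSample = 16;            /* assumed */
    unsigned int blockAlign     = (channels * bitsPerSample) / 8;                 /* 4 bytes per frame */
    unsigned int avgBytesPerSec = (bitsPerSample * sampleRate * channels) / 8;    /* 176400 bytes per second */
    (void)blockAlign; (void)avgBytesPerSec;
}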
There's no practical limits for Wave64 + so for the sake of simplicity I'm not doing any validation for that. + */ + if (pFormat->container == drwav_container_riff) { + if (initialDataChunkSize > (0xFFFFFFFFUL - 36)) { + return DRWAV_FALSE; /* Not enough room to store every sample. */ + } + } + } + + pWav->dataChunkDataSizeTargetWrite = initialDataChunkSize; + + + /* "RIFF" chunk. */ + if (pFormat->container == drwav_container_riff) { + drwav_uint32 chunkSizeRIFF = 28 + (drwav_uint32)initialDataChunkSize; /* +28 = "WAVE" + [sizeof "fmt " chunk] */ + runningPos += drwav__write(pWav, "RIFF", 4); + runningPos += drwav__write_u32ne_to_le(pWav, chunkSizeRIFF); + runningPos += drwav__write(pWav, "WAVE", 4); + } else if (pFormat->container == drwav_container_w64) { + drwav_uint64 chunkSizeRIFF = 80 + 24 + initialDataChunkSize; /* +24 because W64 includes the size of the GUID and size fields. */ + runningPos += drwav__write(pWav, drwavGUID_W64_RIFF, 16); + runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeRIFF); + runningPos += drwav__write(pWav, drwavGUID_W64_WAVE, 16); + } else if (pFormat->container == drwav_container_rf64) { + runningPos += drwav__write(pWav, "RF64", 4); + runningPos += drwav__write_u32ne_to_le(pWav, 0xFFFFFFFF); /* Always 0xFFFFFFFF for RF64. Set to a proper value in the "ds64" chunk. */ + runningPos += drwav__write(pWav, "WAVE", 4); + } + + + /* "ds64" chunk (RF64 only). */ + if (pFormat->container == drwav_container_rf64) { + drwav_uint32 initialds64ChunkSize = 28; /* 28 = [Size of RIFF (8 bytes)] + [Size of DATA (8 bytes)] + [Sample Count (8 bytes)] + [Table Length (4 bytes)]. Table length always set to 0. */ + drwav_uint64 initialRiffChunkSize = 8 + initialds64ChunkSize + initialDataChunkSize; /* +8 for the ds64 header. */ + + runningPos += drwav__write(pWav, "ds64", 4); + runningPos += drwav__write_u32ne_to_le(pWav, initialds64ChunkSize); /* Size of ds64. */ + runningPos += drwav__write_u64ne_to_le(pWav, initialRiffChunkSize); /* Size of RIFF. Set to true value at the end. */ + runningPos += drwav__write_u64ne_to_le(pWav, initialDataChunkSize); /* Size of DATA. Set to true value at the end. */ + runningPos += drwav__write_u64ne_to_le(pWav, totalSampleCount); /* Sample count. */ + runningPos += drwav__write_u32ne_to_le(pWav, 0); /* Table length. Always set to zero in our case since we're not doing any other chunks than "DATA". */ + } + + + /* "fmt " chunk. */ + if (pFormat->container == drwav_container_riff || pFormat->container == drwav_container_rf64) { + chunkSizeFMT = 16; + runningPos += drwav__write(pWav, "fmt ", 4); + runningPos += drwav__write_u32ne_to_le(pWav, (drwav_uint32)chunkSizeFMT); + } else if (pFormat->container == drwav_container_w64) { + chunkSizeFMT = 40; + runningPos += drwav__write(pWav, drwavGUID_W64_FMT, 16); + runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeFMT); + } + + runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.formatTag); + runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.channels); + runningPos += drwav__write_u32ne_to_le(pWav, pWav->fmt.sampleRate); + runningPos += drwav__write_u32ne_to_le(pWav, pWav->fmt.avgBytesPerSec); + runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.blockAlign); + runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.bitsPerSample); + + pWav->dataChunkDataPos = runningPos; + + /* "data" chunk. 
*/ + if (pFormat->container == drwav_container_riff) { + drwav_uint32 chunkSizeDATA = (drwav_uint32)initialDataChunkSize; + runningPos += drwav__write(pWav, "data", 4); + runningPos += drwav__write_u32ne_to_le(pWav, chunkSizeDATA); + } else if (pFormat->container == drwav_container_w64) { + drwav_uint64 chunkSizeDATA = 24 + initialDataChunkSize; /* +24 because W64 includes the size of the GUID and size fields. */ + runningPos += drwav__write(pWav, drwavGUID_W64_DATA, 16); + runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeDATA); + } else if (pFormat->container == drwav_container_rf64) { + runningPos += drwav__write(pWav, "data", 4); + runningPos += drwav__write_u32ne_to_le(pWav, 0xFFFFFFFF); /* Always set to 0xFFFFFFFF for RF64. The true size of the data chunk is specified in the ds64 chunk. */ + } + + /* + The runningPos variable is incremented in the section above but is left unused which is causing some static analysis tools to detect it + as a dead store. I'm leaving this as-is for safety just in case I want to expand this function later to include other tags and want to + keep track of the running position for whatever reason. The line below should silence the static analysis tools. + */ + (void)runningPos; + + /* Set some properties for the client's convenience. */ + pWav->container = pFormat->container; + pWav->channels = (drwav_uint16)pFormat->channels; + pWav->sampleRate = pFormat->sampleRate; + pWav->bitsPerSample = (drwav_uint16)pFormat->bitsPerSample; + pWav->translatedFormatTag = (drwav_uint16)pFormat->format; + + return DRWAV_TRUE; +} + + +DRWAV_API drwav_bool32 drwav_init_write(drwav* pWav, const drwav_data_format* pFormat, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (!drwav_preinit_write(pWav, pFormat, DRWAV_FALSE, onWrite, onSeek, pUserData, pAllocationCallbacks)) { + return DRWAV_FALSE; + } + + return drwav_init_write__internal(pWav, pFormat, 0); /* DRWAV_FALSE = Not Sequential */ +} + +DRWAV_API drwav_bool32 drwav_init_write_sequential(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (!drwav_preinit_write(pWav, pFormat, DRWAV_TRUE, onWrite, NULL, pUserData, pAllocationCallbacks)) { + return DRWAV_FALSE; + } + + return drwav_init_write__internal(pWav, pFormat, totalSampleCount); /* DRWAV_TRUE = Sequential */ +} + +DRWAV_API drwav_bool32 drwav_init_write_sequential_pcm_frames(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pFormat == NULL) { + return DRWAV_FALSE; + } + + return drwav_init_write_sequential(pWav, pFormat, totalPCMFrameCount*pFormat->channels, onWrite, pUserData, pAllocationCallbacks); +} + +DRWAV_API drwav_uint64 drwav_target_write_size_bytes(const drwav_data_format* pFormat, drwav_uint64 totalSampleCount) +{ + /* Casting totalSampleCount to drwav_int64 for VC6 compatibility. No issues in practice because nobody is going to exhaust the whole 63 bits. 
*/ + drwav_uint64 targetDataSizeBytes = (drwav_uint64)((drwav_int64)totalSampleCount * pFormat->channels * pFormat->bitsPerSample/8.0); + drwav_uint64 riffChunkSizeBytes; + drwav_uint64 fileSizeBytes = 0; + + if (pFormat->container == drwav_container_riff) { + riffChunkSizeBytes = drwav__riff_chunk_size_riff(targetDataSizeBytes); + fileSizeBytes = (8 + riffChunkSizeBytes); /* +8 because WAV doesn't include the size of the ChunkID and ChunkSize fields. */ + } else if (pFormat->container == drwav_container_w64) { + riffChunkSizeBytes = drwav__riff_chunk_size_w64(targetDataSizeBytes); + fileSizeBytes = riffChunkSizeBytes; + } else if (pFormat->container == drwav_container_rf64) { + riffChunkSizeBytes = drwav__riff_chunk_size_rf64(targetDataSizeBytes); + fileSizeBytes = (8 + riffChunkSizeBytes); /* +8 because WAV doesn't include the size of the ChunkID and ChunkSize fields. */ + } + + return fileSizeBytes; +} + + +#ifndef DR_WAV_NO_STDIO + +/* drwav_result_from_errno() is only used for fopen() and wfopen() so putting it inside DR_WAV_NO_STDIO for now. If something else needs this later we can move it out. */ +#include +static drwav_result drwav_result_from_errno(int e) +{ + switch (e) + { + case 0: return DRWAV_SUCCESS; + #ifdef EPERM + case EPERM: return DRWAV_INVALID_OPERATION; + #endif + #ifdef ENOENT + case ENOENT: return DRWAV_DOES_NOT_EXIST; + #endif + #ifdef ESRCH + case ESRCH: return DRWAV_DOES_NOT_EXIST; + #endif + #ifdef EINTR + case EINTR: return DRWAV_INTERRUPT; + #endif + #ifdef EIO + case EIO: return DRWAV_IO_ERROR; + #endif + #ifdef ENXIO + case ENXIO: return DRWAV_DOES_NOT_EXIST; + #endif + #ifdef E2BIG + case E2BIG: return DRWAV_INVALID_ARGS; + #endif + #ifdef ENOEXEC + case ENOEXEC: return DRWAV_INVALID_FILE; + #endif + #ifdef EBADF + case EBADF: return DRWAV_INVALID_FILE; + #endif + #ifdef ECHILD + case ECHILD: return DRWAV_ERROR; + #endif + #ifdef EAGAIN + case EAGAIN: return DRWAV_UNAVAILABLE; + #endif + #ifdef ENOMEM + case ENOMEM: return DRWAV_OUT_OF_MEMORY; + #endif + #ifdef EACCES + case EACCES: return DRWAV_ACCESS_DENIED; + #endif + #ifdef EFAULT + case EFAULT: return DRWAV_BAD_ADDRESS; + #endif + #ifdef ENOTBLK + case ENOTBLK: return DRWAV_ERROR; + #endif + #ifdef EBUSY + case EBUSY: return DRWAV_BUSY; + #endif + #ifdef EEXIST + case EEXIST: return DRWAV_ALREADY_EXISTS; + #endif + #ifdef EXDEV + case EXDEV: return DRWAV_ERROR; + #endif + #ifdef ENODEV + case ENODEV: return DRWAV_DOES_NOT_EXIST; + #endif + #ifdef ENOTDIR + case ENOTDIR: return DRWAV_NOT_DIRECTORY; + #endif + #ifdef EISDIR + case EISDIR: return DRWAV_IS_DIRECTORY; + #endif + #ifdef EINVAL + case EINVAL: return DRWAV_INVALID_ARGS; + #endif + #ifdef ENFILE + case ENFILE: return DRWAV_TOO_MANY_OPEN_FILES; + #endif + #ifdef EMFILE + case EMFILE: return DRWAV_TOO_MANY_OPEN_FILES; + #endif + #ifdef ENOTTY + case ENOTTY: return DRWAV_INVALID_OPERATION; + #endif + #ifdef ETXTBSY + case ETXTBSY: return DRWAV_BUSY; + #endif + #ifdef EFBIG + case EFBIG: return DRWAV_TOO_BIG; + #endif + #ifdef ENOSPC + case ENOSPC: return DRWAV_NO_SPACE; + #endif + #ifdef ESPIPE + case ESPIPE: return DRWAV_BAD_SEEK; + #endif + #ifdef EROFS + case EROFS: return DRWAV_ACCESS_DENIED; + #endif + #ifdef EMLINK + case EMLINK: return DRWAV_TOO_MANY_LINKS; + #endif + #ifdef EPIPE + case EPIPE: return DRWAV_BAD_PIPE; + #endif + #ifdef EDOM + case EDOM: return DRWAV_OUT_OF_RANGE; + #endif + #ifdef ERANGE + case ERANGE: return DRWAV_OUT_OF_RANGE; + #endif + #ifdef EDEADLK + case EDEADLK: return DRWAV_DEADLOCK; + #endif + #ifdef ENAMETOOLONG 
+ case ENAMETOOLONG: return DRWAV_PATH_TOO_LONG; + #endif + #ifdef ENOLCK + case ENOLCK: return DRWAV_ERROR; + #endif + #ifdef ENOSYS + case ENOSYS: return DRWAV_NOT_IMPLEMENTED; + #endif + #ifdef ENOTEMPTY + case ENOTEMPTY: return DRWAV_DIRECTORY_NOT_EMPTY; + #endif + #ifdef ELOOP + case ELOOP: return DRWAV_TOO_MANY_LINKS; + #endif + #ifdef ENOMSG + case ENOMSG: return DRWAV_NO_MESSAGE; + #endif + #ifdef EIDRM + case EIDRM: return DRWAV_ERROR; + #endif + #ifdef ECHRNG + case ECHRNG: return DRWAV_ERROR; + #endif + #ifdef EL2NSYNC + case EL2NSYNC: return DRWAV_ERROR; + #endif + #ifdef EL3HLT + case EL3HLT: return DRWAV_ERROR; + #endif + #ifdef EL3RST + case EL3RST: return DRWAV_ERROR; + #endif + #ifdef ELNRNG + case ELNRNG: return DRWAV_OUT_OF_RANGE; + #endif + #ifdef EUNATCH + case EUNATCH: return DRWAV_ERROR; + #endif + #ifdef ENOCSI + case ENOCSI: return DRWAV_ERROR; + #endif + #ifdef EL2HLT + case EL2HLT: return DRWAV_ERROR; + #endif + #ifdef EBADE + case EBADE: return DRWAV_ERROR; + #endif + #ifdef EBADR + case EBADR: return DRWAV_ERROR; + #endif + #ifdef EXFULL + case EXFULL: return DRWAV_ERROR; + #endif + #ifdef ENOANO + case ENOANO: return DRWAV_ERROR; + #endif + #ifdef EBADRQC + case EBADRQC: return DRWAV_ERROR; + #endif + #ifdef EBADSLT + case EBADSLT: return DRWAV_ERROR; + #endif + #ifdef EBFONT + case EBFONT: return DRWAV_INVALID_FILE; + #endif + #ifdef ENOSTR + case ENOSTR: return DRWAV_ERROR; + #endif + #ifdef ENODATA + case ENODATA: return DRWAV_NO_DATA_AVAILABLE; + #endif + #ifdef ETIME + case ETIME: return DRWAV_TIMEOUT; + #endif + #ifdef ENOSR + case ENOSR: return DRWAV_NO_DATA_AVAILABLE; + #endif + #ifdef ENONET + case ENONET: return DRWAV_NO_NETWORK; + #endif + #ifdef ENOPKG + case ENOPKG: return DRWAV_ERROR; + #endif + #ifdef EREMOTE + case EREMOTE: return DRWAV_ERROR; + #endif + #ifdef ENOLINK + case ENOLINK: return DRWAV_ERROR; + #endif + #ifdef EADV + case EADV: return DRWAV_ERROR; + #endif + #ifdef ESRMNT + case ESRMNT: return DRWAV_ERROR; + #endif + #ifdef ECOMM + case ECOMM: return DRWAV_ERROR; + #endif + #ifdef EPROTO + case EPROTO: return DRWAV_ERROR; + #endif + #ifdef EMULTIHOP + case EMULTIHOP: return DRWAV_ERROR; + #endif + #ifdef EDOTDOT + case EDOTDOT: return DRWAV_ERROR; + #endif + #ifdef EBADMSG + case EBADMSG: return DRWAV_BAD_MESSAGE; + #endif + #ifdef EOVERFLOW + case EOVERFLOW: return DRWAV_TOO_BIG; + #endif + #ifdef ENOTUNIQ + case ENOTUNIQ: return DRWAV_NOT_UNIQUE; + #endif + #ifdef EBADFD + case EBADFD: return DRWAV_ERROR; + #endif + #ifdef EREMCHG + case EREMCHG: return DRWAV_ERROR; + #endif + #ifdef ELIBACC + case ELIBACC: return DRWAV_ACCESS_DENIED; + #endif + #ifdef ELIBBAD + case ELIBBAD: return DRWAV_INVALID_FILE; + #endif + #ifdef ELIBSCN + case ELIBSCN: return DRWAV_INVALID_FILE; + #endif + #ifdef ELIBMAX + case ELIBMAX: return DRWAV_ERROR; + #endif + #ifdef ELIBEXEC + case ELIBEXEC: return DRWAV_ERROR; + #endif + #ifdef EILSEQ + case EILSEQ: return DRWAV_INVALID_DATA; + #endif + #ifdef ERESTART + case ERESTART: return DRWAV_ERROR; + #endif + #ifdef ESTRPIPE + case ESTRPIPE: return DRWAV_ERROR; + #endif + #ifdef EUSERS + case EUSERS: return DRWAV_ERROR; + #endif + #ifdef ENOTSOCK + case ENOTSOCK: return DRWAV_NOT_SOCKET; + #endif + #ifdef EDESTADDRREQ + case EDESTADDRREQ: return DRWAV_NO_ADDRESS; + #endif + #ifdef EMSGSIZE + case EMSGSIZE: return DRWAV_TOO_BIG; + #endif + #ifdef EPROTOTYPE + case EPROTOTYPE: return DRWAV_BAD_PROTOCOL; + #endif + #ifdef ENOPROTOOPT + case ENOPROTOOPT: return DRWAV_PROTOCOL_UNAVAILABLE; + #endif + #ifdef 
EPROTONOSUPPORT + case EPROTONOSUPPORT: return DRWAV_PROTOCOL_NOT_SUPPORTED; + #endif + #ifdef ESOCKTNOSUPPORT + case ESOCKTNOSUPPORT: return DRWAV_SOCKET_NOT_SUPPORTED; + #endif + #ifdef EOPNOTSUPP + case EOPNOTSUPP: return DRWAV_INVALID_OPERATION; + #endif + #ifdef EPFNOSUPPORT + case EPFNOSUPPORT: return DRWAV_PROTOCOL_FAMILY_NOT_SUPPORTED; + #endif + #ifdef EAFNOSUPPORT + case EAFNOSUPPORT: return DRWAV_ADDRESS_FAMILY_NOT_SUPPORTED; + #endif + #ifdef EADDRINUSE + case EADDRINUSE: return DRWAV_ALREADY_IN_USE; + #endif + #ifdef EADDRNOTAVAIL + case EADDRNOTAVAIL: return DRWAV_ERROR; + #endif + #ifdef ENETDOWN + case ENETDOWN: return DRWAV_NO_NETWORK; + #endif + #ifdef ENETUNREACH + case ENETUNREACH: return DRWAV_NO_NETWORK; + #endif + #ifdef ENETRESET + case ENETRESET: return DRWAV_NO_NETWORK; + #endif + #ifdef ECONNABORTED + case ECONNABORTED: return DRWAV_NO_NETWORK; + #endif + #ifdef ECONNRESET + case ECONNRESET: return DRWAV_CONNECTION_RESET; + #endif + #ifdef ENOBUFS + case ENOBUFS: return DRWAV_NO_SPACE; + #endif + #ifdef EISCONN + case EISCONN: return DRWAV_ALREADY_CONNECTED; + #endif + #ifdef ENOTCONN + case ENOTCONN: return DRWAV_NOT_CONNECTED; + #endif + #ifdef ESHUTDOWN + case ESHUTDOWN: return DRWAV_ERROR; + #endif + #ifdef ETOOMANYREFS + case ETOOMANYREFS: return DRWAV_ERROR; + #endif + #ifdef ETIMEDOUT + case ETIMEDOUT: return DRWAV_TIMEOUT; + #endif + #ifdef ECONNREFUSED + case ECONNREFUSED: return DRWAV_CONNECTION_REFUSED; + #endif + #ifdef EHOSTDOWN + case EHOSTDOWN: return DRWAV_NO_HOST; + #endif + #ifdef EHOSTUNREACH + case EHOSTUNREACH: return DRWAV_NO_HOST; + #endif + #ifdef EALREADY + case EALREADY: return DRWAV_IN_PROGRESS; + #endif + #ifdef EINPROGRESS + case EINPROGRESS: return DRWAV_IN_PROGRESS; + #endif + #ifdef ESTALE + case ESTALE: return DRWAV_INVALID_FILE; + #endif + #ifdef EUCLEAN + case EUCLEAN: return DRWAV_ERROR; + #endif + #ifdef ENOTNAM + case ENOTNAM: return DRWAV_ERROR; + #endif + #ifdef ENAVAIL + case ENAVAIL: return DRWAV_ERROR; + #endif + #ifdef EISNAM + case EISNAM: return DRWAV_ERROR; + #endif + #ifdef EREMOTEIO + case EREMOTEIO: return DRWAV_IO_ERROR; + #endif + #ifdef EDQUOT + case EDQUOT: return DRWAV_NO_SPACE; + #endif + #ifdef ENOMEDIUM + case ENOMEDIUM: return DRWAV_DOES_NOT_EXIST; + #endif + #ifdef EMEDIUMTYPE + case EMEDIUMTYPE: return DRWAV_ERROR; + #endif + #ifdef ECANCELED + case ECANCELED: return DRWAV_CANCELLED; + #endif + #ifdef ENOKEY + case ENOKEY: return DRWAV_ERROR; + #endif + #ifdef EKEYEXPIRED + case EKEYEXPIRED: return DRWAV_ERROR; + #endif + #ifdef EKEYREVOKED + case EKEYREVOKED: return DRWAV_ERROR; + #endif + #ifdef EKEYREJECTED + case EKEYREJECTED: return DRWAV_ERROR; + #endif + #ifdef EOWNERDEAD + case EOWNERDEAD: return DRWAV_ERROR; + #endif + #ifdef ENOTRECOVERABLE + case ENOTRECOVERABLE: return DRWAV_ERROR; + #endif + #ifdef ERFKILL + case ERFKILL: return DRWAV_ERROR; + #endif + #ifdef EHWPOISON + case EHWPOISON: return DRWAV_ERROR; + #endif + default: return DRWAV_ERROR; + } +} + +static drwav_result drwav_fopen(FILE** ppFile, const char* pFilePath, const char* pOpenMode) +{ +#if _MSC_VER && _MSC_VER >= 1400 + errno_t err; +#endif + + if (ppFile != NULL) { + *ppFile = NULL; /* Safety. 
*/ + } + + if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) { + return DRWAV_INVALID_ARGS; + } + +#if _MSC_VER && _MSC_VER >= 1400 + err = fopen_s(ppFile, pFilePath, pOpenMode); + if (err != 0) { + return drwav_result_from_errno(err); + } +#else +#if defined(_WIN32) || defined(__APPLE__) + *ppFile = fopen(pFilePath, pOpenMode); +#else + #if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64 && defined(_LARGEFILE64_SOURCE) + *ppFile = fopen64(pFilePath, pOpenMode); + #else + *ppFile = fopen(pFilePath, pOpenMode); + #endif +#endif + if (*ppFile == NULL) { + drwav_result result = drwav_result_from_errno(errno); + if (result == DRWAV_SUCCESS) { + result = DRWAV_ERROR; /* Just a safety check to make sure we never ever return success when pFile == NULL. */ + } + + return result; + } +#endif + + return DRWAV_SUCCESS; +} + +/* +_wfopen() isn't always available in all compilation environments. + + * Windows only. + * MSVC seems to support it universally as far back as VC6 from what I can tell (haven't checked further back). + * MinGW-64 (both 32- and 64-bit) seems to support it. + * MinGW wraps it in !defined(__STRICT_ANSI__). + * OpenWatcom wraps it in !defined(_NO_EXT_KEYS). + +This can be reviewed as compatibility issues arise. The preference is to use _wfopen_s() and _wfopen() as opposed to the wcsrtombs() +fallback, so if you notice your compiler not detecting this properly I'm happy to look at adding support. +*/ +#if defined(_WIN32) + #if defined(_MSC_VER) || defined(__MINGW64__) || (!defined(__STRICT_ANSI__) && !defined(_NO_EXT_KEYS)) + #define DRWAV_HAS_WFOPEN + #endif +#endif + +static drwav_result drwav_wfopen(FILE** ppFile, const wchar_t* pFilePath, const wchar_t* pOpenMode, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (ppFile != NULL) { + *ppFile = NULL; /* Safety. */ + } + + if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) { + return DRWAV_INVALID_ARGS; + } + +#if defined(DRWAV_HAS_WFOPEN) + { + /* Use _wfopen() on Windows. */ + #if defined(_MSC_VER) && _MSC_VER >= 1400 + errno_t err = _wfopen_s(ppFile, pFilePath, pOpenMode); + if (err != 0) { + return drwav_result_from_errno(err); + } + #else + *ppFile = _wfopen(pFilePath, pOpenMode); + if (*ppFile == NULL) { + return drwav_result_from_errno(errno); + } + #endif + (void)pAllocationCallbacks; + } +#else + /* + Use fopen() on anything other than Windows. Requires a conversion. This is annoying because fopen() is locale specific. The only real way I can + think of to do this is with wcsrtombs(). Note that wcstombs() is apparently not thread-safe because it uses a static global mbstate_t object for + maintaining state. I've checked this with -std=c89 and it works, but if somebody get's a compiler error I'll look into improving compatibility. + */ + { + mbstate_t mbs; + size_t lenMB; + const wchar_t* pFilePathTemp = pFilePath; + char* pFilePathMB = NULL; + char pOpenModeMB[32] = {0}; + + /* Get the length first. */ + DRWAV_ZERO_OBJECT(&mbs); + lenMB = wcsrtombs(NULL, &pFilePathTemp, 0, &mbs); + if (lenMB == (size_t)-1) { + return drwav_result_from_errno(errno); + } + + pFilePathMB = (char*)drwav__malloc_from_callbacks(lenMB + 1, pAllocationCallbacks); + if (pFilePathMB == NULL) { + return DRWAV_OUT_OF_MEMORY; + } + + pFilePathTemp = pFilePath; + DRWAV_ZERO_OBJECT(&mbs); + wcsrtombs(pFilePathMB, &pFilePathTemp, lenMB + 1, &mbs); + + /* The open mode should always consist of ASCII characters so we should be able to do a trivial conversion. 
*/ + { + size_t i = 0; + for (;;) { + if (pOpenMode[i] == 0) { + pOpenModeMB[i] = '\0'; + break; + } + + pOpenModeMB[i] = (char)pOpenMode[i]; + i += 1; + } + } + + *ppFile = fopen(pFilePathMB, pOpenModeMB); + + drwav__free_from_callbacks(pFilePathMB, pAllocationCallbacks); + } + + if (*ppFile == NULL) { + return DRWAV_ERROR; + } +#endif + + return DRWAV_SUCCESS; +} + + +static size_t drwav__on_read_stdio(void* pUserData, void* pBufferOut, size_t bytesToRead) +{ + return fread(pBufferOut, 1, bytesToRead, (FILE*)pUserData); +} + +static size_t drwav__on_write_stdio(void* pUserData, const void* pData, size_t bytesToWrite) +{ + return fwrite(pData, 1, bytesToWrite, (FILE*)pUserData); +} + +static drwav_bool32 drwav__on_seek_stdio(void* pUserData, int offset, drwav_seek_origin origin) +{ + return fseek((FILE*)pUserData, offset, (origin == drwav_seek_origin_current) ? SEEK_CUR : SEEK_SET) == 0; +} + +DRWAV_API drwav_bool32 drwav_init_file(drwav* pWav, const char* filename, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_ex(pWav, filename, NULL, NULL, 0, pAllocationCallbacks); +} + + +static drwav_bool32 drwav_init_file__internal_FILE(drwav* pWav, FILE* pFile, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav_bool32 result; + + result = drwav_preinit(pWav, drwav__on_read_stdio, drwav__on_seek_stdio, (void*)pFile, pAllocationCallbacks); + if (result != DRWAV_TRUE) { + fclose(pFile); + return result; + } + + result = drwav_init__internal(pWav, onChunk, pChunkUserData, flags); + if (result != DRWAV_TRUE) { + fclose(pFile); + return result; + } + + return DRWAV_TRUE; +} + +DRWAV_API drwav_bool32 drwav_init_file_ex(drwav* pWav, const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + FILE* pFile; + if (drwav_fopen(&pFile, filename, "rb") != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + /* This takes ownership of the FILE* object. */ + return drwav_init_file__internal_FILE(pWav, pFile, onChunk, pChunkUserData, flags, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_w(drwav* pWav, const wchar_t* filename, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_ex_w(pWav, filename, NULL, NULL, 0, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_ex_w(drwav* pWav, const wchar_t* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + FILE* pFile; + if (drwav_wfopen(&pFile, filename, L"rb", pAllocationCallbacks) != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + /* This takes ownership of the FILE* object. 
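Putting the stdio path together, a minimal read sketch. It assumes the usual single-header setup (dr_wav.h included, with DR_WAV_IMPLEMENTATION defined in one translation unit), a placeholder path "input.wav", a 16-bit PCM file small enough to load whole, and it uses drwav_read_pcm_frames(), which is defined further below.

#include <stdio.h>
#include <stdlib.h>

static int example_read_whole_file(void)
{
    drwav wav;
    drwav_int16* pSamples;
    drwav_uint64 framesRead;

    if (!drwav_init_file(&wav, "input.wav", NULL)) {
        return -1;                                    /* open or header parse failed */
    }
    if (wav.bitsPerSample != 16) {
        drwav_uninit(&wav);
        return -1;                                    /* this sketch only handles 16-bit PCM */
    }

    pSamples = (drwav_int16*)malloc((size_t)(wav.totalPCMFrameCount * wav.channels * sizeof(drwav_int16)));
    if (pSamples == NULL) {
        drwav_uninit(&wav);
        return -1;
    }

    framesRead = drwav_read_pcm_frames(&wav, wav.totalPCMFrameCount, pSamples);
    printf("read %llu of %llu frames\n", (unsigned long long)framesRead, (unsigned long long)wav.totalPCMFrameCount);

    free(pSamples);
    drwav_uninit(&wav);
    return 0;
}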
*/ + return drwav_init_file__internal_FILE(pWav, pFile, onChunk, pChunkUserData, flags, pAllocationCallbacks); +} + + +static drwav_bool32 drwav_init_file_write__internal_FILE(drwav* pWav, FILE* pFile, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav_bool32 result; + + result = drwav_preinit_write(pWav, pFormat, isSequential, drwav__on_write_stdio, drwav__on_seek_stdio, (void*)pFile, pAllocationCallbacks); + if (result != DRWAV_TRUE) { + fclose(pFile); + return result; + } + + result = drwav_init_write__internal(pWav, pFormat, totalSampleCount); + if (result != DRWAV_TRUE) { + fclose(pFile); + return result; + } + + return DRWAV_TRUE; +} + +static drwav_bool32 drwav_init_file_write__internal(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + FILE* pFile; + if (drwav_fopen(&pFile, filename, "wb") != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + /* This takes ownership of the FILE* object. */ + return drwav_init_file_write__internal_FILE(pWav, pFile, pFormat, totalSampleCount, isSequential, pAllocationCallbacks); +} + +static drwav_bool32 drwav_init_file_write_w__internal(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + FILE* pFile; + if (drwav_wfopen(&pFile, filename, L"wb", pAllocationCallbacks) != DRWAV_SUCCESS) { + return DRWAV_FALSE; + } + + /* This takes ownership of the FILE* object. */ + return drwav_init_file_write__internal_FILE(pWav, pFile, pFormat, totalSampleCount, isSequential, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_write(drwav* pWav, const char* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_write__internal(pWav, filename, pFormat, 0, DRWAV_FALSE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_write_sequential(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_write__internal(pWav, filename, pFormat, totalSampleCount, DRWAV_TRUE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pFormat == NULL) { + return DRWAV_FALSE; + } + + return drwav_init_file_write_sequential(pWav, filename, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_write_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_write_w__internal(pWav, filename, pFormat, 0, DRWAV_FALSE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_file_write_sequential_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_file_write_w__internal(pWav, filename, pFormat, totalSampleCount, DRWAV_TRUE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 
drwav_init_file_write_sequential_pcm_frames_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pFormat == NULL) { + return DRWAV_FALSE; + } + + return drwav_init_file_write_sequential_w(pWav, filename, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks); +} +#endif /* DR_WAV_NO_STDIO */ + + +static size_t drwav__on_read_memory(void* pUserData, void* pBufferOut, size_t bytesToRead) +{ + drwav* pWav = (drwav*)pUserData; + size_t bytesRemaining; + + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->memoryStream.dataSize >= pWav->memoryStream.currentReadPos); + + bytesRemaining = pWav->memoryStream.dataSize - pWav->memoryStream.currentReadPos; + if (bytesToRead > bytesRemaining) { + bytesToRead = bytesRemaining; + } + + if (bytesToRead > 0) { + DRWAV_COPY_MEMORY(pBufferOut, pWav->memoryStream.data + pWav->memoryStream.currentReadPos, bytesToRead); + pWav->memoryStream.currentReadPos += bytesToRead; + } + + return bytesToRead; +} + +static drwav_bool32 drwav__on_seek_memory(void* pUserData, int offset, drwav_seek_origin origin) +{ + drwav* pWav = (drwav*)pUserData; + DRWAV_ASSERT(pWav != NULL); + + if (origin == drwav_seek_origin_current) { + if (offset > 0) { + if (pWav->memoryStream.currentReadPos + offset > pWav->memoryStream.dataSize) { + return DRWAV_FALSE; /* Trying to seek too far forward. */ + } + } else { + if (pWav->memoryStream.currentReadPos < (size_t)-offset) { + return DRWAV_FALSE; /* Trying to seek too far backwards. */ + } + } + + /* This will never underflow thanks to the clamps above. */ + pWav->memoryStream.currentReadPos += offset; + } else { + if ((drwav_uint32)offset <= pWav->memoryStream.dataSize) { + pWav->memoryStream.currentReadPos = offset; + } else { + return DRWAV_FALSE; /* Trying to seek too far forward. */ + } + } + + return DRWAV_TRUE; +} + +static size_t drwav__on_write_memory(void* pUserData, const void* pDataIn, size_t bytesToWrite) +{ + drwav* pWav = (drwav*)pUserData; + size_t bytesRemaining; + + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(pWav->memoryStreamWrite.dataCapacity >= pWav->memoryStreamWrite.currentWritePos); + + bytesRemaining = pWav->memoryStreamWrite.dataCapacity - pWav->memoryStreamWrite.currentWritePos; + if (bytesRemaining < bytesToWrite) { + /* Need to reallocate. */ + void* pNewData; + size_t newDataCapacity = (pWav->memoryStreamWrite.dataCapacity == 0) ? 256 : pWav->memoryStreamWrite.dataCapacity * 2; + + /* If doubling wasn't enough, just make it the minimum required size to write the data. 
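The capacity growth rule used by the memory writer above can be restated in isolation. The helper below is hypothetical; it reproduces the policy of doubling (starting at 256) and, only when doubling is still not enough, growing to exactly the size the pending write needs.

#include <stddef.h>

static size_t example_next_capacity(size_t capacity, size_t writePos, size_t bytesToWrite)
{
    size_t next = (capacity == 0) ? 256 : capacity * 2;
    if ((next - writePos) < bytesToWrite) {
        next = writePos + bytesToWrite;
    }
    return next;   /* e.g. capacity 512, writePos 400, 5000-byte write -> 5400 */
}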
*/ + if ((newDataCapacity - pWav->memoryStreamWrite.currentWritePos) < bytesToWrite) { + newDataCapacity = pWav->memoryStreamWrite.currentWritePos + bytesToWrite; + } + + pNewData = drwav__realloc_from_callbacks(*pWav->memoryStreamWrite.ppData, newDataCapacity, pWav->memoryStreamWrite.dataCapacity, &pWav->allocationCallbacks); + if (pNewData == NULL) { + return 0; + } + + *pWav->memoryStreamWrite.ppData = pNewData; + pWav->memoryStreamWrite.dataCapacity = newDataCapacity; + } + + DRWAV_COPY_MEMORY(((drwav_uint8*)(*pWav->memoryStreamWrite.ppData)) + pWav->memoryStreamWrite.currentWritePos, pDataIn, bytesToWrite); + + pWav->memoryStreamWrite.currentWritePos += bytesToWrite; + if (pWav->memoryStreamWrite.dataSize < pWav->memoryStreamWrite.currentWritePos) { + pWav->memoryStreamWrite.dataSize = pWav->memoryStreamWrite.currentWritePos; + } + + *pWav->memoryStreamWrite.pDataSize = pWav->memoryStreamWrite.dataSize; + + return bytesToWrite; +} + +static drwav_bool32 drwav__on_seek_memory_write(void* pUserData, int offset, drwav_seek_origin origin) +{ + drwav* pWav = (drwav*)pUserData; + DRWAV_ASSERT(pWav != NULL); + + if (origin == drwav_seek_origin_current) { + if (offset > 0) { + if (pWav->memoryStreamWrite.currentWritePos + offset > pWav->memoryStreamWrite.dataSize) { + offset = (int)(pWav->memoryStreamWrite.dataSize - pWav->memoryStreamWrite.currentWritePos); /* Trying to seek too far forward. */ + } + } else { + if (pWav->memoryStreamWrite.currentWritePos < (size_t)-offset) { + offset = -(int)pWav->memoryStreamWrite.currentWritePos; /* Trying to seek too far backwards. */ + } + } + + /* This will never underflow thanks to the clamps above. */ + pWav->memoryStreamWrite.currentWritePos += offset; + } else { + if ((drwav_uint32)offset <= pWav->memoryStreamWrite.dataSize) { + pWav->memoryStreamWrite.currentWritePos = offset; + } else { + pWav->memoryStreamWrite.currentWritePos = pWav->memoryStreamWrite.dataSize; /* Trying to seek too far forward. */ + } + } + + return DRWAV_TRUE; +} + +DRWAV_API drwav_bool32 drwav_init_memory(drwav* pWav, const void* data, size_t dataSize, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_memory_ex(pWav, data, dataSize, NULL, NULL, 0, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_memory_ex(drwav* pWav, const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (data == NULL || dataSize == 0) { + return DRWAV_FALSE; + } + + if (!drwav_preinit(pWav, drwav__on_read_memory, drwav__on_seek_memory, pWav, pAllocationCallbacks)) { + return DRWAV_FALSE; + } + + pWav->memoryStream.data = (const drwav_uint8*)data; + pWav->memoryStream.dataSize = dataSize; + pWav->memoryStream.currentReadPos = 0; + + return drwav_init__internal(pWav, onChunk, pChunkUserData, flags); +} + + +static drwav_bool32 drwav_init_memory_write__internal(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (ppData == NULL || pDataSize == NULL) { + return DRWAV_FALSE; + } + + *ppData = NULL; /* Important because we're using realloc()! 
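A minimal sketch of the memory read path above, assuming pFileData/fileSize (placeholder names) already hold the complete WAV image and that dr_wav.h is available in the translation unit:

static drwav_bool32 example_open_from_memory(const void* pFileData, size_t fileSize)
{
    drwav wav;

    if (!drwav_init_memory(&wav, pFileData, fileSize, NULL)) {
        return DRWAV_FALSE;   /* not a parseable WAV image */
    }

    /* ... drwav_read_pcm_frames(&wav, ...) exactly as with a file-backed drwav ... */

    drwav_uninit(&wav);
    return DRWAV_TRUE;
}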
*/ + *pDataSize = 0; + + if (!drwav_preinit_write(pWav, pFormat, isSequential, drwav__on_write_memory, drwav__on_seek_memory_write, pWav, pAllocationCallbacks)) { + return DRWAV_FALSE; + } + + pWav->memoryStreamWrite.ppData = ppData; + pWav->memoryStreamWrite.pDataSize = pDataSize; + pWav->memoryStreamWrite.dataSize = 0; + pWav->memoryStreamWrite.dataCapacity = 0; + pWav->memoryStreamWrite.currentWritePos = 0; + + return drwav_init_write__internal(pWav, pFormat, totalSampleCount); +} + +DRWAV_API drwav_bool32 drwav_init_memory_write(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_memory_write__internal(pWav, ppData, pDataSize, pFormat, 0, DRWAV_FALSE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_memory_write_sequential(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + return drwav_init_memory_write__internal(pWav, ppData, pDataSize, pFormat, totalSampleCount, DRWAV_TRUE, pAllocationCallbacks); +} + +DRWAV_API drwav_bool32 drwav_init_memory_write_sequential_pcm_frames(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pFormat == NULL) { + return DRWAV_FALSE; + } + + return drwav_init_memory_write_sequential(pWav, ppData, pDataSize, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks); +} + + + +DRWAV_API drwav_result drwav_uninit(drwav* pWav) +{ + drwav_result result = DRWAV_SUCCESS; + + if (pWav == NULL) { + return DRWAV_INVALID_ARGS; + } + + /* + If the drwav object was opened in write mode we'll need to finalize a few things: + - Make sure the "data" chunk is aligned to 16-bits for RIFF containers, or 64 bits for W64 containers. + - Set the size of the "data" chunk. + */ + if (pWav->onWrite != NULL) { + drwav_uint32 paddingSize = 0; + + /* Padding. Do not adjust pWav->dataChunkDataSize - this should not include the padding. */ + if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) { + paddingSize = drwav__chunk_padding_size_riff(pWav->dataChunkDataSize); + } else { + paddingSize = drwav__chunk_padding_size_w64(pWav->dataChunkDataSize); + } + + if (paddingSize > 0) { + drwav_uint64 paddingData = 0; + drwav__write(pWav, &paddingData, paddingSize); /* Byte order does not matter for this. */ + } + + /* + Chunk sizes. When using sequential mode, these will have been filled in at initialization time. We only need + to do this when using non-sequential mode. + */ + if (pWav->onSeek && !pWav->isSequentialWrite) { + if (pWav->container == drwav_container_riff) { + /* The "RIFF" chunk size. */ + if (pWav->onSeek(pWav->pUserData, 4, drwav_seek_origin_start)) { + drwav_uint32 riffChunkSize = drwav__riff_chunk_size_riff(pWav->dataChunkDataSize); + drwav__write_u32ne_to_le(pWav, riffChunkSize); + } + + /* the "data" chunk size. */ + if (pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos + 4, drwav_seek_origin_start)) { + drwav_uint32 dataChunkSize = drwav__data_chunk_size_riff(pWav->dataChunkDataSize); + drwav__write_u32ne_to_le(pWav, dataChunkSize); + } + } else if (pWav->container == drwav_container_w64) { + /* The "RIFF" chunk size. 
*/ + if (pWav->onSeek(pWav->pUserData, 16, drwav_seek_origin_start)) { + drwav_uint64 riffChunkSize = drwav__riff_chunk_size_w64(pWav->dataChunkDataSize); + drwav__write_u64ne_to_le(pWav, riffChunkSize); + } + + /* The "data" chunk size. */ + if (pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos + 16, drwav_seek_origin_start)) { + drwav_uint64 dataChunkSize = drwav__data_chunk_size_w64(pWav->dataChunkDataSize); + drwav__write_u64ne_to_le(pWav, dataChunkSize); + } + } else if (pWav->container == drwav_container_rf64) { + /* We only need to update the ds64 chunk. The "RIFF" and "data" chunks always have their sizes set to 0xFFFFFFFF for RF64. */ + int ds64BodyPos = 12 + 8; + + /* The "RIFF" chunk size. */ + if (pWav->onSeek(pWav->pUserData, ds64BodyPos + 0, drwav_seek_origin_start)) { + drwav_uint64 riffChunkSize = drwav__riff_chunk_size_rf64(pWav->dataChunkDataSize); + drwav__write_u64ne_to_le(pWav, riffChunkSize); + } + + /* The "data" chunk size. */ + if (pWav->onSeek(pWav->pUserData, ds64BodyPos + 8, drwav_seek_origin_start)) { + drwav_uint64 dataChunkSize = drwav__data_chunk_size_rf64(pWav->dataChunkDataSize); + drwav__write_u64ne_to_le(pWav, dataChunkSize); + } + } + } + + /* Validation for sequential mode. */ + if (pWav->isSequentialWrite) { + if (pWav->dataChunkDataSize != pWav->dataChunkDataSizeTargetWrite) { + result = DRWAV_INVALID_FILE; + } + } + } + +#ifndef DR_WAV_NO_STDIO + /* + If we opened the file with drwav_open_file() we will want to close the file handle. We can know whether or not drwav_open_file() + was used by looking at the onRead and onSeek callbacks. + */ + if (pWav->onRead == drwav__on_read_stdio || pWav->onWrite == drwav__on_write_stdio) { + fclose((FILE*)pWav->pUserData); + } +#endif + + return result; +} + + + +DRWAV_API size_t drwav_read_raw(drwav* pWav, size_t bytesToRead, void* pBufferOut) +{ + size_t bytesRead; + + if (pWav == NULL || bytesToRead == 0) { + return 0; + } + + if (bytesToRead > pWav->bytesRemaining) { + bytesToRead = (size_t)pWav->bytesRemaining; + } + + if (pBufferOut != NULL) { + bytesRead = pWav->onRead(pWav->pUserData, pBufferOut, bytesToRead); + } else { + /* We need to seek. If we fail, we need to read-and-discard to make sure we get a good byte count. */ + bytesRead = 0; + while (bytesRead < bytesToRead) { + size_t bytesToSeek = (bytesToRead - bytesRead); + if (bytesToSeek > 0x7FFFFFFF) { + bytesToSeek = 0x7FFFFFFF; + } + + if (pWav->onSeek(pWav->pUserData, (int)bytesToSeek, drwav_seek_origin_current) == DRWAV_FALSE) { + break; + } + + bytesRead += bytesToSeek; + } + + /* When we get here we may need to read-and-discard some data. */ + while (bytesRead < bytesToRead) { + drwav_uint8 buffer[4096]; + size_t bytesSeeked; + size_t bytesToSeek = (bytesToRead - bytesRead); + if (bytesToSeek > sizeof(buffer)) { + bytesToSeek = sizeof(buffer); + } + + bytesSeeked = pWav->onRead(pWav->pUserData, buffer, bytesToSeek); + bytesRead += bytesSeeked; + + if (bytesSeeked < bytesToSeek) { + break; /* Reached the end. */ + } + } + } + + pWav->bytesRemaining -= bytesRead; + return bytesRead; +} + + + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_le(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut) +{ + drwav_uint32 bytesPerFrame; + drwav_uint64 bytesToRead; /* Intentionally uint64 instead of size_t so we can do a check that we're not reading too much on 32-bit builds. */ + + if (pWav == NULL || framesToRead == 0) { + return 0; + } + + /* Cannot use this function for compressed formats. 
*/ + if (drwav__is_compressed_format_tag(pWav->translatedFormatTag)) { + return 0; + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + /* Don't try to read more samples than can potentially fit in the output buffer. */ + bytesToRead = framesToRead * bytesPerFrame; + if (bytesToRead > DRWAV_SIZE_MAX) { + bytesToRead = (DRWAV_SIZE_MAX / bytesPerFrame) * bytesPerFrame; /* Round the number of bytes to read to a clean frame boundary. */ + } + + /* + Doing an explicit check here just to make it clear that we don't want to be attempt to read anything if there's no bytes to read. There + *could* be a time where it evaluates to 0 due to overflowing. + */ + if (bytesToRead == 0) { + return 0; + } + + return drwav_read_raw(pWav, (size_t)bytesToRead, pBufferOut) / bytesPerFrame; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_be(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_le(pWav, framesToRead, pBufferOut); + + if (pBufferOut != NULL) { + drwav__bswap_samples(pBufferOut, framesRead*pWav->channels, drwav_get_bytes_per_pcm_frame(pWav)/pWav->channels, pWav->translatedFormatTag); + } + + return framesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut) +{ + if (drwav__is_little_endian()) { + return drwav_read_pcm_frames_le(pWav, framesToRead, pBufferOut); + } else { + return drwav_read_pcm_frames_be(pWav, framesToRead, pBufferOut); + } +} + + + +DRWAV_API drwav_bool32 drwav_seek_to_first_pcm_frame(drwav* pWav) +{ + if (pWav->onWrite != NULL) { + return DRWAV_FALSE; /* No seeking in write mode. */ + } + + if (!pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos, drwav_seek_origin_start)) { + return DRWAV_FALSE; + } + + if (drwav__is_compressed_format_tag(pWav->translatedFormatTag)) { + pWav->compressed.iCurrentPCMFrame = 0; + + /* Cached data needs to be cleared for compressed formats. */ + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + DRWAV_ZERO_OBJECT(&pWav->msadpcm); + } else if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + DRWAV_ZERO_OBJECT(&pWav->ima); + } else { + DRWAV_ASSERT(DRWAV_FALSE); /* If this assertion is triggered it means I've implemented a new compressed format but forgot to add a branch for it here. */ + } + } + + pWav->bytesRemaining = pWav->dataChunkDataSize; + return DRWAV_TRUE; +} + +DRWAV_API drwav_bool32 drwav_seek_to_pcm_frame(drwav* pWav, drwav_uint64 targetFrameIndex) +{ + /* Seeking should be compatible with wave files > 2GB. */ + + if (pWav == NULL || pWav->onSeek == NULL) { + return DRWAV_FALSE; + } + + /* No seeking in write mode. */ + if (pWav->onWrite != NULL) { + return DRWAV_FALSE; + } + + /* If there are no samples, just return DRWAV_TRUE without doing anything. */ + if (pWav->totalPCMFrameCount == 0) { + return DRWAV_TRUE; + } + + /* Make sure the sample is clamped. */ + if (targetFrameIndex >= pWav->totalPCMFrameCount) { + targetFrameIndex = pWav->totalPCMFrameCount - 1; + } + + /* + For compressed formats we just use a slow generic seek. If we are seeking forward we just seek forward. If we are going backwards we need + to seek back to the start. + */ + if (drwav__is_compressed_format_tag(pWav->translatedFormatTag)) { + /* TODO: This can be optimized. */ + + /* + If we're seeking forward it's simple - just keep reading samples until we hit the sample we're requesting. 
If we're seeking backwards, + we first need to seek back to the start and then just do the same thing as a forward seek. + */ + if (targetFrameIndex < pWav->compressed.iCurrentPCMFrame) { + if (!drwav_seek_to_first_pcm_frame(pWav)) { + return DRWAV_FALSE; + } + } + + if (targetFrameIndex > pWav->compressed.iCurrentPCMFrame) { + drwav_uint64 offsetInFrames = targetFrameIndex - pWav->compressed.iCurrentPCMFrame; + + drwav_int16 devnull[2048]; + while (offsetInFrames > 0) { + drwav_uint64 framesRead = 0; + drwav_uint64 framesToRead = offsetInFrames; + if (framesToRead > drwav_countof(devnull)/pWav->channels) { + framesToRead = drwav_countof(devnull)/pWav->channels; + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + framesRead = drwav_read_pcm_frames_s16__msadpcm(pWav, framesToRead, devnull); + } else if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + framesRead = drwav_read_pcm_frames_s16__ima(pWav, framesToRead, devnull); + } else { + DRWAV_ASSERT(DRWAV_FALSE); /* If this assertion is triggered it means I've implemented a new compressed format but forgot to add a branch for it here. */ + } + + if (framesRead != framesToRead) { + return DRWAV_FALSE; + } + + offsetInFrames -= framesRead; + } + } + } else { + drwav_uint64 totalSizeInBytes; + drwav_uint64 currentBytePos; + drwav_uint64 targetBytePos; + drwav_uint64 offset; + + totalSizeInBytes = pWav->totalPCMFrameCount * drwav_get_bytes_per_pcm_frame(pWav); + DRWAV_ASSERT(totalSizeInBytes >= pWav->bytesRemaining); + + currentBytePos = totalSizeInBytes - pWav->bytesRemaining; + targetBytePos = targetFrameIndex * drwav_get_bytes_per_pcm_frame(pWav); + + if (currentBytePos < targetBytePos) { + /* Offset forwards. */ + offset = (targetBytePos - currentBytePos); + } else { + /* Offset backwards. */ + if (!drwav_seek_to_first_pcm_frame(pWav)) { + return DRWAV_FALSE; + } + offset = targetBytePos; + } + + while (offset > 0) { + int offset32 = ((offset > INT_MAX) ? INT_MAX : (int)offset); + if (!pWav->onSeek(pWav->pUserData, offset32, drwav_seek_origin_current)) { + return DRWAV_FALSE; + } + + pWav->bytesRemaining -= offset32; + offset -= offset32; + } + } + + return DRWAV_TRUE; +} + + +DRWAV_API size_t drwav_write_raw(drwav* pWav, size_t bytesToWrite, const void* pData) +{ + size_t bytesWritten; + + if (pWav == NULL || bytesToWrite == 0 || pData == NULL) { + return 0; + } + + bytesWritten = pWav->onWrite(pWav->pUserData, pData, bytesToWrite); + pWav->dataChunkDataSize += bytesWritten; + + return bytesWritten; +} + + +DRWAV_API drwav_uint64 drwav_write_pcm_frames_le(drwav* pWav, drwav_uint64 framesToWrite, const void* pData) +{ + drwav_uint64 bytesToWrite; + drwav_uint64 bytesWritten; + const drwav_uint8* pRunningData; + + if (pWav == NULL || framesToWrite == 0 || pData == NULL) { + return 0; + } + + bytesToWrite = ((framesToWrite * pWav->channels * pWav->bitsPerSample) / 8); + if (bytesToWrite > DRWAV_SIZE_MAX) { + return 0; + } + + bytesWritten = 0; + pRunningData = (const drwav_uint8*)pData; + + while (bytesToWrite > 0) { + size_t bytesJustWritten; + drwav_uint64 bytesToWriteThisIteration; + + bytesToWriteThisIteration = bytesToWrite; + DRWAV_ASSERT(bytesToWriteThisIteration <= DRWAV_SIZE_MAX); /* <-- This is checked above. 
*/ + + bytesJustWritten = drwav_write_raw(pWav, (size_t)bytesToWriteThisIteration, pRunningData); + if (bytesJustWritten == 0) { + break; + } + + bytesToWrite -= bytesJustWritten; + bytesWritten += bytesJustWritten; + pRunningData += bytesJustWritten; + } + + return (bytesWritten * 8) / pWav->bitsPerSample / pWav->channels; +} + +DRWAV_API drwav_uint64 drwav_write_pcm_frames_be(drwav* pWav, drwav_uint64 framesToWrite, const void* pData) +{ + drwav_uint64 bytesToWrite; + drwav_uint64 bytesWritten; + drwav_uint32 bytesPerSample; + const drwav_uint8* pRunningData; + + if (pWav == NULL || framesToWrite == 0 || pData == NULL) { + return 0; + } + + bytesToWrite = ((framesToWrite * pWav->channels * pWav->bitsPerSample) / 8); + if (bytesToWrite > DRWAV_SIZE_MAX) { + return 0; + } + + bytesWritten = 0; + pRunningData = (const drwav_uint8*)pData; + + bytesPerSample = drwav_get_bytes_per_pcm_frame(pWav) / pWav->channels; + + while (bytesToWrite > 0) { + drwav_uint8 temp[4096]; + drwav_uint32 sampleCount; + size_t bytesJustWritten; + drwav_uint64 bytesToWriteThisIteration; + + bytesToWriteThisIteration = bytesToWrite; + DRWAV_ASSERT(bytesToWriteThisIteration <= DRWAV_SIZE_MAX); /* <-- This is checked above. */ + + /* + WAV files are always little-endian. We need to byte swap on big-endian architectures. Since our input buffer is read-only we need + to use an intermediary buffer for the conversion. + */ + sampleCount = sizeof(temp)/bytesPerSample; + + if (bytesToWriteThisIteration > ((drwav_uint64)sampleCount)*bytesPerSample) { + bytesToWriteThisIteration = ((drwav_uint64)sampleCount)*bytesPerSample; + } + + DRWAV_COPY_MEMORY(temp, pRunningData, (size_t)bytesToWriteThisIteration); + drwav__bswap_samples(temp, sampleCount, bytesPerSample, pWav->translatedFormatTag); + + bytesJustWritten = drwav_write_raw(pWav, (size_t)bytesToWriteThisIteration, temp); + if (bytesJustWritten == 0) { + break; + } + + bytesToWrite -= bytesJustWritten; + bytesWritten += bytesJustWritten; + pRunningData += bytesJustWritten; + } + + return (bytesWritten * 8) / pWav->bitsPerSample / pWav->channels; +} + +DRWAV_API drwav_uint64 drwav_write_pcm_frames(drwav* pWav, drwav_uint64 framesToWrite, const void* pData) +{ + if (drwav__is_little_endian()) { + return drwav_write_pcm_frames_le(pWav, framesToWrite, pData); + } else { + return drwav_write_pcm_frames_be(pWav, framesToWrite, pData); + } +} + + +static drwav_uint64 drwav_read_pcm_frames_s16__msadpcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 totalFramesRead = 0; + + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(framesToRead > 0); + + /* TODO: Lots of room for optimization here. */ + + while (framesToRead > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) { + /* If there are no cached frames we need to load a new block. */ + if (pWav->msadpcm.cachedFrameCount == 0 && pWav->msadpcm.bytesRemainingInBlock == 0) { + if (pWav->channels == 1) { + /* Mono. 
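A mono MS-ADPCM block begins with a 7 byte header: the predictor index, the initial delta, and the two previous samples. Those two samples are also the first two decoded frames of the block, so they go straight into the cache.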
*/ + drwav_uint8 header[7]; + if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) { + return totalFramesRead; + } + pWav->msadpcm.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header); + + pWav->msadpcm.predictor[0] = header[0]; + pWav->msadpcm.delta[0] = drwav__bytes_to_s16(header + 1); + pWav->msadpcm.prevFrames[0][1] = (drwav_int32)drwav__bytes_to_s16(header + 3); + pWav->msadpcm.prevFrames[0][0] = (drwav_int32)drwav__bytes_to_s16(header + 5); + pWav->msadpcm.cachedFrames[2] = pWav->msadpcm.prevFrames[0][0]; + pWav->msadpcm.cachedFrames[3] = pWav->msadpcm.prevFrames[0][1]; + pWav->msadpcm.cachedFrameCount = 2; + } else { + /* Stereo. */ + drwav_uint8 header[14]; + if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) { + return totalFramesRead; + } + pWav->msadpcm.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header); + + pWav->msadpcm.predictor[0] = header[0]; + pWav->msadpcm.predictor[1] = header[1]; + pWav->msadpcm.delta[0] = drwav__bytes_to_s16(header + 2); + pWav->msadpcm.delta[1] = drwav__bytes_to_s16(header + 4); + pWav->msadpcm.prevFrames[0][1] = (drwav_int32)drwav__bytes_to_s16(header + 6); + pWav->msadpcm.prevFrames[1][1] = (drwav_int32)drwav__bytes_to_s16(header + 8); + pWav->msadpcm.prevFrames[0][0] = (drwav_int32)drwav__bytes_to_s16(header + 10); + pWav->msadpcm.prevFrames[1][0] = (drwav_int32)drwav__bytes_to_s16(header + 12); + + pWav->msadpcm.cachedFrames[0] = pWav->msadpcm.prevFrames[0][0]; + pWav->msadpcm.cachedFrames[1] = pWav->msadpcm.prevFrames[1][0]; + pWav->msadpcm.cachedFrames[2] = pWav->msadpcm.prevFrames[0][1]; + pWav->msadpcm.cachedFrames[3] = pWav->msadpcm.prevFrames[1][1]; + pWav->msadpcm.cachedFrameCount = 2; + } + } + + /* Output anything that's cached. */ + while (framesToRead > 0 && pWav->msadpcm.cachedFrameCount > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) { + if (pBufferOut != NULL) { + drwav_uint32 iSample = 0; + for (iSample = 0; iSample < pWav->channels; iSample += 1) { + pBufferOut[iSample] = (drwav_int16)pWav->msadpcm.cachedFrames[(drwav_countof(pWav->msadpcm.cachedFrames) - (pWav->msadpcm.cachedFrameCount*pWav->channels)) + iSample]; + } + + pBufferOut += pWav->channels; + } + + framesToRead -= 1; + totalFramesRead += 1; + pWav->compressed.iCurrentPCMFrame += 1; + pWav->msadpcm.cachedFrameCount -= 1; + } + + if (framesToRead == 0) { + return totalFramesRead; + } + + + /* + If there's nothing left in the cache, just go ahead and load more. If there's nothing left to load in the current block we just continue to the next + loop iteration which will trigger the loading of a new block. + */ + if (pWav->msadpcm.cachedFrameCount == 0) { + if (pWav->msadpcm.bytesRemainingInBlock == 0) { + continue; + } else { + static drwav_int32 adaptationTable[] = { + 230, 230, 230, 230, 307, 409, 512, 614, + 768, 614, 512, 409, 307, 230, 230, 230 + }; + static drwav_int32 coeff1Table[] = { 256, 512, 0, 192, 240, 460, 392 }; + static drwav_int32 coeff2Table[] = { 0, -256, 0, 64, 0, -208, -232 }; + + drwav_uint8 nibbles; + drwav_int32 nibble0; + drwav_int32 nibble1; + + if (pWav->onRead(pWav->pUserData, &nibbles, 1) != 1) { + return totalFramesRead; + } + pWav->msadpcm.bytesRemainingInBlock -= 1; + + /* TODO: Optimize away these if statements. */ + nibble0 = ((nibbles & 0xF0) >> 4); if ((nibbles & 0x80)) { nibble0 |= 0xFFFFFFF0UL; } + nibble1 = ((nibbles & 0x0F) >> 0); if ((nibbles & 0x08)) { nibble1 |= 0xFFFFFFF0UL; } + + if (pWav->channels == 1) { + /* Mono. 
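Each byte holds two 4 bit codes, and in mono both belong to the same channel. Each code is expanded by predicting from the two previous samples with the selected coefficient pair, adding code times delta, clamping to 16 bits, and then adapting the delta for the next code.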
*/ + drwav_int32 newSample0; + drwav_int32 newSample1; + + newSample0 = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8; + newSample0 += nibble0 * pWav->msadpcm.delta[0]; + newSample0 = drwav_clamp(newSample0, -32768, 32767); + + pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8; + if (pWav->msadpcm.delta[0] < 16) { + pWav->msadpcm.delta[0] = 16; + } + + pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1]; + pWav->msadpcm.prevFrames[0][1] = newSample0; + + + newSample1 = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8; + newSample1 += nibble1 * pWav->msadpcm.delta[0]; + newSample1 = drwav_clamp(newSample1, -32768, 32767); + + pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[0]) >> 8; + if (pWav->msadpcm.delta[0] < 16) { + pWav->msadpcm.delta[0] = 16; + } + + pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1]; + pWav->msadpcm.prevFrames[0][1] = newSample1; + + + pWav->msadpcm.cachedFrames[2] = newSample0; + pWav->msadpcm.cachedFrames[3] = newSample1; + pWav->msadpcm.cachedFrameCount = 2; + } else { + /* Stereo. */ + drwav_int32 newSample0; + drwav_int32 newSample1; + + /* Left. */ + newSample0 = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8; + newSample0 += nibble0 * pWav->msadpcm.delta[0]; + newSample0 = drwav_clamp(newSample0, -32768, 32767); + + pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8; + if (pWav->msadpcm.delta[0] < 16) { + pWav->msadpcm.delta[0] = 16; + } + + pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1]; + pWav->msadpcm.prevFrames[0][1] = newSample0; + + + /* Right. 
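Same decode as the left channel, but using the second channel's predictor, delta and sample history. In stereo the high nibble is the left sample and the low nibble is the right sample of a single frame.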
*/ + newSample1 = ((pWav->msadpcm.prevFrames[1][1] * coeff1Table[pWav->msadpcm.predictor[1]]) + (pWav->msadpcm.prevFrames[1][0] * coeff2Table[pWav->msadpcm.predictor[1]])) >> 8; + newSample1 += nibble1 * pWav->msadpcm.delta[1]; + newSample1 = drwav_clamp(newSample1, -32768, 32767); + + pWav->msadpcm.delta[1] = (adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[1]) >> 8; + if (pWav->msadpcm.delta[1] < 16) { + pWav->msadpcm.delta[1] = 16; + } + + pWav->msadpcm.prevFrames[1][0] = pWav->msadpcm.prevFrames[1][1]; + pWav->msadpcm.prevFrames[1][1] = newSample1; + + pWav->msadpcm.cachedFrames[2] = newSample0; + pWav->msadpcm.cachedFrames[3] = newSample1; + pWav->msadpcm.cachedFrameCount = 1; + } + } + } + } + + return totalFramesRead; +} + + +static drwav_uint64 drwav_read_pcm_frames_s16__ima(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 totalFramesRead = 0; + drwav_uint32 iChannel; + + static drwav_int32 indexTable[16] = { + -1, -1, -1, -1, 2, 4, 6, 8, + -1, -1, -1, -1, 2, 4, 6, 8 + }; + + static drwav_int32 stepTable[89] = { + 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, + 19, 21, 23, 25, 28, 31, 34, 37, 41, 45, + 50, 55, 60, 66, 73, 80, 88, 97, 107, 118, + 130, 143, 157, 173, 190, 209, 230, 253, 279, 307, + 337, 371, 408, 449, 494, 544, 598, 658, 724, 796, + 876, 963, 1060, 1166, 1282, 1411, 1552, 1707, 1878, 2066, + 2272, 2499, 2749, 3024, 3327, 3660, 4026, 4428, 4871, 5358, + 5894, 6484, 7132, 7845, 8630, 9493, 10442, 11487, 12635, 13899, + 15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794, 32767 + }; + + DRWAV_ASSERT(pWav != NULL); + DRWAV_ASSERT(framesToRead > 0); + + /* TODO: Lots of room for optimization here. */ + + while (framesToRead > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) { + /* If there are no cached samples we need to load a new block. */ + if (pWav->ima.cachedFrameCount == 0 && pWav->ima.bytesRemainingInBlock == 0) { + if (pWav->channels == 1) { + /* Mono. */ + drwav_uint8 header[4]; + if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) { + return totalFramesRead; + } + pWav->ima.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header); + + if (header[2] >= drwav_countof(stepTable)) { + pWav->onSeek(pWav->pUserData, pWav->ima.bytesRemainingInBlock, drwav_seek_origin_current); + pWav->ima.bytesRemainingInBlock = 0; + return totalFramesRead; /* Invalid data. */ + } + + pWav->ima.predictor[0] = drwav__bytes_to_s16(header + 0); + pWav->ima.stepIndex[0] = header[2]; + pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 1] = pWav->ima.predictor[0]; + pWav->ima.cachedFrameCount = 1; + } else { + /* Stereo. */ + drwav_uint8 header[8]; + if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) { + return totalFramesRead; + } + pWav->ima.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header); + + if (header[2] >= drwav_countof(stepTable) || header[6] >= drwav_countof(stepTable)) { + pWav->onSeek(pWav->pUserData, pWav->ima.bytesRemainingInBlock, drwav_seek_origin_current); + pWav->ima.bytesRemainingInBlock = 0; + return totalFramesRead; /* Invalid data. 
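The step index read from the block header is outside the step table, so skip the remainder of the block and return whatever frames have been decoded so far.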
*/ + } + + pWav->ima.predictor[0] = drwav__bytes_to_s16(header + 0); + pWav->ima.stepIndex[0] = header[2]; + pWav->ima.predictor[1] = drwav__bytes_to_s16(header + 4); + pWav->ima.stepIndex[1] = header[6]; + + pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 2] = pWav->ima.predictor[0]; + pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 1] = pWav->ima.predictor[1]; + pWav->ima.cachedFrameCount = 1; + } + } + + /* Output anything that's cached. */ + while (framesToRead > 0 && pWav->ima.cachedFrameCount > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) { + if (pBufferOut != NULL) { + drwav_uint32 iSample; + for (iSample = 0; iSample < pWav->channels; iSample += 1) { + pBufferOut[iSample] = (drwav_int16)pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + iSample]; + } + pBufferOut += pWav->channels; + } + + framesToRead -= 1; + totalFramesRead += 1; + pWav->compressed.iCurrentPCMFrame += 1; + pWav->ima.cachedFrameCount -= 1; + } + + if (framesToRead == 0) { + return totalFramesRead; + } + + /* + If there's nothing left in the cache, just go ahead and load more. If there's nothing left to load in the current block we just continue to the next + loop iteration which will trigger the loading of a new block. + */ + if (pWav->ima.cachedFrameCount == 0) { + if (pWav->ima.bytesRemainingInBlock == 0) { + continue; + } else { + /* + From what I can tell with stereo streams, it looks like every 4 bytes (8 samples) is for one channel. So it goes 4 bytes for the + left channel, 4 bytes for the right channel. + */ + pWav->ima.cachedFrameCount = 8; + for (iChannel = 0; iChannel < pWav->channels; ++iChannel) { + drwav_uint32 iByte; + drwav_uint8 nibbles[4]; + if (pWav->onRead(pWav->pUserData, &nibbles, 4) != 4) { + pWav->ima.cachedFrameCount = 0; + return totalFramesRead; + } + pWav->ima.bytesRemainingInBlock -= 4; + + for (iByte = 0; iByte < 4; ++iByte) { + drwav_uint8 nibble0 = ((nibbles[iByte] & 0x0F) >> 0); + drwav_uint8 nibble1 = ((nibbles[iByte] & 0xF0) >> 4); + + drwav_int32 step = stepTable[pWav->ima.stepIndex[iChannel]]; + drwav_int32 predictor = pWav->ima.predictor[iChannel]; + + drwav_int32 diff = step >> 3; + if (nibble0 & 1) diff += step >> 2; + if (nibble0 & 2) diff += step >> 1; + if (nibble0 & 4) diff += step; + if (nibble0 & 8) diff = -diff; + + predictor = drwav_clamp(predictor + diff, -32768, 32767); + pWav->ima.predictor[iChannel] = predictor; + pWav->ima.stepIndex[iChannel] = drwav_clamp(pWav->ima.stepIndex[iChannel] + indexTable[nibble0], 0, (drwav_int32)drwav_countof(stepTable)-1); + pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + (iByte*2+0)*pWav->channels + iChannel] = predictor; + + + step = stepTable[pWav->ima.stepIndex[iChannel]]; + predictor = pWav->ima.predictor[iChannel]; + + diff = step >> 3; + if (nibble1 & 1) diff += step >> 2; + if (nibble1 & 2) diff += step >> 1; + if (nibble1 & 4) diff += step; + if (nibble1 & 8) diff = -diff; + + predictor = drwav_clamp(predictor + diff, -32768, 32767); + pWav->ima.predictor[iChannel] = predictor; + pWav->ima.stepIndex[iChannel] = drwav_clamp(pWav->ima.stepIndex[iChannel] + indexTable[nibble1], 0, (drwav_int32)drwav_countof(stepTable)-1); + pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + (iByte*2+1)*pWav->channels + iChannel] = predictor; + } + } + } + } + } + + return totalFramesRead; +} + + +#ifndef 
DR_WAV_NO_CONVERSION_API +static unsigned short g_drwavAlawTable[256] = { + 0xEA80, 0xEB80, 0xE880, 0xE980, 0xEE80, 0xEF80, 0xEC80, 0xED80, 0xE280, 0xE380, 0xE080, 0xE180, 0xE680, 0xE780, 0xE480, 0xE580, + 0xF540, 0xF5C0, 0xF440, 0xF4C0, 0xF740, 0xF7C0, 0xF640, 0xF6C0, 0xF140, 0xF1C0, 0xF040, 0xF0C0, 0xF340, 0xF3C0, 0xF240, 0xF2C0, + 0xAA00, 0xAE00, 0xA200, 0xA600, 0xBA00, 0xBE00, 0xB200, 0xB600, 0x8A00, 0x8E00, 0x8200, 0x8600, 0x9A00, 0x9E00, 0x9200, 0x9600, + 0xD500, 0xD700, 0xD100, 0xD300, 0xDD00, 0xDF00, 0xD900, 0xDB00, 0xC500, 0xC700, 0xC100, 0xC300, 0xCD00, 0xCF00, 0xC900, 0xCB00, + 0xFEA8, 0xFEB8, 0xFE88, 0xFE98, 0xFEE8, 0xFEF8, 0xFEC8, 0xFED8, 0xFE28, 0xFE38, 0xFE08, 0xFE18, 0xFE68, 0xFE78, 0xFE48, 0xFE58, + 0xFFA8, 0xFFB8, 0xFF88, 0xFF98, 0xFFE8, 0xFFF8, 0xFFC8, 0xFFD8, 0xFF28, 0xFF38, 0xFF08, 0xFF18, 0xFF68, 0xFF78, 0xFF48, 0xFF58, + 0xFAA0, 0xFAE0, 0xFA20, 0xFA60, 0xFBA0, 0xFBE0, 0xFB20, 0xFB60, 0xF8A0, 0xF8E0, 0xF820, 0xF860, 0xF9A0, 0xF9E0, 0xF920, 0xF960, + 0xFD50, 0xFD70, 0xFD10, 0xFD30, 0xFDD0, 0xFDF0, 0xFD90, 0xFDB0, 0xFC50, 0xFC70, 0xFC10, 0xFC30, 0xFCD0, 0xFCF0, 0xFC90, 0xFCB0, + 0x1580, 0x1480, 0x1780, 0x1680, 0x1180, 0x1080, 0x1380, 0x1280, 0x1D80, 0x1C80, 0x1F80, 0x1E80, 0x1980, 0x1880, 0x1B80, 0x1A80, + 0x0AC0, 0x0A40, 0x0BC0, 0x0B40, 0x08C0, 0x0840, 0x09C0, 0x0940, 0x0EC0, 0x0E40, 0x0FC0, 0x0F40, 0x0CC0, 0x0C40, 0x0DC0, 0x0D40, + 0x5600, 0x5200, 0x5E00, 0x5A00, 0x4600, 0x4200, 0x4E00, 0x4A00, 0x7600, 0x7200, 0x7E00, 0x7A00, 0x6600, 0x6200, 0x6E00, 0x6A00, + 0x2B00, 0x2900, 0x2F00, 0x2D00, 0x2300, 0x2100, 0x2700, 0x2500, 0x3B00, 0x3900, 0x3F00, 0x3D00, 0x3300, 0x3100, 0x3700, 0x3500, + 0x0158, 0x0148, 0x0178, 0x0168, 0x0118, 0x0108, 0x0138, 0x0128, 0x01D8, 0x01C8, 0x01F8, 0x01E8, 0x0198, 0x0188, 0x01B8, 0x01A8, + 0x0058, 0x0048, 0x0078, 0x0068, 0x0018, 0x0008, 0x0038, 0x0028, 0x00D8, 0x00C8, 0x00F8, 0x00E8, 0x0098, 0x0088, 0x00B8, 0x00A8, + 0x0560, 0x0520, 0x05E0, 0x05A0, 0x0460, 0x0420, 0x04E0, 0x04A0, 0x0760, 0x0720, 0x07E0, 0x07A0, 0x0660, 0x0620, 0x06E0, 0x06A0, + 0x02B0, 0x0290, 0x02F0, 0x02D0, 0x0230, 0x0210, 0x0270, 0x0250, 0x03B0, 0x0390, 0x03F0, 0x03D0, 0x0330, 0x0310, 0x0370, 0x0350 +}; + +static unsigned short g_drwavMulawTable[256] = { + 0x8284, 0x8684, 0x8A84, 0x8E84, 0x9284, 0x9684, 0x9A84, 0x9E84, 0xA284, 0xA684, 0xAA84, 0xAE84, 0xB284, 0xB684, 0xBA84, 0xBE84, + 0xC184, 0xC384, 0xC584, 0xC784, 0xC984, 0xCB84, 0xCD84, 0xCF84, 0xD184, 0xD384, 0xD584, 0xD784, 0xD984, 0xDB84, 0xDD84, 0xDF84, + 0xE104, 0xE204, 0xE304, 0xE404, 0xE504, 0xE604, 0xE704, 0xE804, 0xE904, 0xEA04, 0xEB04, 0xEC04, 0xED04, 0xEE04, 0xEF04, 0xF004, + 0xF0C4, 0xF144, 0xF1C4, 0xF244, 0xF2C4, 0xF344, 0xF3C4, 0xF444, 0xF4C4, 0xF544, 0xF5C4, 0xF644, 0xF6C4, 0xF744, 0xF7C4, 0xF844, + 0xF8A4, 0xF8E4, 0xF924, 0xF964, 0xF9A4, 0xF9E4, 0xFA24, 0xFA64, 0xFAA4, 0xFAE4, 0xFB24, 0xFB64, 0xFBA4, 0xFBE4, 0xFC24, 0xFC64, + 0xFC94, 0xFCB4, 0xFCD4, 0xFCF4, 0xFD14, 0xFD34, 0xFD54, 0xFD74, 0xFD94, 0xFDB4, 0xFDD4, 0xFDF4, 0xFE14, 0xFE34, 0xFE54, 0xFE74, + 0xFE8C, 0xFE9C, 0xFEAC, 0xFEBC, 0xFECC, 0xFEDC, 0xFEEC, 0xFEFC, 0xFF0C, 0xFF1C, 0xFF2C, 0xFF3C, 0xFF4C, 0xFF5C, 0xFF6C, 0xFF7C, + 0xFF88, 0xFF90, 0xFF98, 0xFFA0, 0xFFA8, 0xFFB0, 0xFFB8, 0xFFC0, 0xFFC8, 0xFFD0, 0xFFD8, 0xFFE0, 0xFFE8, 0xFFF0, 0xFFF8, 0x0000, + 0x7D7C, 0x797C, 0x757C, 0x717C, 0x6D7C, 0x697C, 0x657C, 0x617C, 0x5D7C, 0x597C, 0x557C, 0x517C, 0x4D7C, 0x497C, 0x457C, 0x417C, + 0x3E7C, 0x3C7C, 0x3A7C, 0x387C, 0x367C, 0x347C, 0x327C, 0x307C, 0x2E7C, 0x2C7C, 0x2A7C, 0x287C, 0x267C, 0x247C, 0x227C, 0x207C, + 0x1EFC, 0x1DFC, 0x1CFC, 0x1BFC, 0x1AFC, 
0x19FC, 0x18FC, 0x17FC, 0x16FC, 0x15FC, 0x14FC, 0x13FC, 0x12FC, 0x11FC, 0x10FC, 0x0FFC, + 0x0F3C, 0x0EBC, 0x0E3C, 0x0DBC, 0x0D3C, 0x0CBC, 0x0C3C, 0x0BBC, 0x0B3C, 0x0ABC, 0x0A3C, 0x09BC, 0x093C, 0x08BC, 0x083C, 0x07BC, + 0x075C, 0x071C, 0x06DC, 0x069C, 0x065C, 0x061C, 0x05DC, 0x059C, 0x055C, 0x051C, 0x04DC, 0x049C, 0x045C, 0x041C, 0x03DC, 0x039C, + 0x036C, 0x034C, 0x032C, 0x030C, 0x02EC, 0x02CC, 0x02AC, 0x028C, 0x026C, 0x024C, 0x022C, 0x020C, 0x01EC, 0x01CC, 0x01AC, 0x018C, + 0x0174, 0x0164, 0x0154, 0x0144, 0x0134, 0x0124, 0x0114, 0x0104, 0x00F4, 0x00E4, 0x00D4, 0x00C4, 0x00B4, 0x00A4, 0x0094, 0x0084, + 0x0078, 0x0070, 0x0068, 0x0060, 0x0058, 0x0050, 0x0048, 0x0040, 0x0038, 0x0030, 0x0028, 0x0020, 0x0018, 0x0010, 0x0008, 0x0000 +}; + +static DRWAV_INLINE drwav_int16 drwav__alaw_to_s16(drwav_uint8 sampleIn) +{ + return (short)g_drwavAlawTable[sampleIn]; +} + +static DRWAV_INLINE drwav_int16 drwav__mulaw_to_s16(drwav_uint8 sampleIn) +{ + return (short)g_drwavMulawTable[sampleIn]; +} + + + +static void drwav__pcm_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample) +{ + unsigned int i; + + /* Special case for 8-bit sample data because it's treated as unsigned. */ + if (bytesPerSample == 1) { + drwav_u8_to_s16(pOut, pIn, totalSampleCount); + return; + } + + + /* Slightly more optimal implementation for common formats. */ + if (bytesPerSample == 2) { + for (i = 0; i < totalSampleCount; ++i) { + *pOut++ = ((const drwav_int16*)pIn)[i]; + } + return; + } + if (bytesPerSample == 3) { + drwav_s24_to_s16(pOut, pIn, totalSampleCount); + return; + } + if (bytesPerSample == 4) { + drwav_s32_to_s16(pOut, (const drwav_int32*)pIn, totalSampleCount); + return; + } + + + /* Anything more than 64 bits per sample is not supported. */ + if (bytesPerSample > 8) { + DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut)); + return; + } + + + /* Generic, slow converter. */ + for (i = 0; i < totalSampleCount; ++i) { + drwav_uint64 sample = 0; + unsigned int shift = (8 - bytesPerSample) * 8; + + unsigned int j; + for (j = 0; j < bytesPerSample; j += 1) { + DRWAV_ASSERT(j < 8); + sample |= (drwav_uint64)(pIn[j]) << shift; + shift += 8; + } + + pIn += j; + *pOut++ = (drwav_int16)((drwav_int64)sample >> 48); + } +} + +static void drwav__ieee_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample) +{ + if (bytesPerSample == 4) { + drwav_f32_to_s16(pOut, (const float*)pIn, totalSampleCount); + return; + } else if (bytesPerSample == 8) { + drwav_f64_to_s16(pOut, (const double*)pIn, totalSampleCount); + return; + } else { + /* Only supporting 32- and 64-bit float. Output silence in all other cases. Contributions welcome for 16-bit float. */ + DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut)); + return; + } +} + +static drwav_uint64 drwav_read_pcm_frames_s16__pcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint32 bytesPerFrame; + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + /* Fast path. 
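If the source is already 16 bit PCM there is nothing to convert, and if no output buffer was supplied we are only skipping frames, so in either case we can go straight through drwav_read_pcm_frames().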
*/ + if ((pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 16) || pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__pcm_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s16__ieee(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame; + + if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__ieee_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s16__alaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame; + + if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_alaw_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s16__mulaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame; + + if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_mulaw_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + if (pWav == NULL || framesToRead == 0) { + return 0; + } + + 
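/* Everything below decodes to signed 16 bit PCM. A NULL output buffer just skips frames; otherwise we dispatch on the translated format tag to a per format helper that converts to 16 bit as it reads. */ + + 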
if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + /* Don't try to read more samples than can potentially fit in the output buffer. */ + if (framesToRead * pWav->channels * sizeof(drwav_int16) > DRWAV_SIZE_MAX) { + framesToRead = DRWAV_SIZE_MAX / sizeof(drwav_int16) / pWav->channels; + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) { + return drwav_read_pcm_frames_s16__pcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) { + return drwav_read_pcm_frames_s16__ieee(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) { + return drwav_read_pcm_frames_s16__alaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) { + return drwav_read_pcm_frames_s16__mulaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + return drwav_read_pcm_frames_s16__msadpcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + return drwav_read_pcm_frames_s16__ima(pWav, framesToRead, pBufferOut); + } + + return 0; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16le(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) { + drwav__bswap_samples_s16(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16be(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) { + drwav__bswap_samples_s16(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + + +DRWAV_API void drwav_u8_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + int r; + size_t i; + for (i = 0; i < sampleCount; ++i) { + int x = pIn[i]; + r = x << 8; + r = r - 32768; + pOut[i] = (short)r; + } +} + +DRWAV_API void drwav_s24_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + int r; + size_t i; + for (i = 0; i < sampleCount; ++i) { + int x = ((int)(((unsigned int)(((const drwav_uint8*)pIn)[i*3+0]) << 8) | ((unsigned int)(((const drwav_uint8*)pIn)[i*3+1]) << 16) | ((unsigned int)(((const drwav_uint8*)pIn)[i*3+2])) << 24)) >> 8; + r = x >> 8; + pOut[i] = (short)r; + } +} + +DRWAV_API void drwav_s32_to_s16(drwav_int16* pOut, const drwav_int32* pIn, size_t sampleCount) +{ + int r; + size_t i; + for (i = 0; i < sampleCount; ++i) { + int x = pIn[i]; + r = x >> 16; + pOut[i] = (short)r; + } +} + +DRWAV_API void drwav_f32_to_s16(drwav_int16* pOut, const float* pIn, size_t sampleCount) +{ + int r; + size_t i; + for (i = 0; i < sampleCount; ++i) { + float x = pIn[i]; + float c; + c = ((x < -1) ? -1 : ((x > 1) ? 1 : x)); + c = c + 1; + r = (int)(c * 32767.5f); + r = r - 32768; + pOut[i] = (short)r; + } +} + +DRWAV_API void drwav_f64_to_s16(drwav_int16* pOut, const double* pIn, size_t sampleCount) +{ + int r; + size_t i; + for (i = 0; i < sampleCount; ++i) { + double x = pIn[i]; + double c; + c = ((x < -1) ? -1 : ((x > 1) ? 
1 : x)); + c = c + 1; + r = (int)(c * 32767.5); + r = r - 32768; + pOut[i] = (short)r; + } +} + +DRWAV_API void drwav_alaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + for (i = 0; i < sampleCount; ++i) { + pOut[i] = drwav__alaw_to_s16(pIn[i]); + } +} + +DRWAV_API void drwav_mulaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + for (i = 0; i < sampleCount; ++i) { + pOut[i] = drwav__mulaw_to_s16(pIn[i]); + } +} + + + +static void drwav__pcm_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount, unsigned int bytesPerSample) +{ + unsigned int i; + + /* Special case for 8-bit sample data because it's treated as unsigned. */ + if (bytesPerSample == 1) { + drwav_u8_to_f32(pOut, pIn, sampleCount); + return; + } + + /* Slightly more optimal implementation for common formats. */ + if (bytesPerSample == 2) { + drwav_s16_to_f32(pOut, (const drwav_int16*)pIn, sampleCount); + return; + } + if (bytesPerSample == 3) { + drwav_s24_to_f32(pOut, pIn, sampleCount); + return; + } + if (bytesPerSample == 4) { + drwav_s32_to_f32(pOut, (const drwav_int32*)pIn, sampleCount); + return; + } + + + /* Anything more than 64 bits per sample is not supported. */ + if (bytesPerSample > 8) { + DRWAV_ZERO_MEMORY(pOut, sampleCount * sizeof(*pOut)); + return; + } + + + /* Generic, slow converter. */ + for (i = 0; i < sampleCount; ++i) { + drwav_uint64 sample = 0; + unsigned int shift = (8 - bytesPerSample) * 8; + + unsigned int j; + for (j = 0; j < bytesPerSample; j += 1) { + DRWAV_ASSERT(j < 8); + sample |= (drwav_uint64)(pIn[j]) << shift; + shift += 8; + } + + pIn += j; + *pOut++ = (float)((drwav_int64)sample / 9223372036854775807.0); + } +} + +static void drwav__ieee_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount, unsigned int bytesPerSample) +{ + if (bytesPerSample == 4) { + unsigned int i; + for (i = 0; i < sampleCount; ++i) { + *pOut++ = ((const float*)pIn)[i]; + } + return; + } else if (bytesPerSample == 8) { + drwav_f64_to_f32(pOut, (const double*)pIn, sampleCount); + return; + } else { + /* Only supporting 32- and 64-bit float. Output silence in all other cases. Contributions welcome for 16-bit float. */ + DRWAV_ZERO_MEMORY(pOut, sampleCount * sizeof(*pOut)); + return; + } +} + + +static drwav_uint64 drwav_read_pcm_frames_f32__pcm(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__pcm_to_f32(pBufferOut, sampleData, (size_t)framesRead*pWav->channels, bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_f32__msadpcm(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + /* + We're just going to borrow the implementation from the drwav_read_s16() since ADPCM is a little bit more complicated than other formats and I don't + want to duplicate that code. 
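The 16 bit intermediate buffer is then converted to float with drwav_s16_to_f32().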
+ */ + drwav_uint64 totalFramesRead = 0; + drwav_int16 samples16[2048]; + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16); + if (framesRead == 0) { + break; + } + + drwav_s16_to_f32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels)); /* <-- Safe cast because we're clamping to 2048. */ + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_f32__ima(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + /* + We're just going to borrow the implementation from the drwav_read_s16() since IMA-ADPCM is a little bit more complicated than other formats and I don't + want to duplicate that code. + */ + drwav_uint64 totalFramesRead = 0; + drwav_int16 samples16[2048]; + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16); + if (framesRead == 0) { + break; + } + + drwav_s16_to_f32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels)); /* <-- Safe cast because we're clamping to 2048. */ + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_f32__ieee(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame; + + /* Fast path. */ + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT && pWav->bitsPerSample == 32) { + return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__ieee_to_f32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_f32__alaw(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_alaw_to_f32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_f32__mulaw(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, 
drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_mulaw_to_f32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + if (pWav == NULL || framesToRead == 0) { + return 0; + } + + if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + /* Don't try to read more samples than can potentially fit in the output buffer. */ + if (framesToRead * pWav->channels * sizeof(float) > DRWAV_SIZE_MAX) { + framesToRead = DRWAV_SIZE_MAX / sizeof(float) / pWav->channels; + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) { + return drwav_read_pcm_frames_f32__pcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + return drwav_read_pcm_frames_f32__msadpcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) { + return drwav_read_pcm_frames_f32__ieee(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) { + return drwav_read_pcm_frames_f32__alaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) { + return drwav_read_pcm_frames_f32__mulaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + return drwav_read_pcm_frames_f32__ima(pWav, framesToRead, pBufferOut); + } + + return 0; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32le(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_f32(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) { + drwav__bswap_samples_f32(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32be(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_f32(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) { + drwav__bswap_samples_f32(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + + +DRWAV_API void drwav_u8_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + +#ifdef DR_WAV_LIBSNDFILE_COMPAT + /* + It appears libsndfile uses slightly different logic for the u8 -> f32 conversion to dr_wav, which in my opinion is incorrect. It appears + libsndfile performs the conversion something like "f32 = (u8 / 256) * 2 - 1", however I think it should be "f32 = (u8 / 255) * 2 - 1" (note + the divisor of 256 vs 255). I use libsndfile as a benchmark for testing, so I'm therefore leaving this block here just for my automated + correctness testing. This is disabled by default. 
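For reference, with 255 as the divisor an input of 255 maps to exactly +1.0 ((255/255)*2 - 1 = 1), whereas the 256 divisor tops out at (255/256)*2 - 1 = 0.9921875; both map 0 to -1.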
+ */ + for (i = 0; i < sampleCount; ++i) { + *pOut++ = (pIn[i] / 256.0f) * 2 - 1; + } +#else + for (i = 0; i < sampleCount; ++i) { + float x = pIn[i]; + x = x * 0.00784313725490196078f; /* 0..255 to 0..2 */ + x = x - 1; /* 0..2 to -1..1 */ + + *pOut++ = x; + } +#endif +} + +DRWAV_API void drwav_s16_to_f32(float* pOut, const drwav_int16* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = pIn[i] * 0.000030517578125f; + } +} + +DRWAV_API void drwav_s24_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + double x; + drwav_uint32 a = ((drwav_uint32)(pIn[i*3+0]) << 8); + drwav_uint32 b = ((drwav_uint32)(pIn[i*3+1]) << 16); + drwav_uint32 c = ((drwav_uint32)(pIn[i*3+2]) << 24); + + x = (double)((drwav_int32)(a | b | c) >> 8); + *pOut++ = (float)(x * 0.00000011920928955078125); + } +} + +DRWAV_API void drwav_s32_to_f32(float* pOut, const drwav_int32* pIn, size_t sampleCount) +{ + size_t i; + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = (float)(pIn[i] / 2147483648.0); + } +} + +DRWAV_API void drwav_f64_to_f32(float* pOut, const double* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = (float)pIn[i]; + } +} + +DRWAV_API void drwav_alaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = drwav__alaw_to_s16(pIn[i]) / 32768.0f; + } +} + +DRWAV_API void drwav_mulaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = drwav__mulaw_to_s16(pIn[i]) / 32768.0f; + } +} + + + +static void drwav__pcm_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample) +{ + unsigned int i; + + /* Special case for 8-bit sample data because it's treated as unsigned. */ + if (bytesPerSample == 1) { + drwav_u8_to_s32(pOut, pIn, totalSampleCount); + return; + } + + /* Slightly more optimal implementation for common formats. */ + if (bytesPerSample == 2) { + drwav_s16_to_s32(pOut, (const drwav_int16*)pIn, totalSampleCount); + return; + } + if (bytesPerSample == 3) { + drwav_s24_to_s32(pOut, pIn, totalSampleCount); + return; + } + if (bytesPerSample == 4) { + for (i = 0; i < totalSampleCount; ++i) { + *pOut++ = ((const drwav_int32*)pIn)[i]; + } + return; + } + + + /* Anything more than 64 bits per sample is not supported. */ + if (bytesPerSample > 8) { + DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut)); + return; + } + + + /* Generic, slow converter. 
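Each sample is assembled byte by byte into the high end of a 64 bit integer and then shifted down so that its top 32 bits, sign included, become the output, which covers any whole byte depth up to 8.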
*/ + for (i = 0; i < totalSampleCount; ++i) { + drwav_uint64 sample = 0; + unsigned int shift = (8 - bytesPerSample) * 8; + + unsigned int j; + for (j = 0; j < bytesPerSample; j += 1) { + DRWAV_ASSERT(j < 8); + sample |= (drwav_uint64)(pIn[j]) << shift; + shift += 8; + } + + pIn += j; + *pOut++ = (drwav_int32)((drwav_int64)sample >> 32); + } +} + +static void drwav__ieee_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample) +{ + if (bytesPerSample == 4) { + drwav_f32_to_s32(pOut, (const float*)pIn, totalSampleCount); + return; + } else if (bytesPerSample == 8) { + drwav_f64_to_s32(pOut, (const double*)pIn, totalSampleCount); + return; + } else { + /* Only supporting 32- and 64-bit float. Output silence in all other cases. Contributions welcome for 16-bit float. */ + DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut)); + return; + } +} + + +static drwav_uint64 drwav_read_pcm_frames_s32__pcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + drwav_uint32 bytesPerFrame; + + /* Fast path. */ + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 32) { + return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut); + } + + bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__pcm_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s32__msadpcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + /* + We're just going to borrow the implementation from the drwav_read_s16() since ADPCM is a little bit more complicated than other formats and I don't + want to duplicate that code. + */ + drwav_uint64 totalFramesRead = 0; + drwav_int16 samples16[2048]; + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16); + if (framesRead == 0) { + break; + } + + drwav_s16_to_s32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels)); /* <-- Safe cast because we're clamping to 2048. */ + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s32__ima(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + /* + We're just going to borrow the implementation from the drwav_read_s16() since IMA-ADPCM is a little bit more complicated than other formats and I don't + want to duplicate that code. + */ + drwav_uint64 totalFramesRead = 0; + drwav_int16 samples16[2048]; + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16); + if (framesRead == 0) { + break; + } + + drwav_s16_to_s32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels)); /* <-- Safe cast because we're clamping to 2048. 
*/ + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s32__ieee(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav__ieee_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s32__alaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_alaw_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +static drwav_uint64 drwav_read_pcm_frames_s32__mulaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 totalFramesRead; + drwav_uint8 sampleData[4096]; + + drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav); + if (bytesPerFrame == 0) { + return 0; + } + + totalFramesRead = 0; + + while (framesToRead > 0) { + drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData); + if (framesRead == 0) { + break; + } + + drwav_mulaw_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels)); + + pBufferOut += framesRead*pWav->channels; + framesToRead -= framesRead; + totalFramesRead += framesRead; + } + + return totalFramesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + if (pWav == NULL || framesToRead == 0) { + return 0; + } + + if (pBufferOut == NULL) { + return drwav_read_pcm_frames(pWav, framesToRead, NULL); + } + + /* Don't try to read more samples than can potentially fit in the output buffer. 
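On a 32 bit build the requested output size could exceed what a size_t can represent, so the frame count is clamped before dispatching.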
*/ + if (framesToRead * pWav->channels * sizeof(drwav_int32) > DRWAV_SIZE_MAX) { + framesToRead = DRWAV_SIZE_MAX / sizeof(drwav_int32) / pWav->channels; + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) { + return drwav_read_pcm_frames_s32__pcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) { + return drwav_read_pcm_frames_s32__msadpcm(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) { + return drwav_read_pcm_frames_s32__ieee(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) { + return drwav_read_pcm_frames_s32__alaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) { + return drwav_read_pcm_frames_s32__mulaw(pWav, framesToRead, pBufferOut); + } + + if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) { + return drwav_read_pcm_frames_s32__ima(pWav, framesToRead, pBufferOut); + } + + return 0; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32le(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_s32(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) { + drwav__bswap_samples_s32(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + +DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32be(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut) +{ + drwav_uint64 framesRead = drwav_read_pcm_frames_s32(pWav, framesToRead, pBufferOut); + if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) { + drwav__bswap_samples_s32(pBufferOut, framesRead*pWav->channels); + } + + return framesRead; +} + + +DRWAV_API void drwav_u8_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = ((int)pIn[i] - 128) << 24; + } +} + +DRWAV_API void drwav_s16_to_s32(drwav_int32* pOut, const drwav_int16* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = pIn[i] << 16; + } +} + +DRWAV_API void drwav_s24_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + unsigned int s0 = pIn[i*3 + 0]; + unsigned int s1 = pIn[i*3 + 1]; + unsigned int s2 = pIn[i*3 + 2]; + + drwav_int32 sample32 = (drwav_int32)((s0 << 8) | (s1 << 16) | (s2 << 24)); + *pOut++ = sample32; + } +} + +DRWAV_API void drwav_f32_to_s32(drwav_int32* pOut, const float* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = (drwav_int32)(2147483648.0 * pIn[i]); + } +} + +DRWAV_API void drwav_f64_to_s32(drwav_int32* pOut, const double* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = (drwav_int32)(2147483648.0 * pIn[i]); + } +} + +DRWAV_API void drwav_alaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i = 0; i < sampleCount; ++i) { + *pOut++ = ((drwav_int32)drwav__alaw_to_s16(pIn[i])) << 16; + } +} + +DRWAV_API void drwav_mulaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, 
size_t sampleCount) +{ + size_t i; + + if (pOut == NULL || pIn == NULL) { + return; + } + + for (i= 0; i < sampleCount; ++i) { + *pOut++ = ((drwav_int32)drwav__mulaw_to_s16(pIn[i])) << 16; + } +} + + + +static drwav_int16* drwav__read_pcm_frames_and_close_s16(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount) +{ + drwav_uint64 sampleDataSize; + drwav_int16* pSampleData; + drwav_uint64 framesRead; + + DRWAV_ASSERT(pWav != NULL); + + sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(drwav_int16); + if (sampleDataSize > DRWAV_SIZE_MAX) { + drwav_uninit(pWav); + return NULL; /* File's too big. */ + } + + pSampleData = (drwav_int16*)drwav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks); /* <-- Safe cast due to the check above. */ + if (pSampleData == NULL) { + drwav_uninit(pWav); + return NULL; /* Failed to allocate memory. */ + } + + framesRead = drwav_read_pcm_frames_s16(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData); + if (framesRead != pWav->totalPCMFrameCount) { + drwav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks); + drwav_uninit(pWav); + return NULL; /* There was an error reading the samples. */ + } + + drwav_uninit(pWav); + + if (sampleRate) { + *sampleRate = pWav->sampleRate; + } + if (channels) { + *channels = pWav->channels; + } + if (totalFrameCount) { + *totalFrameCount = pWav->totalPCMFrameCount; + } + + return pSampleData; +} + +static float* drwav__read_pcm_frames_and_close_f32(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount) +{ + drwav_uint64 sampleDataSize; + float* pSampleData; + drwav_uint64 framesRead; + + DRWAV_ASSERT(pWav != NULL); + + sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(float); + if (sampleDataSize > DRWAV_SIZE_MAX) { + drwav_uninit(pWav); + return NULL; /* File's too big. */ + } + + pSampleData = (float*)drwav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks); /* <-- Safe cast due to the check above. */ + if (pSampleData == NULL) { + drwav_uninit(pWav); + return NULL; /* Failed to allocate memory. */ + } + + framesRead = drwav_read_pcm_frames_f32(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData); + if (framesRead != pWav->totalPCMFrameCount) { + drwav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks); + drwav_uninit(pWav); + return NULL; /* There was an error reading the samples. */ + } + + drwav_uninit(pWav); + + if (sampleRate) { + *sampleRate = pWav->sampleRate; + } + if (channels) { + *channels = pWav->channels; + } + if (totalFrameCount) { + *totalFrameCount = pWav->totalPCMFrameCount; + } + + return pSampleData; +} + +static drwav_int32* drwav__read_pcm_frames_and_close_s32(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount) +{ + drwav_uint64 sampleDataSize; + drwav_int32* pSampleData; + drwav_uint64 framesRead; + + DRWAV_ASSERT(pWav != NULL); + + sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(drwav_int32); + if (sampleDataSize > DRWAV_SIZE_MAX) { + drwav_uninit(pWav); + return NULL; /* File's too big. */ + } + + pSampleData = (drwav_int32*)drwav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks); /* <-- Safe cast due to the check above. */ + if (pSampleData == NULL) { + drwav_uninit(pWav); + return NULL; /* Failed to allocate memory. 
*/ + } + + framesRead = drwav_read_pcm_frames_s32(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData); + if (framesRead != pWav->totalPCMFrameCount) { + drwav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks); + drwav_uninit(pWav); + return NULL; /* There was an error reading the samples. */ + } + + drwav_uninit(pWav); + + if (sampleRate) { + *sampleRate = pWav->sampleRate; + } + if (channels) { + *channels = pWav->channels; + } + if (totalFrameCount) { + *totalFrameCount = pWav->totalPCMFrameCount; + } + + return pSampleData; +} + + + +DRWAV_API drwav_int16* drwav_open_and_read_pcm_frames_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API float* drwav_open_and_read_pcm_frames_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API drwav_int32* drwav_open_and_read_pcm_frames_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +#ifndef DR_WAV_NO_STDIO +DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file(&wav, filename, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file(&wav, filename, 
pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file(&wav, filename, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + + +DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (channelsOut) { + *channelsOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file_w(&wav, filename, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (channelsOut) { + *channelsOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file_w(&wav, filename, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (channelsOut) { + *channelsOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_file_w(&wav, filename, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} +#endif + +DRWAV_API drwav_int16* drwav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API float* drwav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if 
(totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} + +DRWAV_API drwav_int32* drwav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + drwav wav; + + if (channelsOut) { + *channelsOut = 0; + } + if (sampleRateOut) { + *sampleRateOut = 0; + } + if (totalFrameCountOut) { + *totalFrameCountOut = 0; + } + + if (!drwav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) { + return NULL; + } + + return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut); +} +#endif /* DR_WAV_NO_CONVERSION_API */ + + +DRWAV_API void drwav_free(void* p, const drwav_allocation_callbacks* pAllocationCallbacks) +{ + if (pAllocationCallbacks != NULL) { + drwav__free_from_callbacks(p, pAllocationCallbacks); + } else { + drwav__free_default(p, NULL); + } +} + +DRWAV_API drwav_uint16 drwav_bytes_to_u16(const drwav_uint8* data) +{ + return drwav__bytes_to_u16(data); +} + +DRWAV_API drwav_int16 drwav_bytes_to_s16(const drwav_uint8* data) +{ + return drwav__bytes_to_s16(data); +} + +DRWAV_API drwav_uint32 drwav_bytes_to_u32(const drwav_uint8* data) +{ + return drwav__bytes_to_u32(data); +} + +DRWAV_API drwav_int32 drwav_bytes_to_s32(const drwav_uint8* data) +{ + return drwav__bytes_to_s32(data); +} + +DRWAV_API drwav_uint64 drwav_bytes_to_u64(const drwav_uint8* data) +{ + return drwav__bytes_to_u64(data); +} + +DRWAV_API drwav_int64 drwav_bytes_to_s64(const drwav_uint8* data) +{ + return drwav__bytes_to_s64(data); +} + + +DRWAV_API drwav_bool32 drwav_guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16]) +{ + return drwav__guid_equal(a, b); +} + +DRWAV_API drwav_bool32 drwav_fourcc_equal(const drwav_uint8* a, const char* b) +{ + return drwav__fourcc_equal(a, b); +} + +#endif /* dr_wav_c */ +#endif /* DR_WAV_IMPLEMENTATION */ + +/* +RELEASE NOTES - v0.11.0 +======================= +Version 0.11.0 has breaking API changes. + +Improved Client-Defined Memory Allocation +----------------------------------------- +The main change with this release is the addition of a more flexible way of implementing custom memory allocation routines. The +existing system of DRWAV_MALLOC, DRWAV_REALLOC and DRWAV_FREE are still in place and will be used by default when no custom +allocation callbacks are specified. + +To use the new system, you pass in a pointer to a drwav_allocation_callbacks object to drwav_init() and family, like this: + + void* my_malloc(size_t sz, void* pUserData) + { + return malloc(sz); + } + void* my_realloc(void* p, size_t sz, void* pUserData) + { + return realloc(p, sz); + } + void my_free(void* p, void* pUserData) + { + free(p); + } + + ... + + drwav_allocation_callbacks allocationCallbacks; + allocationCallbacks.pUserData = &myData; + allocationCallbacks.onMalloc = my_malloc; + allocationCallbacks.onRealloc = my_realloc; + allocationCallbacks.onFree = my_free; + drwav_init_file(&wav, "my_file.wav", &allocationCallbacks); + +The advantage of this new system is that it allows you to specify user data which will be passed in to the allocation routines. 
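For example, the pUserData member can carry per-client bookkeeping that the allocation routines then have access to. A minimal sketch (the my_alloc_stats type and my_counting_malloc are illustrative names, not part of dr_wav; my_realloc and my_free are the functions defined above):

    typedef struct
    {
        size_t totalBytesRequested;
    } my_alloc_stats;

    void* my_counting_malloc(size_t sz, void* pUserData)
    {
        my_alloc_stats* pStats = (my_alloc_stats*)pUserData;
        pStats->totalBytesRequested += sz;  /* record how many bytes dr_wav asked for */
        return malloc(sz);
    }

    ...

    my_alloc_stats stats = {0};

    drwav_allocation_callbacks allocationCallbacks;
    allocationCallbacks.pUserData = &stats;
    allocationCallbacks.onMalloc  = my_counting_malloc;
    allocationCallbacks.onRealloc = my_realloc;
    allocationCallbacks.onFree    = my_free;
    drwav_init_file(&wav, "my_file.wav", &allocationCallbacks);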
+ +Passing in null for the allocation callbacks object will cause dr_wav to use defaults which is the same as DRWAV_MALLOC, +DRWAV_REALLOC and DRWAV_FREE and the equivalent of how it worked in previous versions. + +Every API that opens a drwav object now takes this extra parameter. These include the following: + + drwav_init() + drwav_init_ex() + drwav_init_file() + drwav_init_file_ex() + drwav_init_file_w() + drwav_init_file_w_ex() + drwav_init_memory() + drwav_init_memory_ex() + drwav_init_write() + drwav_init_write_sequential() + drwav_init_write_sequential_pcm_frames() + drwav_init_file_write() + drwav_init_file_write_sequential() + drwav_init_file_write_sequential_pcm_frames() + drwav_init_file_write_w() + drwav_init_file_write_sequential_w() + drwav_init_file_write_sequential_pcm_frames_w() + drwav_init_memory_write() + drwav_init_memory_write_sequential() + drwav_init_memory_write_sequential_pcm_frames() + drwav_open_and_read_pcm_frames_s16() + drwav_open_and_read_pcm_frames_f32() + drwav_open_and_read_pcm_frames_s32() + drwav_open_file_and_read_pcm_frames_s16() + drwav_open_file_and_read_pcm_frames_f32() + drwav_open_file_and_read_pcm_frames_s32() + drwav_open_file_and_read_pcm_frames_s16_w() + drwav_open_file_and_read_pcm_frames_f32_w() + drwav_open_file_and_read_pcm_frames_s32_w() + drwav_open_memory_and_read_pcm_frames_s16() + drwav_open_memory_and_read_pcm_frames_f32() + drwav_open_memory_and_read_pcm_frames_s32() + +Endian Improvements +------------------- +Previously, the following APIs returned little-endian audio data. These now return native-endian data. This improves compatibility +on big-endian architectures. + + drwav_read_pcm_frames() + drwav_read_pcm_frames_s16() + drwav_read_pcm_frames_s32() + drwav_read_pcm_frames_f32() + drwav_open_and_read_pcm_frames_s16() + drwav_open_and_read_pcm_frames_s32() + drwav_open_and_read_pcm_frames_f32() + drwav_open_file_and_read_pcm_frames_s16() + drwav_open_file_and_read_pcm_frames_s32() + drwav_open_file_and_read_pcm_frames_f32() + drwav_open_file_and_read_pcm_frames_s16_w() + drwav_open_file_and_read_pcm_frames_s32_w() + drwav_open_file_and_read_pcm_frames_f32_w() + drwav_open_memory_and_read_pcm_frames_s16() + drwav_open_memory_and_read_pcm_frames_s32() + drwav_open_memory_and_read_pcm_frames_f32() + +APIs have been added to give you explicit control over whether or not audio data is read or written in big- or little-endian byte +order: + + drwav_read_pcm_frames_le() + drwav_read_pcm_frames_be() + drwav_read_pcm_frames_s16le() + drwav_read_pcm_frames_s16be() + drwav_read_pcm_frames_f32le() + drwav_read_pcm_frames_f32be() + drwav_read_pcm_frames_s32le() + drwav_read_pcm_frames_s32be() + drwav_write_pcm_frames_le() + drwav_write_pcm_frames_be() + +Removed APIs +------------ +The following APIs were deprecated in version 0.10.0 and have now been removed: + + drwav_open() + drwav_open_ex() + drwav_open_write() + drwav_open_write_sequential() + drwav_open_file() + drwav_open_file_ex() + drwav_open_file_write() + drwav_open_file_write_sequential() + drwav_open_memory() + drwav_open_memory_ex() + drwav_open_memory_write() + drwav_open_memory_write_sequential() + drwav_close() + + + +RELEASE NOTES - v0.10.0 +======================= +Version 0.10.0 has breaking API changes. There are no significant bug fixes in this release, so if you are affected you do +not need to upgrade. 
+ +Removed APIs +------------ +The following APIs were deprecated in version 0.9.0 and have been completely removed in version 0.10.0: + + drwav_read() + drwav_read_s16() + drwav_read_f32() + drwav_read_s32() + drwav_seek_to_sample() + drwav_write() + drwav_open_and_read_s16() + drwav_open_and_read_f32() + drwav_open_and_read_s32() + drwav_open_file_and_read_s16() + drwav_open_file_and_read_f32() + drwav_open_file_and_read_s32() + drwav_open_memory_and_read_s16() + drwav_open_memory_and_read_f32() + drwav_open_memory_and_read_s32() + drwav::totalSampleCount + +See release notes for version 0.9.0 at the bottom of this file for replacement APIs. + +Deprecated APIs +--------------- +The following APIs have been deprecated. There is a confusing and completely arbitrary difference between drwav_init*() and +drwav_open*(), where drwav_init*() initializes a pre-allocated drwav object, whereas drwav_open*() will first allocated a +drwav object on the heap and then initialize it. drwav_open*() has been deprecated which means you must now use a pre- +allocated drwav object with drwav_init*(). If you need the previous functionality, you can just do a malloc() followed by +a called to one of the drwav_init*() APIs. + + drwav_open() + drwav_open_ex() + drwav_open_write() + drwav_open_write_sequential() + drwav_open_file() + drwav_open_file_ex() + drwav_open_file_write() + drwav_open_file_write_sequential() + drwav_open_memory() + drwav_open_memory_ex() + drwav_open_memory_write() + drwav_open_memory_write_sequential() + drwav_close() + +These APIs will be removed completely in a future version. The rationale for this change is to remove confusion between the +two different ways to initialize a drwav object. +*/ + +/* +REVISION HISTORY +================ +v0.12.16 - 2020-12-02 + - Fix a bug when trying to read more bytes than can fit in a size_t. + +v0.12.15 - 2020-11-21 + - Fix compilation with OpenWatcom. + +v0.12.14 - 2020-11-13 + - Minor code clean up. + +v0.12.13 - 2020-11-01 + - Improve compiler support for older versions of GCC. + +v0.12.12 - 2020-09-28 + - Add support for RF64. + - Fix a bug in writing mode where the size of the RIFF chunk incorrectly includes the header section. + +v0.12.11 - 2020-09-08 + - Fix a compilation error on older compilers. + +v0.12.10 - 2020-08-24 + - Fix a bug when seeking with ADPCM formats. + +v0.12.9 - 2020-08-02 + - Simplify sized types. + +v0.12.8 - 2020-07-25 + - Fix a compilation warning. + +v0.12.7 - 2020-07-15 + - Fix some bugs on big-endian architectures. + - Fix an error in s24 to f32 conversion. + +v0.12.6 - 2020-06-23 + - Change drwav_read_*() to allow NULL to be passed in as the output buffer which is equivalent to a forward seek. + - Fix a buffer overflow when trying to decode invalid IMA-ADPCM files. + - Add include guard for the implementation section. + +v0.12.5 - 2020-05-27 + - Minor documentation fix. + +v0.12.4 - 2020-05-16 + - Replace assert() with DRWAV_ASSERT(). + - Add compile-time and run-time version querying. + - DRWAV_VERSION_MINOR + - DRWAV_VERSION_MAJOR + - DRWAV_VERSION_REVISION + - DRWAV_VERSION_STRING + - drwav_version() + - drwav_version_string() + +v0.12.3 - 2020-04-30 + - Fix compilation errors with VC6. + +v0.12.2 - 2020-04-21 + - Fix a bug where drwav_init_file() does not close the file handle after attempting to load an erroneous file. + +v0.12.1 - 2020-04-13 + - Fix some pedantic warnings. + +v0.12.0 - 2020-04-04 + - API CHANGE: Add container and format parameters to the chunk callback. + - Minor documentation updates. 
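As a rough sketch of the malloc()-plus-drwav_init*() pattern suggested in the v0.10.0 deprecation note above (the error handling is illustrative only; passing NULL for the allocation callbacks selects the defaults):

    drwav* pWav = (drwav*)malloc(sizeof(drwav));
    if (pWav != NULL && !drwav_init_file(pWav, "my_file.wav", NULL)) {
        free(pWav);   /* initialization failed, so release the heap allocation */
        pWav = NULL;
    }

    if (pWav != NULL) {
        /* ... read from pWav ... */
        drwav_uninit(pWav);
        free(pWav);
    }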
+ +v0.11.5 - 2020-03-07 + - Fix compilation error with Visual Studio .NET 2003. + +v0.11.4 - 2020-01-29 + - Fix some static analysis warnings. + - Fix a bug when reading f32 samples from an A-law encoded stream. + +v0.11.3 - 2020-01-12 + - Minor changes to some f32 format conversion routines. + - Minor bug fix for ADPCM conversion when end of file is reached. + +v0.11.2 - 2019-12-02 + - Fix a possible crash when using custom memory allocators without a custom realloc() implementation. + - Fix an integer overflow bug. + - Fix a null pointer dereference bug. + - Add limits to sample rate, channels and bits per sample to tighten up some validation. + +v0.11.1 - 2019-10-07 + - Internal code clean up. + +v0.11.0 - 2019-10-06 + - API CHANGE: Add support for user defined memory allocation routines. This system allows the program to specify their own memory allocation + routines with a user data pointer for client-specific contextual data. This adds an extra parameter to the end of the following APIs: + - drwav_init() + - drwav_init_ex() + - drwav_init_file() + - drwav_init_file_ex() + - drwav_init_file_w() + - drwav_init_file_w_ex() + - drwav_init_memory() + - drwav_init_memory_ex() + - drwav_init_write() + - drwav_init_write_sequential() + - drwav_init_write_sequential_pcm_frames() + - drwav_init_file_write() + - drwav_init_file_write_sequential() + - drwav_init_file_write_sequential_pcm_frames() + - drwav_init_file_write_w() + - drwav_init_file_write_sequential_w() + - drwav_init_file_write_sequential_pcm_frames_w() + - drwav_init_memory_write() + - drwav_init_memory_write_sequential() + - drwav_init_memory_write_sequential_pcm_frames() + - drwav_open_and_read_pcm_frames_s16() + - drwav_open_and_read_pcm_frames_f32() + - drwav_open_and_read_pcm_frames_s32() + - drwav_open_file_and_read_pcm_frames_s16() + - drwav_open_file_and_read_pcm_frames_f32() + - drwav_open_file_and_read_pcm_frames_s32() + - drwav_open_file_and_read_pcm_frames_s16_w() + - drwav_open_file_and_read_pcm_frames_f32_w() + - drwav_open_file_and_read_pcm_frames_s32_w() + - drwav_open_memory_and_read_pcm_frames_s16() + - drwav_open_memory_and_read_pcm_frames_f32() + - drwav_open_memory_and_read_pcm_frames_s32() + Set this extra parameter to NULL to use defaults which is the same as the previous behaviour. Setting this NULL will use + DRWAV_MALLOC, DRWAV_REALLOC and DRWAV_FREE. + - Add support for reading and writing PCM frames in an explicit endianness. New APIs: + - drwav_read_pcm_frames_le() + - drwav_read_pcm_frames_be() + - drwav_read_pcm_frames_s16le() + - drwav_read_pcm_frames_s16be() + - drwav_read_pcm_frames_f32le() + - drwav_read_pcm_frames_f32be() + - drwav_read_pcm_frames_s32le() + - drwav_read_pcm_frames_s32be() + - drwav_write_pcm_frames_le() + - drwav_write_pcm_frames_be() + - Remove deprecated APIs. + - API CHANGE: The following APIs now return native-endian data. Previously they returned little-endian data. 
+ - drwav_read_pcm_frames() + - drwav_read_pcm_frames_s16() + - drwav_read_pcm_frames_s32() + - drwav_read_pcm_frames_f32() + - drwav_open_and_read_pcm_frames_s16() + - drwav_open_and_read_pcm_frames_s32() + - drwav_open_and_read_pcm_frames_f32() + - drwav_open_file_and_read_pcm_frames_s16() + - drwav_open_file_and_read_pcm_frames_s32() + - drwav_open_file_and_read_pcm_frames_f32() + - drwav_open_file_and_read_pcm_frames_s16_w() + - drwav_open_file_and_read_pcm_frames_s32_w() + - drwav_open_file_and_read_pcm_frames_f32_w() + - drwav_open_memory_and_read_pcm_frames_s16() + - drwav_open_memory_and_read_pcm_frames_s32() + - drwav_open_memory_and_read_pcm_frames_f32() + +v0.10.1 - 2019-08-31 + - Correctly handle partial trailing ADPCM blocks. + +v0.10.0 - 2019-08-04 + - Remove deprecated APIs. + - Add wchar_t variants for file loading APIs: + drwav_init_file_w() + drwav_init_file_ex_w() + drwav_init_file_write_w() + drwav_init_file_write_sequential_w() + - Add drwav_target_write_size_bytes() which calculates the total size in bytes of a WAV file given a format and sample count. + - Add APIs for specifying the PCM frame count instead of the sample count when opening in sequential write mode: + drwav_init_write_sequential_pcm_frames() + drwav_init_file_write_sequential_pcm_frames() + drwav_init_file_write_sequential_pcm_frames_w() + drwav_init_memory_write_sequential_pcm_frames() + - Deprecate drwav_open*() and drwav_close(): + drwav_open() + drwav_open_ex() + drwav_open_write() + drwav_open_write_sequential() + drwav_open_file() + drwav_open_file_ex() + drwav_open_file_write() + drwav_open_file_write_sequential() + drwav_open_memory() + drwav_open_memory_ex() + drwav_open_memory_write() + drwav_open_memory_write_sequential() + drwav_close() + - Minor documentation updates. + +v0.9.2 - 2019-05-21 + - Fix warnings. + +v0.9.1 - 2019-05-05 + - Add support for C89. + - Change license to choice of public domain or MIT-0. + +v0.9.0 - 2018-12-16 + - API CHANGE: Add new reading APIs for reading by PCM frames instead of samples. Old APIs have been deprecated and + will be removed in v0.10.0. Deprecated APIs and their replacements: + drwav_read() -> drwav_read_pcm_frames() + drwav_read_s16() -> drwav_read_pcm_frames_s16() + drwav_read_f32() -> drwav_read_pcm_frames_f32() + drwav_read_s32() -> drwav_read_pcm_frames_s32() + drwav_seek_to_sample() -> drwav_seek_to_pcm_frame() + drwav_write() -> drwav_write_pcm_frames() + drwav_open_and_read_s16() -> drwav_open_and_read_pcm_frames_s16() + drwav_open_and_read_f32() -> drwav_open_and_read_pcm_frames_f32() + drwav_open_and_read_s32() -> drwav_open_and_read_pcm_frames_s32() + drwav_open_file_and_read_s16() -> drwav_open_file_and_read_pcm_frames_s16() + drwav_open_file_and_read_f32() -> drwav_open_file_and_read_pcm_frames_f32() + drwav_open_file_and_read_s32() -> drwav_open_file_and_read_pcm_frames_s32() + drwav_open_memory_and_read_s16() -> drwav_open_memory_and_read_pcm_frames_s16() + drwav_open_memory_and_read_f32() -> drwav_open_memory_and_read_pcm_frames_f32() + drwav_open_memory_and_read_s32() -> drwav_open_memory_and_read_pcm_frames_s32() + drwav::totalSampleCount -> drwav::totalPCMFrameCount + - API CHANGE: Rename drwav_open_and_read_file_*() to drwav_open_file_and_read_*(). + - API CHANGE: Rename drwav_open_and_read_memory_*() to drwav_open_memory_and_read_*(). + - Add built-in support for smpl chunks. + - Add support for firing a callback for each chunk in the file at initialization time. + - This is enabled through the drwav_init_ex(), etc. 
family of APIs. + - Handle invalid FMT chunks more robustly. + +v0.8.5 - 2018-09-11 + - Const correctness. + - Fix a potential stack overflow. + +v0.8.4 - 2018-08-07 + - Improve 64-bit detection. + +v0.8.3 - 2018-08-05 + - Fix C++ build on older versions of GCC. + +v0.8.2 - 2018-08-02 + - Fix some big-endian bugs. + +v0.8.1 - 2018-06-29 + - Add support for sequential writing APIs. + - Disable seeking in write mode. + - Fix bugs with Wave64. + - Fix typos. + +v0.8 - 2018-04-27 + - Bug fix. + - Start using major.minor.revision versioning. + +v0.7f - 2018-02-05 + - Restrict ADPCM formats to a maximum of 2 channels. + +v0.7e - 2018-02-02 + - Fix a crash. + +v0.7d - 2018-02-01 + - Fix a crash. + +v0.7c - 2018-02-01 + - Set drwav.bytesPerSample to 0 for all compressed formats. + - Fix a crash when reading 16-bit floating point WAV files. In this case dr_wav will output silence for + all format conversion reading APIs (*_s16, *_s32, *_f32 APIs). + - Fix some divide-by-zero errors. + +v0.7b - 2018-01-22 + - Fix errors with seeking of compressed formats. + - Fix compilation error when DR_WAV_NO_CONVERSION_API + +v0.7a - 2017-11-17 + - Fix some GCC warnings. + +v0.7 - 2017-11-04 + - Add writing APIs. + +v0.6 - 2017-08-16 + - API CHANGE: Rename dr_* types to drwav_*. + - Add support for custom implementations of malloc(), realloc(), etc. + - Add support for Microsoft ADPCM. + - Add support for IMA ADPCM (DVI, format code 0x11). + - Optimizations to drwav_read_s16(). + - Bug fixes. + +v0.5g - 2017-07-16 + - Change underlying type for booleans to unsigned. + +v0.5f - 2017-04-04 + - Fix a minor bug with drwav_open_and_read_s16() and family. + +v0.5e - 2016-12-29 + - Added support for reading samples as signed 16-bit integers. Use the _s16() family of APIs for this. + - Minor fixes to documentation. + +v0.5d - 2016-12-28 + - Use drwav_int* and drwav_uint* sized types to improve compiler support. + +v0.5c - 2016-11-11 + - Properly handle JUNK chunks that come before the FMT chunk. + +v0.5b - 2016-10-23 + - A minor change to drwav_bool8 and drwav_bool32 types. + +v0.5a - 2016-10-11 + - Fixed a bug with drwav_open_and_read() and family due to incorrect argument ordering. + - Improve A-law and mu-law efficiency. + +v0.5 - 2016-09-29 + - API CHANGE. Swap the order of "channels" and "sampleRate" parameters in drwav_open_and_read*(). Rationale for this is to + keep it consistent with dr_audio and dr_flac. + +v0.4b - 2016-09-18 + - Fixed a typo in documentation. + +v0.4a - 2016-09-18 + - Fixed a typo. + - Change date format to ISO 8601 (YYYY-MM-DD) + +v0.4 - 2016-07-13 + - API CHANGE. Make onSeek consistent with dr_flac. + - API CHANGE. Rename drwav_seek() to drwav_seek_to_sample() for clarity and consistency with dr_flac. + - Added support for Sony Wave64. + +v0.3a - 2016-05-28 + - API CHANGE. Return drwav_bool32 instead of int in onSeek callback. + - Fixed a memory leak. + +v0.3 - 2016-05-22 + - Lots of API changes for consistency. + +v0.2a - 2016-05-16 + - Fixed Linux/GCC build. + +v0.2 - 2016-05-11 + - Added support for reading data as signed 32-bit PCM for consistency with dr_flac. + +v0.1a - 2016-05-07 + - Fixed a bug in drwav_open_file() where the file handle would not be closed if the loader failed to initialize. + +v0.1 - 2016-05-04 + - Initial versioned release. +*/ + +/* +This software is available as a choice of the following licenses. Choose +whichever you prefer. 
+ +=============================================================================== +ALTERNATIVE 1 - Public Domain (www.unlicense.org) +=============================================================================== +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. + +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to + +=============================================================================== +ALTERNATIVE 2 - MIT No Attribution +=============================================================================== +Copyright 2020 David Reid + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/CMakeLists.txt b/seamless_communication/ggml/examples/kaldi-native-fbank/CMakeLists.txt new file mode 100644 index 0000000..2037626 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/CMakeLists.txt @@ -0,0 +1,8 @@ +add_subdirectory(csrc) + +if(KALDI_NATIVE_FBANK_BUILD_PYTHON) + message(STATUS "Building Python") + add_subdirectory(python) +else() + message(STATUS "Disable building Python") +endif() diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/CMakeLists.txt b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/CMakeLists.txt new file mode 100644 index 0000000..6eb693d --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/CMakeLists.txt @@ -0,0 +1,93 @@ + +include_directories(${PROJECT_SOURCE_DIR}) +set(sources + feature-fbank.cc + feature-functions.cc + feature-window.cc + fftsg.c + mel-computations.cc + online-feature.cc + rfft.cc +) + +if(KALDI_NATIVE_FBANK_ENABLE_CHECK) + list(APPEND sources log.cc) +endif() + +add_library(kaldi-native-fbank-core ${sources}) +if(KALDI_NATIVE_FBANK_ENABLE_CHECK) + target_compile_definitions(kaldi-native-fbank-core PUBLIC KNF_ENABLE_CHECK=1) + + if(KNF_HAVE_EXECINFO_H) + target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_EXECINFO_H=1) + endif() + + if(KNF_HAVE_CXXABI_H) + target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_CXXABI_H=1) + endif() +endif() + +# We are using std::call_once() in log.h,which requires us to link with -pthread +if(NOT WIN32 AND KALDI_NATIVE_FBANK_ENABLE_CHECK) + target_link_libraries(kaldi-native-fbank-core -pthread) +endif() + +if(KALDI_NATIVE_FBANK_BUILD_TESTS) + add_executable(test-online-fbank test-online-fbank.cc) + target_link_libraries(test-online-fbank kaldi-native-fbank-core) +endif() + +function(kaldi_native_fbank_add_test source) + get_filename_component(name ${source} NAME_WE) + add_executable(${name} "${source}") + target_link_libraries(${name} + PRIVATE + kaldi-native-fbank-core + gtest + gtest_main + ) + + add_test(NAME "Test.${name}" + COMMAND + $ + ) +endfunction() + +# please sort the source files alphabetically +set(test_srcs + # test-online-feature.cc + test-log.cc + test-rfft.cc +) + +if(KALDI_NATIVE_FBANK_BUILD_TESTS) + foreach(source IN LISTS test_srcs) + kaldi_native_fbank_add_test(${source}) + endforeach() +endif() + +install(TARGETS kaldi-native-fbank-core + DESTINATION lib +) + +if(KALDI_NATIVE_FBANK_BUILD_TESTS) + install(TARGETS test-online-fbank + DESTINATION bin + ) +endif() + +file(MAKE_DIRECTORY + DESTINATION + ${PROJECT_BINARY_DIR}/include/kaldi-native-fbank/csrc +) +file(GLOB_RECURSE all_headers *.h) + +file(COPY + ${all_headers} + DESTINATION + ${PROJECT_BINARY_DIR}/include/kaldi-native-fbank/csrc +) + +install(FILES ${all_headers} + DESTINATION include/kaldi-native-fbank/csrc +) diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-fbank.cc b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-fbank.cc new file mode 100644 index 0000000..4d0d046 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-fbank.cc @@ -0,0 +1,120 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is copied/modified from kaldi/src/feat/feature-fbank.cc +// +#include "feature-fbank.h" + +#include +#include +#include +#include + +#include "feature-functions.h" + +namespace knf { + +static void Sqrt(float *in_out, int32_t n) { + for (int32_t i = 0; i != n; ++i) { + in_out[i] = std::sqrt(in_out[i]); + } +} + +std::ostream &operator<<(std::ostream &os, const FbankOptions &opts) { + os << opts.ToString(); + return os; +} + +FbankComputer::FbankComputer(const FbankOptions &opts) + : opts_(opts), rfft_(opts.frame_opts.PaddedWindowSize()) { + if (opts.energy_floor > 0.0f) { + log_energy_floor_ = logf(opts.energy_floor); + } + + // We'll definitely need the filterbanks info for VTLN warping factor 1.0. + // [note: this call caches it.] + GetMelBanks(1.0f); +} + +FbankComputer::~FbankComputer() { + for (auto iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter) + delete iter->second; +} + +const MelBanks *FbankComputer::GetMelBanks(float vtln_warp) { + MelBanks *this_mel_banks = nullptr; + + // std::map::iterator iter = mel_banks_.find(vtln_warp); + auto iter = mel_banks_.find(vtln_warp); + if (iter == mel_banks_.end()) { + this_mel_banks = new MelBanks(opts_.mel_opts, opts_.frame_opts, vtln_warp); + mel_banks_[vtln_warp] = this_mel_banks; + } else { + this_mel_banks = iter->second; + } + return this_mel_banks; +} + +void FbankComputer::Compute(float signal_raw_log_energy, float vtln_warp, + std::vector *signal_frame, float *feature) { + const MelBanks &mel_banks = *(GetMelBanks(vtln_warp)); + + KNF_CHECK_EQ(signal_frame->size(), opts_.frame_opts.PaddedWindowSize()); + + // Compute energy after window function (not the raw one). + if (opts_.use_energy && !opts_.raw_energy) { + signal_raw_log_energy = std::log( + std::max(InnerProduct(signal_frame->data(), signal_frame->data(), + signal_frame->size()), + std::numeric_limits::epsilon())); + } + rfft_.Compute(signal_frame->data()); // signal_frame is modified in-place + ComputePowerSpectrum(signal_frame); + + // Use magnitude instead of power if requested. + if (!opts_.use_power) { + Sqrt(signal_frame->data(), signal_frame->size() / 2 + 1); + } + + int32_t mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0); + + // Its length is opts_.mel_opts.num_bins + float *mel_energies = feature + mel_offset; + + // Sum with mel filter banks over the power spectrum + mel_banks.Compute(signal_frame->data(), mel_energies); + + if (opts_.use_log_fbank) { + // Avoid log of zero (which should be prevented anyway by dithering). + for (int32_t i = 0; i != opts_.mel_opts.num_bins; ++i) { + auto t = std::max(mel_energies[i], std::numeric_limits::epsilon()); + mel_energies[i] = std::log(t); + } + } + + // Copy energy as first value (or the last, if htk_compat == true). + if (opts_.use_energy) { + if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) { + signal_raw_log_energy = log_energy_floor_; + } + int32_t energy_index = opts_.htk_compat ? 
opts_.mel_opts.num_bins : 0; + feature[energy_index] = signal_raw_log_energy; + } +} + +} // namespace knf diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-fbank.h b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-fbank.h new file mode 100644 index 0000000..7d4edf8 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-fbank.h @@ -0,0 +1,134 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is copied/modified from kaldi/src/feat/feature-fbank.h + +#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_ +#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_ + +#include +#include +#include + +#include "feature-window.h" +#include "mel-computations.h" +#include "rfft.h" + +namespace knf { + +struct FbankOptions { + FrameExtractionOptions frame_opts; + MelBanksOptions mel_opts; + // append an extra dimension with energy to the filter banks + bool use_energy = false; + float energy_floor = 0.0f; // active iff use_energy==true + + // If true, compute log_energy before preemphasis and windowing + // If false, compute log_energy after preemphasis ans windowing + bool raw_energy = true; // active iff use_energy==true + + // If true, put energy last (if using energy) + // If false, put energy first + bool htk_compat = false; // active iff use_energy==true + + // if true (default), produce log-filterbank, else linear + bool use_log_fbank = true; + + // if true (default), use power in filterbank + // analysis, else magnitude. + bool use_power = true; + + FbankOptions() { mel_opts.num_bins = 23; } + + std::string ToString() const { + std::ostringstream os; + os << "frame_opts: \n"; + os << frame_opts << "\n"; + os << "\n"; + + os << "mel_opts: \n"; + os << mel_opts << "\n"; + + os << "use_energy: " << use_energy << "\n"; + os << "energy_floor: " << energy_floor << "\n"; + os << "raw_energy: " << raw_energy << "\n"; + os << "htk_compat: " << htk_compat << "\n"; + os << "use_log_fbank: " << use_log_fbank << "\n"; + os << "use_power: " << use_power << "\n"; + return os.str(); + } +}; + +std::ostream &operator<<(std::ostream &os, const FbankOptions &opts); + +class FbankComputer { + public: + using Options = FbankOptions; + + explicit FbankComputer(const FbankOptions &opts); + ~FbankComputer(); + + int32_t Dim() const { + return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); + } + + // if true, compute log_energy_pre_window but after dithering and dc removal + bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + const FbankOptions &GetOptions() const { return opts_; } + + /** + Function that computes one frame of features from + one frame of signal. 
+ + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VLTN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. It should be pre-allocated. + */ + void Compute(float signal_raw_log_energy, float vtln_warp, + std::vector *signal_frame, float *feature); + + private: + const MelBanks *GetMelBanks(float vtln_warp); + + FbankOptions opts_; + float log_energy_floor_; + std::map mel_banks_; // float is VTLN coefficient. + Rfft rfft_; +}; + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_ diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-functions.cc b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-functions.cc new file mode 100644 index 0000000..002e8d9 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-functions.cc @@ -0,0 +1,49 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is copied/modified from kaldi/src/feat/feature-functions.cc + +#include "feature-functions.h" + +#include +#include + +namespace knf { + +void ComputePowerSpectrum(std::vector *complex_fft) { + int32_t dim = complex_fft->size(); + + // now we have in complex_fft, first half of complex spectrum + // it's stored as [real0, realN/2, real1, im1, real2, im2, ...] + + float *p = complex_fft->data(); + int32_t half_dim = dim / 2; + float first_energy = p[0] * p[0]; + float last_energy = p[1] * p[1]; // handle this special case + + for (int32_t i = 1; i < half_dim; ++i) { + float real = p[i * 2]; + float im = p[i * 2 + 1]; + p[i] = real * real + im * im; + } + p[0] = first_energy; + p[half_dim] = last_energy; // Will actually never be used, and anyway + // if the signal has been bandlimited sensibly this should be zero. 
+} + +} // namespace knf diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-functions.h b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-functions.h new file mode 100644 index 0000000..b221622 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-functions.h @@ -0,0 +1,38 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is copied/modified from kaldi/src/feat/feature-functions.h +#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_ +#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_ + +#include +namespace knf { + +// ComputePowerSpectrum converts a complex FFT (as produced by the FFT +// functions in csrc/rfft.h), and converts it into +// a power spectrum. If the complex FFT is a vector of size n (representing +// half of the complex FFT of a real signal of size n, as described there), +// this function computes in the first (n/2) + 1 elements of it, the +// energies of the fft bins from zero to the Nyquist frequency. Contents of the +// remaining (n/2) - 1 elements are undefined at output. + +void ComputePowerSpectrum(std::vector *complex_fft); + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_ diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-window.cc b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-window.cc new file mode 100644 index 0000000..a1d9ea1 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-window.cc @@ -0,0 +1,235 @@ +// kaldi-native-fbank/csrc/feature-window.cc +// +// Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + +// This file is copied/modified from kaldi/src/feat/feature-window.cc + +#include "feature-window.h" + +#include +#include +#include +#include + +#ifndef M_2PI +#define M_2PI 6.283185307179586476925286766559005 +#endif + +namespace knf { + +std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts) { + os << opts.ToString(); + return os; +} + +FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) + : window_(opts.WindowSize()) { + int32_t frame_length = opts.WindowSize(); + KNF_CHECK_GT(frame_length, 0); + + float *window_data = window_.data(); + + double a = M_2PI / (frame_length - 1); + for (int32_t i = 0; i < frame_length; i++) { + double i_fl = static_cast(i); + if (opts.window_type == "hanning") { + window_data[i] = 0.5 - 0.5 * cos(a * i_fl); + } else if (opts.window_type == "sine") { + // when you are checking ws wikipedia, please + // note that 0.5 * a = M_PI/(frame_length-1) + window_data[i] = sin(0.5 * a * i_fl); + } else if (opts.window_type == "hamming") { + window_data[i] = 0.54 - 0.46 * cos(a * i_fl); + } else if (opts.window_type == + "povey") { // like hamming but goes to zero at edges. 
+ window_data[i] = pow(0.5 - 0.5 * cos(a * i_fl), 0.85); + } else if (opts.window_type == "rectangular") { + window_data[i] = 1.0; + } else if (opts.window_type == "blackman") { + window_data[i] = opts.blackman_coeff - 0.5 * cos(a * i_fl) + + (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl); + } else { + KNF_LOG(FATAL) << "Invalid window type " << opts.window_type; + } + } +} + +void FeatureWindowFunction::Apply(float *wave) const { + int32_t window_size = window_.size(); + const float *p = window_.data(); + for (int32_t k = 0; k != window_size; ++k) { + wave[k] *= p[k]; + } +} + +int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts) { + int64_t frame_shift = opts.WindowShift(); + if (opts.snip_edges) { + return frame * frame_shift; + } else { + int64_t midpoint_of_frame = frame_shift * frame + frame_shift / 2, + beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2; + return beginning_of_frame; + } +} + +int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts, + bool flush /*= true*/) { + int64_t frame_shift = opts.WindowShift(); + int64_t frame_length = opts.WindowSize(); + if (opts.snip_edges) { + // with --snip-edges=true (the default), we use a HTK-like approach to + // determining the number of frames-- all frames have to fit completely into + // the waveform, and the first frame begins at sample zero. + if (num_samples < frame_length) + return 0; + else + return (1 + ((num_samples - frame_length) / frame_shift)); + // You can understand the expression above as follows: 'num_samples - + // frame_length' is how much room we have to shift the frame within the + // waveform; 'frame_shift' is how much we shift it each time; and the ratio + // is how many times we can shift it (integer arithmetic rounds down). + } else { + // if --snip-edges=false, the number of frames is determined by rounding the + // (file-length / frame-shift) to the nearest integer. The point of this + // formula is to make the number of frames an obvious and predictable + // function of the frame shift and signal length, which makes many + // segmentation-related questions simpler. + // + // Because integer division in C++ rounds toward zero, we add (half the + // frame-shift minus epsilon) before dividing, to have the effect of + // rounding towards the closest integer. + int32_t num_frames = (num_samples + (frame_shift / 2)) / frame_shift; + + if (flush) return num_frames; + + // note: 'end' always means the last plus one, i.e. one past the last. + int64_t end_sample_of_last_frame = + FirstSampleOfFrame(num_frames - 1, opts) + frame_length; + + // the following code is optimized more for clarity than efficiency. + // If flush == false, we can't output frames that extend past the end + // of the signal. 
+ while (num_frames > 0 && end_sample_of_last_frame > num_samples) { + num_frames--; + end_sample_of_last_frame -= frame_shift; + } + return num_frames; + } +} + +void ExtractWindow(int64_t sample_offset, const float *wave, std::size_t wave_size, + int32_t f, const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + std::vector *window, + float *log_energy_pre_window /*= nullptr*/) { + KNF_CHECK(sample_offset >= 0 && wave_size != 0); + + int32_t frame_length = opts.WindowSize(); + int32_t frame_length_padded = opts.PaddedWindowSize(); + + int64_t num_samples = sample_offset + wave_size; + int64_t start_sample = FirstSampleOfFrame(f, opts); + int64_t end_sample = start_sample + frame_length; + + if (opts.snip_edges) { + KNF_CHECK(start_sample >= sample_offset && end_sample <= num_samples); + } else { + KNF_CHECK(sample_offset == 0 || start_sample >= sample_offset); + } + + if (window->size() != frame_length_padded) { + window->resize(frame_length_padded); + } + + // wave_start and wave_end are start and end indexes into 'wave', for the + // piece of wave that we're trying to extract. + int32_t wave_start = int32_t(start_sample - sample_offset); + int32_t wave_end = wave_start + frame_length; + + if (wave_start >= 0 && wave_end <= wave_size) { + // the normal case-- no edge effects to consider. + std::copy(wave + wave_start, + wave + wave_start + frame_length, window->data()); + } else { + // Deal with any end effects by reflection, if needed. This code will only + // be reached for about two frames per utterance, so we don't concern + // ourselves excessively with efficiency. + int32_t wave_dim = wave_size; + for (int32_t s = 0; s < frame_length; ++s) { + int32_t s_in_wave = s + wave_start; + while (s_in_wave < 0 || s_in_wave >= wave_dim) { + // reflect around the beginning or end of the wave. + // e.g. -1 -> 0, -2 -> 1. + // dim -> dim - 1, dim + 1 -> dim - 2. + // the code supports repeated reflections, although this + // would only be needed in pathological cases. 
+ if (s_in_wave < 0) + s_in_wave = -s_in_wave - 1; + else + s_in_wave = 2 * wave_dim - 1 - s_in_wave; + } + (*window)[s] = wave[s_in_wave]; + } + } + + ProcessWindow(opts, window_function, window->data(), log_energy_pre_window); +} + +static void RemoveDcOffset(float *d, int32_t n) { + float sum = 0; + for (int32_t i = 0; i != n; ++i) { + sum += d[i]; + } + + float mean = sum / n; + + for (int32_t i = 0; i != n; ++i) { + d[i] -= mean; + } +} + +float InnerProduct(const float *a, const float *b, int32_t n) { + float sum = 0; + for (int32_t i = 0; i != n; ++i) { + sum += a[i] * b[i]; + } + return sum; +} + +static void Preemphasize(float *d, int32_t n, float preemph_coeff) { + if (preemph_coeff == 0.0) { + return; + } + + KNF_CHECK(preemph_coeff >= 0.0 && preemph_coeff <= 1.0); + + for (int32_t i = n - 1; i > 0; --i) { + d[i] -= preemph_coeff * d[i - 1]; + } + d[0] -= preemph_coeff * d[0]; +} + +void ProcessWindow(const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, float *window, + float *log_energy_pre_window /*= nullptr*/) { + int32_t frame_length = opts.WindowSize(); + + if (opts.remove_dc_offset) { + RemoveDcOffset(window, frame_length); + } + + if (log_energy_pre_window != NULL) { + float energy = std::max(InnerProduct(window, window, frame_length), + std::numeric_limits::epsilon()); + *log_energy_pre_window = std::log(energy); + } + + if (opts.preemph_coeff != 0.0) { + Preemphasize(window, frame_length, opts.preemph_coeff); + } + + window_function.Apply(window); +} + +} // namespace knf diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-window.h b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-window.h new file mode 100644 index 0000000..1a0f9ba --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/feature-window.h @@ -0,0 +1,172 @@ +// kaldi-native-fbank/csrc/feature-window.h +// +// Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + +// This file is copied/modified from kaldi/src/feat/feature-window.h + +#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_ +#define KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_ + +#include +#include +#include + +#include "log.h" + +namespace knf { + +inline int32_t RoundUpToNearestPowerOfTwo(int32_t n) { + // copied from kaldi/src/base/kaldi-math.cc + KNF_CHECK_GT(n, 0); + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + return n + 1; +} + +struct FrameExtractionOptions { + float samp_freq = 16000; + float frame_shift_ms = 10.0f; // in milliseconds. + float frame_length_ms = 25.0f; // in milliseconds. + float dither = 1.0f; // Amount of dithering, 0.0 means no dither. + float preemph_coeff = 0.97f; // Preemphasis coefficient. + bool remove_dc_offset = true; // Subtract mean of wave before FFT. + std::string window_type = "povey"; // e.g. Hamming window + // May be "hamming", "rectangular", "povey", "hanning", "sine", "blackman" + // "povey" is a window I made to be similar to Hamming but to go to zero at + // the edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) I just don't think the + // Hamming window makes sense as a windowing function. 
+  bool round_to_power_of_two = true;
+  float blackman_coeff = 0.42f;
+  bool snip_edges = true;
+  // bool allow_downsample = false;
+  // bool allow_upsample = false;
+
+  int32_t WindowShift() const {
+    return static_cast<int32_t>(samp_freq * 0.001f * frame_shift_ms);
+  }
+  int32_t WindowSize() const {
+    return static_cast<int32_t>(samp_freq * 0.001f * frame_length_ms);
+  }
+  int32_t PaddedWindowSize() const {
+    return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize())
+                                  : WindowSize());
+  }
+  std::string ToString() const {
+    std::ostringstream os;
+#define KNF_PRINT(x) os << #x << ": " << x << "\n"
+    KNF_PRINT(samp_freq);
+    KNF_PRINT(frame_shift_ms);
+    KNF_PRINT(frame_length_ms);
+    KNF_PRINT(dither);
+    KNF_PRINT(preemph_coeff);
+    KNF_PRINT(remove_dc_offset);
+    KNF_PRINT(window_type);
+    KNF_PRINT(round_to_power_of_two);
+    KNF_PRINT(blackman_coeff);
+    KNF_PRINT(snip_edges);
+    // KNF_PRINT(allow_downsample);
+    // KNF_PRINT(allow_upsample);
+#undef KNF_PRINT
+    return os.str();
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts);
+
+class FeatureWindowFunction {
+ public:
+  FeatureWindowFunction() = default;
+  explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
+  /**
+   * @param wave Pointer to a 1-D array of shape [window_size].
+   *             It is modified in-place: wave[i] = wave[i] * window_[i].
+   * @param
+   */
+  void Apply(float *wave) const;
+
+ private:
+  std::vector<float> window_;  // of size opts.WindowSize()
+};
+
+int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts);
+
+/**
+   This function returns the number of frames that we can extract from a wave
+   file with the given number of samples in it (assumed to have the same
+   sampling rate as specified in 'opts').
+
+      @param [in] num_samples  The number of samples in the wave file.
+      @param [in] opts  The frame-extraction options class
+
+      @param [in] flush  True if we are asserting that this number of samples
+          is 'all there is', false if we are expecting more data to possibly
+          come in. This only makes a difference to the answer
+          if opts.snip_edges == false. For offline feature extraction you always
+          want flush == true. In an online-decoding context, once you know (or
+          decide) that no more data is coming in, you'd call it with
+          flush == true at the end to flush out any remaining data.
+*/
+int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
+                  bool flush = true);
+
+/*
+  ExtractWindow() extracts a windowed frame of waveform (possibly with a
+  power-of-two, padded size, depending on the config), including all the
+  processing done by ProcessWindow().
+
+  @param [in] sample_offset  If 'wave' is not the entire waveform, but
+                   part of it to the left has been discarded, then the
+                   number of samples prior to 'wave' that we have
+                   already discarded.  Set this to zero if you are
+                   processing the entire waveform in one piece, or
+                   if you get 'no matching function' compilation
+                   errors when updating the code.
+  @param [in] wave  The waveform
+  @param [in] f     The frame index to be extracted, with
+                    0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true)
+  @param [in] opts  The options class to be used
+  @param [in] window_function  The windowing function, as derived from the
+                    options class.
+  @param [out] window  The windowed, possibly-padded waveform to be
+                    extracted.  Will be resized as needed.
+  @param [out] log_energy_pre_window  If non-NULL, the log-energy of
+                    the signal prior to pre-emphasis and multiplying by
+                    the windowing function will be written to here.
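As a quick numeric check of the frame geometry described above (16 kHz, 25 ms window, 10 ms shift), the sketch below reproduces the WindowShift()/WindowSize()/PaddedWindowSize() arithmetic; the snip_edges frame count uses the usual Kaldi convention, 1 + (num_samples - window) / shift, which is an assumption here since NumFrames() is only declared at this point in the header.

#include <cstdint>
#include <cstdio>

int main() {
  const float samp_freq = 16000.0f, frame_shift_ms = 10.0f, frame_length_ms = 25.0f;
  const int32_t shift = static_cast<int32_t>(samp_freq * 0.001f * frame_shift_ms);    // 160
  const int32_t window = static_cast<int32_t>(samp_freq * 0.001f * frame_length_ms);  // 400
  int32_t padded = 1;
  while (padded < window) padded <<= 1;  // RoundUpToNearestPowerOfTwo(400) == 512
  const int64_t num_samples = 16000;     // one second of audio
  // assumed snip_edges == true convention; not copied from NumFrames()
  const int64_t num_frames =
      num_samples < window ? 0 : 1 + (num_samples - window) / shift;
  std::printf("shift=%d window=%d padded=%d frames=%lld\n",
              shift, window, padded, (long long)num_frames);
  return 0;
}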
+*/
+void ExtractWindow(int64_t sample_offset, const float *wave, std::size_t wave_size,
+                   int32_t f, const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function,
+                   std::vector<float> *window,
+                   float *log_energy_pre_window = nullptr);
+
+/**
+  This function does all the windowing steps after actually
+  extracting the windowed signal: depending on the
+  configuration, it does dithering, dc offset removal,
+  preemphasis, and multiplication by the windowing function.
+   @param [in] opts  The options class to be used
+   @param [in] window_function  The windowing function-- should have
+                    been initialized using 'opts'.
+   @param [in,out] window  A vector of size opts.WindowSize().  Note:
+       it will typically be a sub-vector of a larger vector of size
+       opts.PaddedWindowSize(), with the remaining samples zero,
+       as the FFT code is more efficient if it operates on data with
+       power-of-two size.
+   @param [out] log_energy_pre_window If non-NULL, then after dithering and
+       DC offset removal, this function will write to this pointer the log of
+       the total energy (i.e. sum-squared) of the frame.
+ */
+void ProcessWindow(const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function, float *window,
+                   float *log_energy_pre_window = nullptr);
+
+// Compute the inner product of two vectors
+float InnerProduct(const float *a, const float *b, int32_t n);
+
+} // namespace knf
+
+#endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_
diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/fftsg.c b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/fftsg.c
new file mode 100644
index 0000000..eeb8ee3
--- /dev/null
+++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/fftsg.c
@@ -0,0 +1,2975 @@
+/* This file is copied from
+ *
+ * https://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
+ *
+ * Copyright Takuya OOURA, 1996-2001
+ *
+ * You may use, copy, modify and distribute this code for any
+ * purpose (include commercial use) and without fee. Please refer to
+ * this package when you modify this code.
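A hedged end-to-end sketch of the API declared in feature-window.h above, assuming the kaldi-native-fbank sources in this patch are compiled together and the header is reachable on the include path; ExtractWindow() also runs ProcessWindow() internally (DC removal, optional log-energy, preemphasis, windowing).

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

#include "kaldi-native-fbank/csrc/feature-window.h"  // include path is an assumption

int main() {
  knf::FrameExtractionOptions opts;            // 16 kHz, 25 ms window, 10 ms shift defaults
  knf::FeatureWindowFunction window_fn(opts);  // built from the options above

  std::vector<float> wave(16000);              // one second of a 440 Hz tone
  for (std::size_t i = 0; i < wave.size(); ++i)
    wave[i] = std::sin(2.0f * 3.14159265f * 440.0f * i / opts.samp_freq);

  int32_t n = knf::NumFrames(wave.size(), opts, /*flush=*/true);
  std::vector<float> frame;
  float log_energy = 0.0f;
  for (int32_t f = 0; f < n; ++f) {
    knf::ExtractWindow(/*sample_offset=*/0, wave.data(), wave.size(), f, opts,
                       window_fn, &frame, &log_energy);
  }
  std::printf("frames=%d padded_frame_size=%zu last_log_energy=%f\n",
              (int)n, frame.size(), log_energy);
  return 0;
}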
+ */ +/* +Fast Fourier/Cosine/Sine Transform + dimension :one + data length :power of 2 + decimation :frequency + radix :split-radix + data :inplace + table :use +functions + cdft: Complex Discrete Fourier Transform + rdft: Real Discrete Fourier Transform + ddct: Discrete Cosine Transform + ddst: Discrete Sine Transform + dfct: Cosine Transform of RDFT (Real Symmetric DFT) + dfst: Sine Transform of RDFT (Real Anti-symmetric DFT) +function prototypes + void cdft(int, int, double *, int *, double *); + void rdft(int, int, double *, int *, double *); + void ddct(int, int, double *, int *, double *); + void ddst(int, int, double *, int *, double *); + void dfct(int, double *, double *, int *, double *); + void dfst(int, double *, double *, int *, double *); +macro definitions + USE_CDFT_PTHREADS : default=not defined + CDFT_THREADS_BEGIN_N : must be >= 512, default=8192 + CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536 + USE_CDFT_WINTHREADS : default=not defined + CDFT_THREADS_BEGIN_N : must be >= 512, default=32768 + CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288 + + +-------- Complex DFT (Discrete Fourier Transform) -------- + [definition] + + X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k + X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k + ip[0] = 0; // first time only + cdft(2*n, 1, a, ip, w); + + ip[0] = 0; // first time only + cdft(2*n, -1, a, ip, w); + [parameters] + 2*n :data length (int) + n >= 1, n = power of 2 + a[0...2*n-1] :input/output data (double *) + input data + a[2*j] = Re(x[j]), + a[2*j+1] = Im(x[j]), 0<=j= 2+sqrt(n) + strictly, + length of ip >= + 2+(1<<(int)(log(n+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n/2-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + cdft(2*n, -1, a, ip, w); + is + cdft(2*n, 1, a, ip, w); + for (j = 0; j <= 2 * n - 1; j++) { + a[j] *= 1.0 / n; + } + . + + +-------- Real DFT / Inverse of Real DFT -------- + [definition] + RDFT + R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2 + I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0 IRDFT (excluding scale) + a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + + sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + + sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k + ip[0] = 0; // first time only + rdft(n, 1, a, ip, w); + + ip[0] = 0; // first time only + rdft(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + + output data + a[2*k] = R[k], 0<=k + input data + a[2*j] = R[j], 0<=j= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n/2-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + rdft(n, 1, a, ip, w); + is + rdft(n, -1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- DCT (Discrete Cosine Transform) / Inverse of DCT -------- + [definition] + IDCT (excluding scale) + C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k DCT + C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k + ip[0] = 0; // first time only + ddct(n, 1, a, ip, w); + + ip[0] = 0; // first time only + ddct(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + output data + a[k] = C[k], 0<=k= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. 
+ w[0...n*5/4-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + ddct(n, -1, a, ip, w); + is + a[0] *= 0.5; + ddct(n, 1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- DST (Discrete Sine Transform) / Inverse of DST -------- + [definition] + IDST (excluding scale) + S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k DST + S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0 + ip[0] = 0; // first time only + ddst(n, 1, a, ip, w); + + ip[0] = 0; // first time only + ddst(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + + input data + a[j] = A[j], 0 + output data + a[k] = S[k], 0= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/4-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + ddst(n, -1, a, ip, w); + is + a[0] *= 0.5; + ddst(n, 1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- Cosine Transform of RDFT (Real Symmetric DFT) -------- + [definition] + C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n + [usage] + ip[0] = 0; // first time only + dfct(n, a, t, ip, w); + [parameters] + n :data length - 1 (int) + n >= 2, n = power of 2 + a[0...n] :input/output data (double *) + output data + a[k] = C[k], 0<=k<=n + t[0...n/2] :work area (double *) + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/4) + strictly, + length of ip >= + 2+(1<<(int)(log(n/4+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/8-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + a[0] *= 0.5; + a[n] *= 0.5; + dfct(n, a, t, ip, w); + is + a[0] *= 0.5; + a[n] *= 0.5; + dfct(n, a, t, ip, w); + for (j = 0; j <= n; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- Sine Transform of RDFT (Real Anti-symmetric DFT) -------- + [definition] + S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + output data + a[k] = S[k], 0= 2+sqrt(n/4) + strictly, + length of ip >= + 2+(1<<(int)(log(n/4+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/8-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + dfst(n, a, t, ip, w); + is + dfst(n, a, t, ip, w); + for (j = 1; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +Appendix : + The cos/sin table is recalculated when the larger table required. + w[] and ip[] are compatible with all routines. 
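Following the [remark] sections above, a minimal round-trip check for rdft(): the forward transform, the inverse, and a 2.0/n rescale reproduce the input. The prototype is declared by hand; the extern "C" wrapper assumes fftsg.c is built as C while this sketch is compiled as C++.

#include <cstdio>

extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);

int main() {
  const int n = 8;                       // data length: power of two, n >= 2
  double a[n] = {1, 2, 3, 4, 5, 6, 7, 8};
  double orig[n];
  for (int j = 0; j < n; ++j) orig[j] = a[j];

  int ip[8] = {0};                       // ip[0] = 0 forces cos/sin table initialization
  double w[n / 2];                       // cos/sin table, sized per the comment above

  rdft(n, 1, a, ip, w);                  // forward real DFT, in place
  rdft(n, -1, a, ip, w);                 // inverse real DFT
  for (int j = 0; j < n; ++j) a[j] *= 2.0 / n;  // scaling from the [remark]

  for (int j = 0; j < n; ++j)
    std::printf("a[%d] = %.6f (expected %.1f)\n", j, a[j], orig[j]);
  return 0;
}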
+*/ + + + +void rdft(int n, int isgn, double *a, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void rftbsub(int n, double *a, int nc, double *c); + int nw, nc; + double xi; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 2)) { + nc = n >> 2; + makect(nc, ip, w + nw); + } + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xi = a[0] - a[1]; + a[0] += a[1]; + a[1] = xi; + } else { + a[1] = 0.5 * (a[0] - a[1]); + a[0] -= a[1]; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } +} + + +/* -------- initializing routines -------- */ + + +#include + +void makewt(int nw, int *ip, double *w) +{ + void makeipt(int nw, int *ip); + int j, nwh, nw0, nw1; + double delta, wn4r, wk1r, wk1i, wk3r, wk3i; + + ip[0] = nw; + ip[1] = 1; + if (nw > 2) { + nwh = nw >> 1; + delta = atan(1.0) / nwh; + wn4r = cos(delta * nwh); + w[0] = 1; + w[1] = wn4r; + if (nwh == 4) { + w[2] = cos(delta * 2); + w[3] = sin(delta * 2); + } else if (nwh > 4) { + makeipt(nw, ip); + w[2] = 0.5 / cos(delta * 2); + w[3] = 0.5 / cos(delta * 6); + for (j = 4; j < nwh; j += 4) { + w[j] = cos(delta * j); + w[j + 1] = sin(delta * j); + w[j + 2] = cos(3 * delta * j); + w[j + 3] = -sin(3 * delta * j); + } + } + nw0 = 0; + while (nwh > 2) { + nw1 = nw0 + nwh; + nwh >>= 1; + w[nw1] = 1; + w[nw1 + 1] = wn4r; + if (nwh == 4) { + wk1r = w[nw0 + 4]; + wk1i = w[nw0 + 5]; + w[nw1 + 2] = wk1r; + w[nw1 + 3] = wk1i; + } else if (nwh > 4) { + wk1r = w[nw0 + 4]; + wk3r = w[nw0 + 6]; + w[nw1 + 2] = 0.5 / wk1r; + w[nw1 + 3] = 0.5 / wk3r; + for (j = 4; j < nwh; j += 4) { + wk1r = w[nw0 + 2 * j]; + wk1i = w[nw0 + 2 * j + 1]; + wk3r = w[nw0 + 2 * j + 2]; + wk3i = w[nw0 + 2 * j + 3]; + w[nw1 + j] = wk1r; + w[nw1 + j + 1] = wk1i; + w[nw1 + j + 2] = wk3r; + w[nw1 + j + 3] = wk3i; + } + } + nw0 = nw1; + } + } +} + + +void makeipt(int nw, int *ip) +{ + int j, l, m, m2, p, q; + + ip[2] = 0; + ip[3] = 16; + m = 2; + for (l = nw; l > 32; l >>= 2) { + m2 = m << 1; + q = m2 << 3; + for (j = m; j < m2; j++) { + p = ip[j] << 2; + ip[m + j] = p; + ip[m2 + j] = p + q; + } + m = m2; + } +} + + +void makect(int nc, int *ip, double *c) +{ + int j, nch; + double delta; + + ip[1] = nc; + if (nc > 1) { + nch = nc >> 1; + delta = atan(1.0) / nch; + c[0] = cos(delta * nch); + c[nch] = 0.5 * c[0]; + for (j = 1; j < nch; j++) { + c[j] = 0.5 * cos(delta * j); + c[nc - j] = 0.5 * sin(delta * j); + } + } +} + + +/* -------- child routines -------- */ + + +#ifdef USE_CDFT_PTHREADS +#define USE_CDFT_THREADS +#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 8192 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 65536 +#endif +#include +#include +#include +#define cdft_thread_t pthread_t +#define cdft_thread_create(thp,func,argp) { \ + if (pthread_create(thp, NULL, func, (void *) argp) != 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ +} +#define cdft_thread_wait(th) { \ + if (pthread_join(th, NULL) != 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ +} +#endif /* USE_CDFT_PTHREADS */ + + +#ifdef USE_CDFT_WINTHREADS +#define USE_CDFT_THREADS 
+#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 32768 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 524288 +#endif +#include +#include +#include +#define cdft_thread_t HANDLE +#define cdft_thread_create(thp,func,argp) { \ + DWORD thid; \ + *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \ + if (*(thp) == 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ +} +#define cdft_thread_wait(th) { \ + WaitForSingleObject(th, INFINITE); \ + CloseHandle(th); \ +} +#endif /* USE_CDFT_WINTHREADS */ + + +void cftfsub(int n, double *a, int *ip, int nw, double *w) +{ + void bitrv2(int n, int *ip, double *a); + void bitrv216(double *a); + void bitrv208(double *a); + void cftf1st(int n, double *a, double *w); + void cftrec4(int n, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftfx41(int n, double *a, int nw, double *w); + void cftf161(double *a, double *w); + void cftf081(double *a, double *w); + void cftf040(double *a); + void cftx020(double *a); +#ifdef USE_CDFT_THREADS + void cftrec4_th(int n, double *a, int nw, double *w); +#endif /* USE_CDFT_THREADS */ + + if (n > 8) { + if (n > 32) { + cftf1st(n, a, &w[nw - (n >> 2)]); +#ifdef USE_CDFT_THREADS + if (n > CDFT_THREADS_BEGIN_N) { + cftrec4_th(n, a, nw, w); + } else +#endif /* USE_CDFT_THREADS */ + if (n > 512) { + cftrec4(n, a, nw, w); + } else if (n > 128) { + cftleaf(n, 1, a, nw, w); + } else { + cftfx41(n, a, nw, w); + } + bitrv2(n, ip, a); + } else if (n == 32) { + cftf161(a, &w[nw - 8]); + bitrv216(a); + } else { + cftf081(a, w); + bitrv208(a); + } + } else if (n == 8) { + cftf040(a); + } else if (n == 4) { + cftx020(a); + } +} + + +void cftbsub(int n, double *a, int *ip, int nw, double *w) +{ + void bitrv2conj(int n, int *ip, double *a); + void bitrv216neg(double *a); + void bitrv208neg(double *a); + void cftb1st(int n, double *a, double *w); + void cftrec4(int n, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftfx41(int n, double *a, int nw, double *w); + void cftf161(double *a, double *w); + void cftf081(double *a, double *w); + void cftb040(double *a); + void cftx020(double *a); +#ifdef USE_CDFT_THREADS + void cftrec4_th(int n, double *a, int nw, double *w); +#endif /* USE_CDFT_THREADS */ + + if (n > 8) { + if (n > 32) { + cftb1st(n, a, &w[nw - (n >> 2)]); +#ifdef USE_CDFT_THREADS + if (n > CDFT_THREADS_BEGIN_N) { + cftrec4_th(n, a, nw, w); + } else +#endif /* USE_CDFT_THREADS */ + if (n > 512) { + cftrec4(n, a, nw, w); + } else if (n > 128) { + cftleaf(n, 1, a, nw, w); + } else { + cftfx41(n, a, nw, w); + } + bitrv2conj(n, ip, a); + } else if (n == 32) { + cftf161(a, &w[nw - 8]); + bitrv216neg(a); + } else { + cftf081(a, w); + bitrv208neg(a); + } + } else if (n == 8) { + cftb040(a); + } else if (n == 4) { + cftx020(a); + } +} + + +void bitrv2(int n, int *ip, double *a) +{ + int j, j1, k, k1, l, m, nh, nm; + double xr, xi, yr, yi; + + m = 1; + for (l = n >> 2; l > 8; l >>= 2) { + m <<= 1; + } + nh = n >> 1; + nm = 4 * m; + if (l == 8) { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + 2 * ip[m + k]; + k1 = 4 * k + 2 * ip[m + j]; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; 
+ k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m + k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; 
+ xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + ip[m + k]; + j1 = k1 + 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + } +} + + +void bitrv2conj(int n, int *ip, double *a) +{ + int j, j1, k, k1, l, m, nh, nm; + double xr, xi, yr, yi; + + m = 1; + for (l = n >> 2; l > 8; l >>= 2) { + m <<= 1; + } + nh = n >> 1; + nm = 4 * m; + if (l == 8) { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + 2 * ip[m + k]; + k1 = 4 * k + 2 * ip[m + j]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + 
j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m + k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += nm; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + } + } +} + + +void bitrv216(double *a) +{ + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, + x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i, + x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i; + + x1r = a[2]; + x1i = a[3]; + x2r = 
a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + a[2] = x8r; + a[3] = x8i; + a[4] = x4r; + a[5] = x4i; + a[6] = x12r; + a[7] = x12i; + a[8] = x2r; + a[9] = x2i; + a[10] = x10r; + a[11] = x10i; + a[14] = x14r; + a[15] = x14i; + a[16] = x1r; + a[17] = x1i; + a[20] = x5r; + a[21] = x5i; + a[22] = x13r; + a[23] = x13i; + a[24] = x3r; + a[25] = x3i; + a[26] = x11r; + a[27] = x11i; + a[28] = x7r; + a[29] = x7i; +} + + +void bitrv216neg(double *a) +{ + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, + x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i, + x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i, + x13r, x13i, x14r, x14i, x15r, x15i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x9r = a[18]; + x9i = a[19]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + x15r = a[30]; + x15i = a[31]; + a[2] = x15r; + a[3] = x15i; + a[4] = x7r; + a[5] = x7i; + a[6] = x11r; + a[7] = x11i; + a[8] = x3r; + a[9] = x3i; + a[10] = x13r; + a[11] = x13i; + a[12] = x5r; + a[13] = x5i; + a[14] = x9r; + a[15] = x9i; + a[16] = x1r; + a[17] = x1i; + a[18] = x14r; + a[19] = x14i; + a[20] = x6r; + a[21] = x6i; + a[22] = x10r; + a[23] = x10i; + a[24] = x2r; + a[25] = x2i; + a[26] = x12r; + a[27] = x12i; + a[28] = x4r; + a[29] = x4i; + a[30] = x8r; + a[31] = x8i; +} + + +void bitrv208(double *a) +{ + double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i; + + x1r = a[2]; + x1i = a[3]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x6r = a[12]; + x6i = a[13]; + a[2] = x4r; + a[3] = x4i; + a[6] = x6r; + a[7] = x6i; + a[8] = x1r; + a[9] = x1i; + a[12] = x3r; + a[13] = x3i; +} + + +void bitrv208neg(double *a) +{ + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, + x5r, x5i, x6r, x6i, x7r, x7i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + a[2] = x7r; + a[3] = x7i; + a[4] = x3r; + a[5] = x3i; + a[6] = x5r; + a[7] = x5i; + a[8] = x1r; + a[9] = x1i; + a[10] = x6r; + a[11] = x6i; + a[12] = x2r; + a[13] = x2i; + a[14] = x4r; + a[15] = x4i; +} + + +void cftf1st(int n, double *a, double *w) +{ + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, + wd1r, wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + csc1 = w[2]; + csc3 = w[3]; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + k = 0; + for (j = 2; j < mh - 2; j += 4) { + k += 4; + wk1r = csc1 * (wd1r + w[k]); + wk1i = 
csc1 * (wd1i + w[k + 1]); + wk3r = csc3 * (wd3r + w[k + 2]); + wk3i = csc3 * (wd3i + w[k + 3]); + wd1r = w[k]; + wd1i = w[k + 1]; + wd3r = w[k + 2]; + wd3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + y0r = a[j + 2] + a[j2 + 2]; + y0i = a[j + 3] + a[j2 + 3]; + y1r = a[j + 2] - a[j2 + 2]; + y1i = a[j + 3] - a[j2 + 3]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 + 2] + a[j3 + 2]; + y2i = a[j1 + 3] + a[j3 + 3]; + y3r = a[j1 + 2] - a[j3 + 2]; + y3i = a[j1 + 3] - a[j3 + 3]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = a[j0 - 1] + a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = a[j0 - 1] + a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i + x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; 
+ x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = a[j0 + 3] + a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = a[j0 + 3] - a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i + x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + + +void cftb1st(int n, double *a, double *w) +{ + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, + wd1r, wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = -a[1] - a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = -a[1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j2] = x1r + x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r - x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + csc1 = w[2]; + csc3 = w[3]; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + k = 0; + for (j = 2; j < mh - 2; j += 4) { + k += 4; + wk1r = csc1 * (wd1r + w[k]); + wk1i = csc1 * (wd1i + w[k + 1]); + wk3r = csc3 * (wd3r + w[k + 2]); + wk3i = csc3 * (wd3i + w[k + 3]); + wd1r = w[k]; + wd1i = w[k + 1]; + wd3r = w[k + 2]; + wd3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = -a[j + 1] - a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = -a[j + 1] + a[j2 + 1]; + y0r = a[j + 2] + a[j2 + 2]; + y0i = -a[j + 3] - a[j2 + 3]; + y1r = a[j + 2] - a[j2 + 2]; + y1i = -a[j + 3] + a[j2 + 3]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 + 2] + a[j3 + 2]; + y2i = a[j1 + 3] + a[j3 + 3]; + y3r = a[j1 + 2] - a[j3 + 2]; + y3i = a[j1 + 3] - a[j3 + 3]; + a[j] = x0r + x2r; + a[j + 1] = x0i - x2i; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i - y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i + y2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = -a[j0 - 1] - a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + 
a[j0 + 1] = x0i - x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i - y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i + y2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = -a[j0 - 1] - a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i - x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = -a[j0 + 3] - a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = -a[j0 + 3] + a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i - x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + + +#ifdef USE_CDFT_THREADS +struct cdft_arg_st { + int n0; + int n; + double *a; + int nw; + double *w; +}; +typedef struct cdft_arg_st cdft_arg_t; + + +void cftrec4_th(int n, double *a, int nw, double *w) +{ + void *cftrec1_th(void *p); + void *cftrec2_th(void *p); + int i, idiv4, m, nthread; + cdft_thread_t th[4]; + cdft_arg_t ag[4]; + + nthread = 2; + idiv4 = 0; + m = n >> 1; + if (n > CDFT_4THREADS_BEGIN_N) { + nthread = 4; + idiv4 = 1; + m >>= 1; + } + for (i = 0; i < nthread; i++) { + ag[i].n0 = n; + ag[i].n = m; + ag[i].a = &a[i * m]; + ag[i].nw = nw; + ag[i].w = w; + if (i != idiv4) { + cdft_thread_create(&th[i], cftrec1_th, &ag[i]); + } else { + cdft_thread_create(&th[i], cftrec2_th, &ag[i]); + } + } + for (i = 0; i < nthread; i++) { + cdft_thread_wait(th[i]); + } +} + + +void *cftrec1_th(void *p) +{ + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl1(int n, double *a, double *w); + int isplt, j, k, m, n, n0, nw; + double *a, *w; + + n0 = 
((cdft_arg_t *) p)->n0; + n = ((cdft_arg_t *) p)->n; + a = ((cdft_arg_t *) p)->a; + nw = ((cdft_arg_t *) p)->nw; + w = ((cdft_arg_t *) p)->w; + m = n0; + while (m > 512) { + m >>= 2; + cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); + } + cftleaf(m, 1, &a[n - m], nw, w); + k = 0; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } + return (void *) 0; +} + + +void *cftrec2_th(void *p) +{ + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl2(int n, double *a, double *w); + int isplt, j, k, m, n, n0, nw; + double *a, *w; + + n0 = ((cdft_arg_t *) p)->n0; + n = ((cdft_arg_t *) p)->n; + a = ((cdft_arg_t *) p)->a; + nw = ((cdft_arg_t *) p)->nw; + w = ((cdft_arg_t *) p)->w; + k = 1; + m = n0; + while (m > 512) { + m >>= 2; + k <<= 2; + cftmdl2(m, &a[n - m], &w[nw - m]); + } + cftleaf(m, 0, &a[n - m], nw, w); + k >>= 1; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } + return (void *) 0; +} +#endif /* USE_CDFT_THREADS */ + + +void cftrec4(int n, double *a, int nw, double *w) +{ + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl1(int n, double *a, double *w); + int isplt, j, k, m; + + m = n; + while (m > 512) { + m >>= 2; + cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); + } + cftleaf(m, 1, &a[n - m], nw, w); + k = 0; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } +} + + +int cfttree(int n, int j, int k, double *a, int nw, double *w) +{ + void cftmdl1(int n, double *a, double *w); + void cftmdl2(int n, double *a, double *w); + int i, isplt, m; + + if ((k & 3) != 0) { + isplt = k & 1; + if (isplt != 0) { + cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]); + } else { + cftmdl2(n, &a[j - n], &w[nw - n]); + } + } else { + m = n; + for (i = k; (i & 3) == 0; i >>= 2) { + m <<= 2; + } + isplt = i & 1; + if (isplt != 0) { + while (m > 128) { + cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]); + m >>= 2; + } + } else { + while (m > 128) { + cftmdl2(m, &a[j - m], &w[nw - m]); + m >>= 2; + } + } + } + return isplt; +} + + +void cftleaf(int n, int isplt, double *a, int nw, double *w) +{ + void cftmdl1(int n, double *a, double *w); + void cftmdl2(int n, double *a, double *w); + void cftf161(double *a, double *w); + void cftf162(double *a, double *w); + void cftf081(double *a, double *w); + void cftf082(double *a, double *w); + + if (n == 512) { + cftmdl1(128, a, &w[nw - 64]); + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + cftmdl2(128, &a[128], &w[nw - 128]); + cftf161(&a[128], &w[nw - 8]); + cftf162(&a[160], &w[nw - 32]); + cftf161(&a[192], &w[nw - 8]); + cftf162(&a[224], &w[nw - 32]); + cftmdl1(128, &a[256], &w[nw - 64]); + cftf161(&a[256], &w[nw - 8]); + cftf162(&a[288], &w[nw - 32]); + cftf161(&a[320], &w[nw - 8]); + cftf161(&a[352], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(128, &a[384], &w[nw - 64]); + cftf161(&a[480], &w[nw - 8]); + } else { + cftmdl2(128, &a[384], &w[nw - 128]); + cftf162(&a[480], &w[nw - 32]); + } + cftf161(&a[384], &w[nw - 8]); + cftf162(&a[416], &w[nw - 32]); + cftf161(&a[448], &w[nw - 8]); + } else { + cftmdl1(64, a, &w[nw - 32]); + cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], 
&w[nw - 8]); + cftmdl2(64, &a[64], &w[nw - 64]); + cftf081(&a[64], &w[nw - 8]); + cftf082(&a[80], &w[nw - 8]); + cftf081(&a[96], &w[nw - 8]); + cftf082(&a[112], &w[nw - 8]); + cftmdl1(64, &a[128], &w[nw - 32]); + cftf081(&a[128], &w[nw - 8]); + cftf082(&a[144], &w[nw - 8]); + cftf081(&a[160], &w[nw - 8]); + cftf081(&a[176], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(64, &a[192], &w[nw - 32]); + cftf081(&a[240], &w[nw - 8]); + } else { + cftmdl2(64, &a[192], &w[nw - 64]); + cftf082(&a[240], &w[nw - 8]); + } + cftf081(&a[192], &w[nw - 8]); + cftf082(&a[208], &w[nw - 8]); + cftf081(&a[224], &w[nw - 8]); + } +} + + +void cftmdl1(int n, double *a, double *w) +{ + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, wk1r, wk1i, wk3r, wk3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + k = 0; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + } + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); +} + + +void cftmdl2(int n, double *a, double *w) +{ + int j, j0, j1, j2, j3, k, kr, m, mh; + double wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i; + + mh = n >> 3; + m = 2 * mh; + wn4r = w[1]; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] - a[j2 + 1]; + x0i = a[1] + a[j2]; + x1r = a[0] + a[j2 + 1]; + x1i = a[1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = 
a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wn4r * (x2r - x2i); + y0i = wn4r * (x2i + x2r); + a[0] = x0r + y0r; + a[1] = x0i + y0i; + a[j1] = x0r - y0r; + a[j1 + 1] = x0i - y0i; + y0r = wn4r * (x3r - x3i); + y0i = wn4r * (x3i + x3r); + a[j2] = x1r - y0i; + a[j2 + 1] = x1i + y0r; + a[j3] = x1r + y0i; + a[j3 + 1] = x1i - y0r; + k = 0; + kr = 2 * m; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + kr -= 4; + wd1i = w[kr]; + wd1r = w[kr + 1]; + wd3i = w[kr + 2]; + wd3r = w[kr + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] - a[j2 + 1]; + x0i = a[j + 1] + a[j2]; + x1r = a[j] + a[j2 + 1]; + x1i = a[j + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wd1r * x2r - wd1i * x2i; + y2i = wd1r * x2i + wd1i * x2r; + a[j] = y0r + y2r; + a[j + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk3r * x1r + wk3i * x1i; + y0i = wk3r * x1i - wk3i * x1r; + y2r = wd3r * x3r + wd3i * x3i; + y2i = wd3r * x3i - wd3i * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wd1i * x0r - wd1r * x0i; + y0i = wd1i * x0i + wd1r * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wd3i * x1r + wd3r * x1i; + y0i = wd3i * x1i - wd3r * x1r; + y2r = wk3i * x3r + wk3r * x3i; + y2i = wk3i * x3i - wk3r * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + } + wk1r = w[m]; + wk1i = w[m + 1]; + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk1i * x1r - wk1r * x1i; + y0i = wk1i * x1i + wk1r * x1r; + y2r = wk1r * x3r - wk1i * x3i; + y2i = wk1r * x3i + wk1i * x3r; + a[j2] = y0r - y2r; + a[j2 + 1] = y0i - y2i; + a[j3] = y0r + y2r; + a[j3 + 1] = y0i + y2i; +} + + +void cftfx41(int n, double *a, int nw, double *w) +{ + void cftf161(double *a, double *w); + void cftf162(double *a, double *w); + void cftf081(double *a, double *w); + void cftf082(double *a, double *w); + + if (n == 128) { + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + } else { + cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], &w[nw - 8]); + } +} + + +void cftf161(double *a, double *w) +{ + double wn4r, wk1r, wk1i, + x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, + y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, + y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i; + 
+ wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + x0r = a[0] + a[16]; + x0i = a[1] + a[17]; + x1r = a[0] - a[16]; + x1i = a[1] - a[17]; + x2r = a[8] + a[24]; + x2i = a[9] + a[25]; + x3r = a[8] - a[24]; + x3i = a[9] - a[25]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y4r = x0r - x2r; + y4i = x0i - x2i; + y8r = x1r - x3i; + y8i = x1i + x3r; + y12r = x1r + x3i; + y12i = x1i - x3r; + x0r = a[2] + a[18]; + x0i = a[3] + a[19]; + x1r = a[2] - a[18]; + x1i = a[3] - a[19]; + x2r = a[10] + a[26]; + x2i = a[11] + a[27]; + x3r = a[10] - a[26]; + x3i = a[11] - a[27]; + y1r = x0r + x2r; + y1i = x0i + x2i; + y5r = x0r - x2r; + y5i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y9r = wk1r * x0r - wk1i * x0i; + y9i = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y13r = wk1i * x0r - wk1r * x0i; + y13i = wk1i * x0i + wk1r * x0r; + x0r = a[4] + a[20]; + x0i = a[5] + a[21]; + x1r = a[4] - a[20]; + x1i = a[5] - a[21]; + x2r = a[12] + a[28]; + x2i = a[13] + a[29]; + x3r = a[12] - a[28]; + x3i = a[13] - a[29]; + y2r = x0r + x2r; + y2i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y10r = wn4r * (x0r - x0i); + y10i = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + y14r = wn4r * (x0r + x0i); + y14i = wn4r * (x0i - x0r); + x0r = a[6] + a[22]; + x0i = a[7] + a[23]; + x1r = a[6] - a[22]; + x1i = a[7] - a[23]; + x2r = a[14] + a[30]; + x2i = a[15] + a[31]; + x3r = a[14] - a[30]; + x3i = a[15] - a[31]; + y3r = x0r + x2r; + y3i = x0i + x2i; + y7r = x0r - x2r; + y7i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y11r = wk1i * x0r - wk1r * x0i; + y11i = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y15r = wk1r * x0r - wk1i * x0i; + y15i = wk1r * x0i + wk1i * x0r; + x0r = y12r - y14r; + x0i = y12i - y14i; + x1r = y12r + y14r; + x1i = y12i + y14i; + x2r = y13r - y15r; + x2i = y13i - y15i; + x3r = y13r + y15r; + x3i = y13i + y15i; + a[24] = x0r + x2r; + a[25] = x0i + x2i; + a[26] = x0r - x2r; + a[27] = x0i - x2i; + a[28] = x1r - x3i; + a[29] = x1i + x3r; + a[30] = x1r + x3i; + a[31] = x1i - x3r; + x0r = y8r + y10r; + x0i = y8i + y10i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + x3r = y9r - y11r; + x3i = y9i - y11i; + a[16] = x0r + x2r; + a[17] = x0i + x2i; + a[18] = x0r - x2r; + a[19] = x0i - x2i; + a[20] = x1r - x3i; + a[21] = x1i + x3r; + a[22] = x1r + x3i; + a[23] = x1i - x3r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + x0r = y5r + y7i; + x0i = y5i - y7r; + x3r = wn4r * (x0r - x0i); + x3i = wn4r * (x0i + x0r); + x0r = y4r - y6i; + x0i = y4i + y6r; + x1r = y4r + y6i; + x1i = y4i - y6r; + a[8] = x0r + x2r; + a[9] = x0i + x2i; + a[10] = x0r - x2r; + a[11] = x0i - x2i; + a[12] = x1r - x3i; + a[13] = x1i + x3r; + a[14] = x1r + x3i; + a[15] = x1i - x3r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + x3r = y1r - y3r; + x3i = y1i - y3i; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x0r - x2r; + a[3] = x0i - x2i; + a[4] = x1r - x3i; + a[5] = x1i + x3r; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + + +void cftf162(double *a, double *w) +{ + double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, + x0r, x0i, x1r, x1i, x2r, x2i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, + y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, + y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i; + + wn4r = w[1]; + wk1r = w[4]; + wk1i = w[5]; + wk3r = w[6]; + wk3i = -w[7]; + 
wk2r = w[8]; + wk2i = w[9]; + x1r = a[0] - a[17]; + x1i = a[1] + a[16]; + x0r = a[8] - a[25]; + x0i = a[9] + a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y0r = x1r + x2r; + y0i = x1i + x2i; + y4r = x1r - x2r; + y4i = x1i - x2i; + x1r = a[0] + a[17]; + x1i = a[1] - a[16]; + x0r = a[8] + a[25]; + x0i = a[9] - a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y8r = x1r - x2i; + y8i = x1i + x2r; + y12r = x1r + x2i; + y12i = x1i - x2r; + x0r = a[2] - a[19]; + x0i = a[3] + a[18]; + x1r = wk1r * x0r - wk1i * x0i; + x1i = wk1r * x0i + wk1i * x0r; + x0r = a[10] - a[27]; + x0i = a[11] + a[26]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y1r = x1r + x2r; + y1i = x1i + x2i; + y5r = x1r - x2r; + y5i = x1i - x2i; + x0r = a[2] + a[19]; + x0i = a[3] - a[18]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[10] + a[27]; + x0i = a[11] - a[26]; + x2r = wk1r * x0r + wk1i * x0i; + x2i = wk1r * x0i - wk1i * x0r; + y9r = x1r - x2r; + y9i = x1i - x2i; + y13r = x1r + x2r; + y13i = x1i + x2i; + x0r = a[4] - a[21]; + x0i = a[5] + a[20]; + x1r = wk2r * x0r - wk2i * x0i; + x1i = wk2r * x0i + wk2i * x0r; + x0r = a[12] - a[29]; + x0i = a[13] + a[28]; + x2r = wk2i * x0r - wk2r * x0i; + x2i = wk2i * x0i + wk2r * x0r; + y2r = x1r + x2r; + y2i = x1i + x2i; + y6r = x1r - x2r; + y6i = x1i - x2i; + x0r = a[4] + a[21]; + x0i = a[5] - a[20]; + x1r = wk2i * x0r - wk2r * x0i; + x1i = wk2i * x0i + wk2r * x0r; + x0r = a[12] + a[29]; + x0i = a[13] - a[28]; + x2r = wk2r * x0r - wk2i * x0i; + x2i = wk2r * x0i + wk2i * x0r; + y10r = x1r - x2r; + y10i = x1i - x2i; + y14r = x1r + x2r; + y14i = x1i + x2i; + x0r = a[6] - a[23]; + x0i = a[7] + a[22]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[14] - a[31]; + x0i = a[15] + a[30]; + x2r = wk1i * x0r - wk1r * x0i; + x2i = wk1i * x0i + wk1r * x0r; + y3r = x1r + x2r; + y3i = x1i + x2i; + y7r = x1r - x2r; + y7i = x1i - x2i; + x0r = a[6] + a[23]; + x0i = a[7] - a[22]; + x1r = wk1i * x0r + wk1r * x0i; + x1i = wk1i * x0i - wk1r * x0r; + x0r = a[14] + a[31]; + x0i = a[15] - a[30]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y11r = x1r + x2r; + y11i = x1i + x2i; + y15r = x1r - x2r; + y15i = x1i - x2i; + x1r = y0r + y2r; + x1i = y0i + y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + a[0] = x1r + x2r; + a[1] = x1i + x2i; + a[2] = x1r - x2r; + a[3] = x1i - x2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r - y3r; + x2i = y1i - y3i; + a[4] = x1r - x2i; + a[5] = x1i + x2r; + a[6] = x1r + x2i; + a[7] = x1i - x2r; + x1r = y4r - y6i; + x1i = y4i + y6r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[8] = x1r + x2r; + a[9] = x1i + x2i; + a[10] = x1r - x2r; + a[11] = x1i - x2i; + x1r = y4r + y6i; + x1i = y4i - y6r; + x0r = y5r + y7i; + x0i = y5i - y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[12] = x1r - x2i; + a[13] = x1i + x2r; + a[14] = x1r + x2i; + a[15] = x1i - x2r; + x1r = y8r + y10r; + x1i = y8i + y10i; + x2r = y9r - y11r; + x2i = y9i - y11i; + a[16] = x1r + x2r; + a[17] = x1i + x2i; + a[18] = x1r - x2r; + a[19] = x1i - x2i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + a[20] = x1r - x2i; + a[21] = x1i + x2r; + a[22] = x1r + x2i; + a[23] = x1i - x2r; + x1r = y12r - y14i; + x1i = y12i + y14r; + x0r = y13r + y15i; + x0i = y13i - y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[24] = x1r + x2r; + a[25] = x1i + x2i; + a[26] = x1r - x2r; + a[27] = x1i 
- x2i; + x1r = y12r + y14i; + x1i = y12i - y14r; + x0r = y13r - y15i; + x0i = y13i + y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[28] = x1r - x2i; + a[29] = x1i + x2r; + a[30] = x1r + x2i; + a[31] = x1i - x2r; +} + + +void cftf081(double *a, double *w) +{ + double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + x0r = a[0] + a[8]; + x0i = a[1] + a[9]; + x1r = a[0] - a[8]; + x1i = a[1] - a[9]; + x2r = a[4] + a[12]; + x2i = a[5] + a[13]; + x3r = a[4] - a[12]; + x3i = a[5] - a[13]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y2r = x0r - x2r; + y2i = x0i - x2i; + y1r = x1r - x3i; + y1i = x1i + x3r; + y3r = x1r + x3i; + y3i = x1i - x3r; + x0r = a[2] + a[10]; + x0i = a[3] + a[11]; + x1r = a[2] - a[10]; + x1i = a[3] - a[11]; + x2r = a[6] + a[14]; + x2i = a[7] + a[15]; + x3r = a[6] - a[14]; + x3i = a[7] - a[15]; + y4r = x0r + x2r; + y4i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + x2r = x1r + x3i; + x2i = x1i - x3r; + y5r = wn4r * (x0r - x0i); + y5i = wn4r * (x0r + x0i); + y7r = wn4r * (x2r - x2i); + y7i = wn4r * (x2r + x2i); + a[8] = y1r + y5r; + a[9] = y1i + y5i; + a[10] = y1r - y5r; + a[11] = y1i - y5i; + a[12] = y3r - y7i; + a[13] = y3i + y7r; + a[14] = y3r + y7i; + a[15] = y3i - y7r; + a[0] = y0r + y4r; + a[1] = y0i + y4i; + a[2] = y0r - y4r; + a[3] = y0i - y4i; + a[4] = y2r - y6i; + a[5] = y2i + y6r; + a[6] = y2r + y6i; + a[7] = y2i - y6r; +} + + +void cftf082(double *a, double *w) +{ + double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + y0r = a[0] - a[9]; + y0i = a[1] + a[8]; + y1r = a[0] + a[9]; + y1i = a[1] - a[8]; + x0r = a[4] - a[13]; + x0i = a[5] + a[12]; + y2r = wn4r * (x0r - x0i); + y2i = wn4r * (x0i + x0r); + x0r = a[4] + a[13]; + x0i = a[5] - a[12]; + y3r = wn4r * (x0r - x0i); + y3i = wn4r * (x0i + x0r); + x0r = a[2] - a[11]; + x0i = a[3] + a[10]; + y4r = wk1r * x0r - wk1i * x0i; + y4i = wk1r * x0i + wk1i * x0r; + x0r = a[2] + a[11]; + x0i = a[3] - a[10]; + y5r = wk1i * x0r - wk1r * x0i; + y5i = wk1i * x0i + wk1r * x0r; + x0r = a[6] - a[15]; + x0i = a[7] + a[14]; + y6r = wk1i * x0r - wk1r * x0i; + y6i = wk1i * x0i + wk1r * x0r; + x0r = a[6] + a[15]; + x0i = a[7] - a[14]; + y7r = wk1r * x0r - wk1i * x0i; + y7i = wk1r * x0i + wk1i * x0r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y4r + y6r; + x1i = y4i + y6i; + a[0] = x0r + x1r; + a[1] = x0i + x1i; + a[2] = x0r - x1r; + a[3] = x0i - x1i; + x0r = y0r - y2r; + x0i = y0i - y2i; + x1r = y4r - y6r; + x1i = y4i - y6i; + a[4] = x0r - x1i; + a[5] = x0i + x1r; + a[6] = x0r + x1i; + a[7] = x0i - x1r; + x0r = y1r - y3i; + x0i = y1i + y3r; + x1r = y5r - y7r; + x1i = y5i - y7i; + a[8] = x0r + x1r; + a[9] = x0i + x1i; + a[10] = x0r - x1r; + a[11] = x0i - x1i; + x0r = y1r + y3i; + x0i = y1i - y3r; + x1r = y5r + y7r; + x1i = y5i + y7i; + a[12] = x0r - x1i; + a[13] = x0i + x1r; + a[14] = x0r + x1i; + a[15] = x0i - x1r; +} + + +void cftf040(double *a) +{ + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r - x3i; + a[3] = x1i + x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + + +void cftb040(double *a) +{ + 
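+    // Same 4-point butterfly as cftf040 above, but with the signs of the
+    // imaginary cross terms flipped, i.e. the backward-direction variant
+    // of the transform.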
double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r + x3i; + a[3] = x1i - x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r - x3i; + a[7] = x1i + x3r; +} + + +void cftx020(double *a) +{ + double x0r, x0i; + + x0r = a[0] - a[2]; + x0i = a[1] - a[3]; + a[0] += a[2]; + a[1] += a[3]; + a[2] = x0r; + a[3] = x0i; +} + + +void rftfsub(int n, double *a, int nc, double *c) +{ + int j, k, kk, ks, m; + double wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + + +void rftbsub(int n, double *a, int nc, double *c) +{ + int j, k, kk, ks, m; + double wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + + diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/log.cc b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/log.cc new file mode 100644 index 0000000..5b84115 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/log.cc @@ -0,0 +1,142 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Stack trace related stuff is from kaldi. + * Refer to + * https://github.com/kaldi-asr/kaldi/blob/master/src/base/kaldi-error.cc + */ + +#include "log.h" + +#ifdef KNF_HAVE_EXECINFO_H +#include // To get stack trace in error messages. +#ifdef KNF_HAVE_CXXABI_H +#include // For name demangling. +// Useful to decode the stack trace, but only used if we have execinfo.h +#endif // KNF_HAVE_CXXABI_H +#endif // KNF_HAVE_EXECINFO_H + +#include + +#include +#include +#include + +namespace knf { + +std::string GetDateTimeStr() { + std::ostringstream os; + std::time_t t = std::time(nullptr); + std::tm tm = *std::localtime(&t); + os << std::put_time(&tm, "%F %T"); // yyyy-mm-dd hh:mm:ss + return os.str(); +} + +static bool LocateSymbolRange(const std::string &trace_name, std::size_t *begin, + std::size_t *end) { + // Find the first '_' with leading ' ' or '('. 
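+  // For example, for a glibc-style backtrace entry such as
+  //   ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d]
+  // *begin ends up at the leading '_' of the mangled symbol and *end at the
+  // '+' that follows it, so the substring in between can be demangled.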
+ *begin = std::string::npos; + for (std::size_t i = 1; i < trace_name.size(); ++i) { + if (trace_name[i] != '_') { + continue; + } + if (trace_name[i - 1] == ' ' || trace_name[i - 1] == '(') { + *begin = i; + break; + } + } + if (*begin == std::string::npos) { + return false; + } + *end = trace_name.find_first_of(" +", *begin); + return *end != std::string::npos; +} + +#ifdef KNF_HAVE_EXECINFO_H +static std::string Demangle(const std::string &trace_name) { +#ifndef KNF_HAVE_CXXABI_H + return trace_name; +#else // KNF_HAVE_CXXABI_H + // Try demangle the symbol. We are trying to support the following formats + // produced by different platforms: + // + // Linux: + // ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d] + // + // Mac: + // 0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813 + // + // We want to extract the name e.g., '_ZN5kaldi13UnitTestErrorEv' and + // demangle it info a readable name like kaldi::UnitTextError. + std::size_t begin, end; + if (!LocateSymbolRange(trace_name, &begin, &end)) { + return trace_name; + } + std::string symbol = trace_name.substr(begin, end - begin); + int status; + char *demangled_name = abi::__cxa_demangle(symbol.c_str(), 0, 0, &status); + if (status == 0 && demangled_name != nullptr) { + symbol = demangled_name; + free(demangled_name); + } + return trace_name.substr(0, begin) + symbol + + trace_name.substr(end, std::string::npos); +#endif // KNF_HAVE_CXXABI_H +} +#endif // KNF_HAVE_EXECINFO_H + +std::string GetStackTrace() { + std::string ans; +#ifdef KNF_HAVE_EXECINFO_H + constexpr const std::size_t kMaxTraceSize = 50; + constexpr const std::size_t kMaxTracePrint = 50; // Must be even. + // Buffer for the trace. + void *trace[kMaxTraceSize]; + // Get the trace. + std::size_t size = backtrace(trace, kMaxTraceSize); + // Get the trace symbols. + char **trace_symbol = backtrace_symbols(trace, size); + if (trace_symbol == nullptr) return ans; + + // Compose a human-readable backtrace string. + ans += "[ Stack-Trace: ]\n"; + if (size <= kMaxTracePrint) { + for (std::size_t i = 0; i < size; ++i) { + ans += Demangle(trace_symbol[i]) + "\n"; + } + } else { // Print out first+last (e.g.) 5. + for (std::size_t i = 0; i < kMaxTracePrint / 2; ++i) { + ans += Demangle(trace_symbol[i]) + "\n"; + } + ans += ".\n.\n.\n"; + for (std::size_t i = size - kMaxTracePrint / 2; i < size; ++i) { + ans += Demangle(trace_symbol[i]) + "\n"; + } + if (size == kMaxTraceSize) + ans += ".\n.\n.\n"; // Stack was too long, probably a bug. + } + + // We must free the array of pointers allocated by backtrace_symbols(), + // but not the strings themselves. + free(trace_symbol); +#endif // KNF_HAVE_EXECINFO_H + return ans; +} + +} // namespace knf diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/log.h b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/log.h new file mode 100644 index 0000000..bd21cc3 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/log.h @@ -0,0 +1,383 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The content in this file is copied/modified from +// https://github.com/k2-fsa/k2/blob/master/k2/csrc/log.h +#ifndef KALDI_NATIVE_FBANK_CSRC_LOG_H_ +#define KALDI_NATIVE_FBANK_CSRC_LOG_H_ + +#include + +#include // NOLINT +#include +#include + +namespace knf { + +#if KNF_ENABLE_CHECK + +#if defined(NDEBUG) +constexpr bool kDisableDebug = true; +#else +constexpr bool kDisableDebug = false; +#endif + +enum class LogLevel { + kTrace = 0, + kDebug = 1, + kInfo = 2, + kWarning = 3, + kError = 4, + kFatal = 5, // print message and abort the program +}; + +// They are used in KNF_LOG(xxx), so their names +// do not follow the google c++ code style +// +// You can use them in the following way: +// +// KNF_LOG(TRACE) << "some message"; +// KNF_LOG(DEBUG) << "some message"; +#ifndef _MSC_VER +constexpr LogLevel TRACE = LogLevel::kTrace; +constexpr LogLevel DEBUG = LogLevel::kDebug; +constexpr LogLevel INFO = LogLevel::kInfo; +constexpr LogLevel WARNING = LogLevel::kWarning; +constexpr LogLevel ERROR = LogLevel::kError; +constexpr LogLevel FATAL = LogLevel::kFatal; +#else +#define TRACE LogLevel::kTrace +#define DEBUG LogLevel::kDebug +#define INFO LogLevel::kInfo +#define WARNING LogLevel::kWarning +#define ERROR LogLevel::kError +#define FATAL LogLevel::kFatal +#endif + +std::string GetStackTrace(); + +/* Return the current log level. + + + If the current log level is TRACE, then all logged messages are printed out. + + If the current log level is DEBUG, log messages with "TRACE" level are not + shown and all other levels are printed out. + + Similarly, if the current log level is INFO, log message with "TRACE" and + "DEBUG" are not shown and all other levels are printed out. + + If it is FATAL, then only FATAL messages are shown. 
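+
+   The level is read once from the KNF_LOG_LEVEL environment variable (see
+   the implementation below); for instance, running a program with
+
+     KNF_LOG_LEVEL=DEBUG ./some_program
+
+   makes KNF_LOG(DEBUG) and higher-severity messages visible while TRACE
+   messages stay hidden.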
+ */ +inline LogLevel GetCurrentLogLevel() { + static LogLevel log_level = INFO; + static std::once_flag init_flag; + std::call_once(init_flag, []() { + const char *env_log_level = std::getenv("KNF_LOG_LEVEL"); + if (env_log_level == nullptr) return; + + std::string s = env_log_level; + if (s == "TRACE") + log_level = TRACE; + else if (s == "DEBUG") + log_level = DEBUG; + else if (s == "INFO") + log_level = INFO; + else if (s == "WARNING") + log_level = WARNING; + else if (s == "ERROR") + log_level = ERROR; + else if (s == "FATAL") + log_level = FATAL; + else + fprintf(stderr, + "Unknown KNF_LOG_LEVEL: %s" + "\nSupported values are: " + "TRACE, DEBUG, INFO, WARNING, ERROR, FATAL", + s.c_str()); + }); + return log_level; +} + +inline bool EnableAbort() { + static std::once_flag init_flag; + static bool enable_abort = false; + std::call_once(init_flag, []() { + enable_abort = (std::getenv("KNF_ABORT") != nullptr); + }); + return enable_abort; +} + +class Logger { + public: + Logger(const char *filename, const char *func_name, uint32_t line_num, + LogLevel level) + : filename_(filename), + func_name_(func_name), + line_num_(line_num), + level_(level) { + cur_level_ = GetCurrentLogLevel(); + fprintf(stderr, "here\n"); + switch (level) { + case TRACE: + if (cur_level_ <= TRACE) fprintf(stderr, "[T] "); + break; + case DEBUG: + if (cur_level_ <= DEBUG) fprintf(stderr, "[D] "); + break; + case INFO: + if (cur_level_ <= INFO) fprintf(stderr, "[I] "); + break; + case WARNING: + if (cur_level_ <= WARNING) fprintf(stderr, "[W] "); + break; + case ERROR: + if (cur_level_ <= ERROR) fprintf(stderr, "[E] "); + break; + case FATAL: + if (cur_level_ <= FATAL) fprintf(stderr, "[F] "); + break; + } + + if (cur_level_ <= level_) { + fprintf(stderr, "%s:%u:%s ", filename, line_num, func_name); + } + } + + ~Logger() noexcept(false) { + static constexpr const char *kErrMsg = R"( + Some bad things happened. Please read the above error messages and stack + trace. If you are using Python, the following command may be helpful: + + gdb --args python /path/to/your/code.py + + (You can use `gdb` to debug the code. Please consider compiling + a debug version of KNF.). + + If you are unable to fix it, please open an issue at: + + https://github.com/csukuangfj/kaldi-native-fbank/issues/new + )"; + fprintf(stderr, "\n"); + if (level_ == FATAL) { + std::string stack_trace = GetStackTrace(); + if (!stack_trace.empty()) { + fprintf(stderr, "\n\n%s\n", stack_trace.c_str()); + } + + fflush(nullptr); + +#ifndef __ANDROID_API__ + if (EnableAbort()) { + // NOTE: abort() will terminate the program immediately without + // printing the Python stack backtrace. + abort(); + } + + throw std::runtime_error(kErrMsg); +#else + abort(); +#endif + } + } + + const Logger &operator<<(bool b) const { + if (cur_level_ <= level_) { + fprintf(stderr, b ? 
"true" : "false"); + } + return *this; + } + + const Logger &operator<<(int8_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%d", i); + return *this; + } + + const Logger &operator<<(const char *s) const { + if (cur_level_ <= level_) fprintf(stderr, "%s", s); + return *this; + } + + const Logger &operator<<(int32_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%d", i); + return *this; + } + + const Logger &operator<<(uint32_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%u", i); + return *this; + } + + const Logger &operator<<(uint64_t i) const { + if (cur_level_ <= level_) + fprintf(stderr, "%llu", (long long unsigned int)i); // NOLINT + return *this; + } + + const Logger &operator<<(int64_t i) const { + if (cur_level_ <= level_) + fprintf(stderr, "%lli", (long long int)i); // NOLINT + return *this; + } + + const Logger &operator<<(float f) const { + if (cur_level_ <= level_) fprintf(stderr, "%f", f); + return *this; + } + + const Logger &operator<<(double d) const { + if (cur_level_ <= level_) fprintf(stderr, "%f", d); + return *this; + } + + template + const Logger &operator<<(const T &t) const { + // require T overloads operator<< + std::ostringstream os; + os << t; + return *this << os.str().c_str(); + } + + // specialization to fix compile error: `stringstream << nullptr` is ambiguous + const Logger &operator<<(const std::nullptr_t &null) const { + if (cur_level_ <= level_) *this << "(null)"; + return *this; + } + + private: + const char *filename_; + const char *func_name_; + uint32_t line_num_; + LogLevel level_; + LogLevel cur_level_; +}; +#endif // KNF_ENABLE_CHECK + +class Voidifier { + public: +#if KNF_ENABLE_CHECK + void operator&(const Logger &) const {} +#endif +}; +#if !defined(KNF_ENABLE_CHECK) +template +const Voidifier &operator<<(const Voidifier &v, T &&) { + return v; +} +#endif + +} // namespace knf + +#define KNF_STATIC_ASSERT(x) static_assert(x, "") + +#ifdef KNF_ENABLE_CHECK + +#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) || \ + defined(__PRETTY_FUNCTION__) +// for clang and GCC +#define KNF_FUNC __PRETTY_FUNCTION__ +#else +// for other compilers +#define KNF_FUNC __func__ +#endif + +#define KNF_CHECK(x) \ + (x) ? (void)0 \ + : ::knf::Voidifier() & \ + ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \ + << "Check failed: " << #x << " " + +// WARNING: x and y may be evaluated multiple times, but this happens only +// when the check fails. Since the program aborts if it fails, we don't think +// the extra evaluation of x and y matters. +// +// CAUTION: we recommend the following use case: +// +// auto x = Foo(); +// auto y = Bar(); +// KNF_CHECK_EQ(x, y) << "Some message"; +// +// And please avoid +// +// KNF_CHECK_EQ(Foo(), Bar()); +// +// if `Foo()` or `Bar()` causes some side effects, e.g., changing some +// local static variables or global variables. +#define _KNF_CHECK_OP(x, y, op) \ + ((x)op(y)) ? (void)0 \ + : ::knf::Voidifier() & \ + ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \ + << "Check failed: " << #x << " " << #op << " " << #y \ + << " (" << (x) << " vs. 
" << (y) << ") " + +#define KNF_CHECK_EQ(x, y) _KNF_CHECK_OP(x, y, ==) +#define KNF_CHECK_NE(x, y) _KNF_CHECK_OP(x, y, !=) +#define KNF_CHECK_LT(x, y) _KNF_CHECK_OP(x, y, <) +#define KNF_CHECK_LE(x, y) _KNF_CHECK_OP(x, y, <=) +#define KNF_CHECK_GT(x, y) _KNF_CHECK_OP(x, y, >) +#define KNF_CHECK_GE(x, y) _KNF_CHECK_OP(x, y, >=) + +#define KNF_LOG(x) ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::x) + +// ------------------------------------------------------------ +// For debug check +// ------------------------------------------------------------ +// If you define the macro "-D NDEBUG" while compiling kaldi-native-fbank, +// the following macros are in fact empty and does nothing. + +#define KNF_DCHECK(x) ::knf::kDisableDebug ? (void)0 : KNF_CHECK(x) + +#define KNF_DCHECK_EQ(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_EQ(x, y) + +#define KNF_DCHECK_NE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_NE(x, y) + +#define KNF_DCHECK_LT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LT(x, y) + +#define KNF_DCHECK_LE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LE(x, y) + +#define KNF_DCHECK_GT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GT(x, y) + +#define KNF_DCHECK_GE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GE(x, y) + +#define KNF_DLOG(x) \ + ::knf::kDisableDebug ? (void)0 : ::knf::Voidifier() & KNF_LOG(x) + +#else + +#define KNF_CHECK(x) ::knf::Voidifier() +#define KNF_LOG(x) ::knf::Voidifier() + +#define KNF_CHECK_EQ(x, y) ::knf::Voidifier() +#define KNF_CHECK_NE(x, y) ::knf::Voidifier() +#define KNF_CHECK_LT(x, y) ::knf::Voidifier() +#define KNF_CHECK_LE(x, y) ::knf::Voidifier() +#define KNF_CHECK_GT(x, y) ::knf::Voidifier() +#define KNF_CHECK_GE(x, y) ::knf::Voidifier() + +#define KNF_DCHECK(x) ::knf::Voidifier() +#define KNF_DLOG(x) ::knf::Voidifier() +#define KNF_DCHECK_EQ(x, y) ::knf::Voidifier() +#define KNF_DCHECK_NE(x, y) ::knf::Voidifier() +#define KNF_DCHECK_LT(x, y) ::knf::Voidifier() +#define KNF_DCHECK_LE(x, y) ::knf::Voidifier() +#define KNF_DCHECK_GT(x, y) ::knf::Voidifier() +#define KNF_DCHECK_GE(x, y) ::knf::Voidifier() + +#endif // KNF_CHECK_NE + +#endif // KALDI_NATIVE_FBANK_CSRC_LOG_H_ diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/mel-computations.cc b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/mel-computations.cc new file mode 100644 index 0000000..5455c03 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/mel-computations.cc @@ -0,0 +1,257 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// This file is copied/modified from kaldi/src/feat/mel-computations.cc + +#include "mel-computations.h" + +#include +#include +#include + +#include "feature-window.h" + +namespace knf { + +std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts) { + os << opts.ToString(); + return os; +} + +float MelBanks::VtlnWarpFreq( + float vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. + float vtln_high_cutoff, + float low_freq, // upper+lower frequency cutoffs in mel computation + float high_freq, float vtln_warp_factor, float freq) { + /// This computes a VTLN warping function that is not the same as HTK's one, + /// but has similar inputs (this function has the advantage of never producing + /// empty bins). + + /// This function computes a warp function F(freq), defined between low_freq + /// and high_freq inclusive, with the following properties: + /// F(low_freq) == low_freq + /// F(high_freq) == high_freq + /// The function is continuous and piecewise linear with two inflection + /// points. + /// The lower inflection point (measured in terms of the unwarped + /// frequency) is at frequency l, determined as described below. + /// The higher inflection point is at a frequency h, determined as + /// described below. + /// If l <= f <= h, then F(f) = f/vtln_warp_factor. + /// If the higher inflection point (measured in terms of the unwarped + /// frequency) is at h, then max(h, F(h)) == vtln_high_cutoff. + /// Since (by the last point) F(h) == h/vtln_warp_factor, then + /// max(h, h/vtln_warp_factor) == vtln_high_cutoff, so + /// h = vtln_high_cutoff / max(1, 1/vtln_warp_factor). + /// = vtln_high_cutoff * min(1, vtln_warp_factor). + /// If the lower inflection point (measured in terms of the unwarped + /// frequency) is at l, then min(l, F(l)) == vtln_low_cutoff + /// This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor) + /// = vtln_low_cutoff * max(1, vtln_warp_factor) + + if (freq < low_freq || freq > high_freq) + return freq; // in case this gets called + // for out-of-range frequencies, just return the freq. + + KNF_CHECK_GT(vtln_low_cutoff, low_freq); + KNF_CHECK_LT(vtln_high_cutoff, high_freq); + + float one = 1.0f; + float l = vtln_low_cutoff * std::max(one, vtln_warp_factor); + float h = vtln_high_cutoff * std::min(one, vtln_warp_factor); + float scale = 1.0f / vtln_warp_factor; + float Fl = scale * l; // F(l); + float Fh = scale * h; // F(h); + KNF_CHECK(l > low_freq && h < high_freq); + // slope of left part of the 3-piece linear function + float scale_left = (Fl - low_freq) / (l - low_freq); + // [slope of center part is just "scale"] + + // slope of right part of the 3-piece linear function + float scale_right = (high_freq - Fh) / (high_freq - h); + + if (freq < l) { + return low_freq + scale_left * (freq - low_freq); + } else if (freq < h) { + return scale * freq; + } else { // freq >= h + return high_freq + scale_right * (freq - high_freq); + } +} + +float MelBanks::VtlnWarpMelFreq( + float vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. 
+ float vtln_high_cutoff, + float low_freq, // upper+lower frequency cutoffs in mel computation + float high_freq, float vtln_warp_factor, float mel_freq) { + return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, low_freq, + high_freq, vtln_warp_factor, + InverseMelScale(mel_freq))); +} + +MelBanks::MelBanks(const MelBanksOptions &opts, + const FrameExtractionOptions &frame_opts, + float vtln_warp_factor) + : htk_mode_(opts.htk_mode) { + int32_t num_bins = opts.num_bins; + if (num_bins < 3) KNF_LOG(FATAL) << "Must have at least 3 mel bins"; + + float sample_freq = frame_opts.samp_freq; + int32_t window_length_padded = frame_opts.PaddedWindowSize(); + KNF_CHECK_EQ(window_length_padded % 2, 0); + + int32_t num_fft_bins = window_length_padded / 2; + float nyquist = 0.5f * sample_freq; + + float low_freq = opts.low_freq, high_freq; + if (opts.high_freq > 0.0f) + high_freq = opts.high_freq; + else + high_freq = nyquist + opts.high_freq; + + if (low_freq < 0.0f || low_freq >= nyquist || high_freq <= 0.0f || + high_freq > nyquist || high_freq <= low_freq) { + KNF_LOG(FATAL) << "Bad values in options: low-freq " << low_freq + << " and high-freq " << high_freq << " vs. nyquist " + << nyquist; + } + + float fft_bin_width = sample_freq / window_length_padded; + // fft-bin width [think of it as Nyquist-freq / half-window-length] + + float mel_low_freq = MelScale(low_freq); + float mel_high_freq = MelScale(high_freq); + + debug_ = opts.debug_mel; + + // divide by num_bins+1 in next line because of end-effects where the bins + // spread out to the sides. + float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1); + + float vtln_low = opts.vtln_low, vtln_high = opts.vtln_high; + if (vtln_high < 0.0f) { + vtln_high += nyquist; + } + + if (vtln_warp_factor != 1.0f && + (vtln_low < 0.0f || vtln_low <= low_freq || vtln_low >= high_freq || + vtln_high <= 0.0f || vtln_high >= high_freq || vtln_high <= vtln_low)) { + KNF_LOG(FATAL) << "Bad values in options: vtln-low " << vtln_low + << " and vtln-high " << vtln_high << ", versus " + << "low-freq " << low_freq << " and high-freq " << high_freq; + } + + bins_.resize(num_bins); + center_freqs_.resize(num_bins); + + for (int32_t bin = 0; bin < num_bins; ++bin) { + float left_mel = mel_low_freq + bin * mel_freq_delta, + center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, + right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; + + if (vtln_warp_factor != 1.0f) { + left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, left_mel); + center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, center_mel); + right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, right_mel); + } + center_freqs_[bin] = InverseMelScale(center_mel); + + // this_bin will be a vector of coefficients that is only + // nonzero where this mel bin is active. + std::vector this_bin(num_fft_bins); + + int32_t first_index = -1, last_index = -1; + for (int32_t i = 0; i < num_fft_bins; ++i) { + float freq = (fft_bin_width * i); // Center frequency of this fft + // bin. 
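+      // Illustrative numbers only: with left_mel = 300, center_mel = 400 and
+      // right_mel = 500, a bin whose mel value is 350 gets weight
+      // (350 - 300) / (400 - 300) = 0.5 and a bin at mel 450 gets
+      // (500 - 450) / (500 - 400) = 0.5 -- the standard triangular mel filter.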
+ float mel = MelScale(freq); + if (mel > left_mel && mel < right_mel) { + float weight; + if (mel <= center_mel) + weight = (mel - left_mel) / (center_mel - left_mel); + else + weight = (right_mel - mel) / (right_mel - center_mel); + this_bin[i] = weight; + if (first_index == -1) first_index = i; + last_index = i; + } + } + KNF_CHECK(first_index != -1 && last_index >= first_index && + "You may have set num_mel_bins too large."); + + bins_[bin].first = first_index; + int32_t size = last_index + 1 - first_index; + bins_[bin].second.insert(bins_[bin].second.end(), + this_bin.begin() + first_index, + this_bin.begin() + first_index + size); + + // Replicate a bug in HTK, for testing purposes. + if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0f) { + bins_[bin].second[0] = 0.0; + } + } // for (int32_t bin = 0; bin < num_bins; ++bin) { + + if (debug_) { + std::ostringstream os; + for (size_t i = 0; i < bins_.size(); i++) { + os << "bin " << i << ", offset = " << bins_[i].first << ", vec = "; + for (auto k : bins_[i].second) os << k << ", "; + os << "\n"; + } + KNF_LOG(INFO) << os.str(); + } +} + +// "power_spectrum" contains fft energies. +void MelBanks::Compute(const float *power_spectrum, + float *mel_energies_out) const { + int32_t num_bins = bins_.size(); + + for (int32_t i = 0; i < num_bins; i++) { + int32_t offset = bins_[i].first; + const auto &v = bins_[i].second; + float energy = 0; + for (int32_t k = 0; k != v.size(); ++k) { + energy += v[k] * power_spectrum[k + offset]; + } + + // HTK-like flooring- for testing purposes (we prefer dither) + if (htk_mode_ && energy < 1.0) { + energy = 1.0; + } + + mel_energies_out[i] = energy; + + // The following assert was added due to a problem with OpenBlas that + // we had at one point (it was a bug in that library). Just to detect + // it early. + KNF_CHECK_EQ(energy, energy); // check that energy is not nan + } + + if (debug_) { + fprintf(stderr, "MEL BANKS:\n"); + for (int32_t i = 0; i < num_bins; i++) + fprintf(stderr, " %f", mel_energies_out[i]); + fprintf(stderr, "\n"); + } +} + +} // namespace knf diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/mel-computations.h b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/mel-computations.h new file mode 100644 index 0000000..6e66715 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/mel-computations.h @@ -0,0 +1,117 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// This file is copied/modified from kaldi/src/feat/mel-computations.h +#ifndef KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_ +#define KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_ + +#include +#include +#include +#include + +#include "feature-window.h" + +namespace knf { + +struct MelBanksOptions { + int32_t num_bins = 25; // e.g. 25; number of triangular bins + float low_freq = 20; // e.g. 
20; lower frequency cutoff + + // an upper frequency cutoff; 0 -> no cutoff, negative + // ->added to the Nyquist frequency to get the cutoff. + float high_freq = 0; + + float vtln_low = 100; // vtln lower cutoff of warping function. + + // vtln upper cutoff of warping function: if negative, added + // to the Nyquist frequency to get the cutoff. + float vtln_high = -500; + + bool debug_mel = false; + // htk_mode is a "hidden" config, it does not show up on command line. + // Enables more exact compatibility with HTK, for testing purposes. Affects + // mel-energy flooring and reproduces a bug in HTK. + bool htk_mode = false; + + std::string ToString() const { + std::ostringstream os; + os << "num_bins: " << num_bins << "\n"; + os << "low_freq: " << low_freq << "\n"; + os << "high_freq: " << high_freq << "\n"; + os << "vtln_low: " << vtln_low << "\n"; + os << "vtln_high: " << vtln_high << "\n"; + os << "debug_mel: " << debug_mel << "\n"; + os << "htk_mode: " << htk_mode << "\n"; + return os.str(); + } +}; + +std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts); + +class MelBanks { + public: + static inline float InverseMelScale(float mel_freq) { + return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f); + } + + static inline float MelScale(float freq) { + return 1127.0f * logf(1.0f + freq / 700.0f); + } + + static float VtlnWarpFreq( + float vtln_low_cutoff, + float vtln_high_cutoff, // discontinuities in warp func + float low_freq, + float high_freq, // upper+lower frequency cutoffs in + // the mel computation + float vtln_warp_factor, float freq); + + static float VtlnWarpMelFreq(float vtln_low_cutoff, float vtln_high_cutoff, + float low_freq, float high_freq, + float vtln_warp_factor, float mel_freq); + + // TODO(fangjun): Remove vtln_warp_factor + MelBanks(const MelBanksOptions &opts, + const FrameExtractionOptions &frame_opts, float vtln_warp_factor); + + /// Compute Mel energies (note: not log energies). + /// At input, "fft_energies" contains the FFT energies (not log). + /// + /// @param fft_energies 1-D array of size num_fft_bins/2+1 + /// @param mel_energies_out 1-D array of size num_mel_bins + void Compute(const float *fft_energies, float *mel_energies_out) const; + + int32_t NumBins() const { return bins_.size(); } + + private: + // center frequencies of bins, numbered from 0 ... num_bins-1. + // Needed by GetCenterFreqs(). + std::vector center_freqs_; + + // the "bins_" vector is a vector, one for each bin, of a pair: + // (the first nonzero fft-bin), (the vector of weights). + std::vector>> bins_; + + // TODO(fangjun): Remove debug_ and htk_mode_ + bool debug_; + bool htk_mode_; +}; + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_ diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/online-feature.cc b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/online-feature.cc new file mode 100644 index 0000000..92826c6 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/online-feature.cc @@ -0,0 +1,166 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The content in this file is copied/modified from +// This file is copied/modified from kaldi/src/feat/online-feature.cc + +#include "online-feature.h" + +#include +#include +#include + +#include "feature-window.h" +#include "log.h" + +namespace knf { + +RecyclingVector::RecyclingVector(int32_t items_to_hold) + : items_to_hold_(items_to_hold == 0 ? -1 : items_to_hold), + first_available_index_(0) {} + +const float *RecyclingVector::At(int32_t index) const { + if (index < first_available_index_) { + KNF_LOG(FATAL) << "Attempted to retrieve feature vector that was " + "already removed by the RecyclingVector (index = " + << index << "; " + << "first_available_index = " << first_available_index_ + << "; " + << "size = " << Size() << ")"; + } + // 'at' does size checking. + return items_.at(index - first_available_index_).data(); +} + +void RecyclingVector::PushBack(std::vector item) { + // Note: -1 is a larger number when treated as unsigned + if (items_.size() == static_cast(items_to_hold_)) { + items_.pop_front(); + ++first_available_index_; + } + items_.push_back(std::move(item)); +} + +int32_t RecyclingVector::Size() const { + return first_available_index_ + static_cast(items_.size()); +} + +// discard the first n frames +void RecyclingVector::Pop(int32_t n) { + for (int32_t i = 0; i < n && !items_.empty(); ++i) { + items_.pop_front(); + ++first_available_index_; + } +} + +template +OnlineGenericBaseFeature::OnlineGenericBaseFeature( + const typename C::Options &opts) + : computer_(opts), + window_function_(computer_.GetFrameOptions()), + input_finished_(false), + waveform_offset_(0) {} + +template +void OnlineGenericBaseFeature::AcceptWaveform(float sampling_rate, + const float *waveform, + int32_t n) { + if (n == 0) { + return; // Nothing to do. + } + + if (input_finished_) { + KNF_LOG(FATAL) << "AcceptWaveform called after InputFinished() was called."; + } + + KNF_CHECK_EQ(sampling_rate, computer_.GetFrameOptions().samp_freq); + + waveform_remainder_.insert(waveform_remainder_.end(), waveform, waveform + n); + + ComputeFeatures(); +} + +template +void OnlineGenericBaseFeature::InputFinished() { + input_finished_ = true; + ComputeFeatures(); +} + +template +void OnlineGenericBaseFeature::ComputeFeatures() { + const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions(); + + int64_t num_samples_total = waveform_offset_ + waveform_remainder_.size(); + + int32_t num_frames_old = features_.Size(); + + int32_t num_frames_new = + NumFrames(num_samples_total, frame_opts, input_finished_); + + KNF_CHECK_GE(num_frames_new, num_frames_old); + + // note: this online feature-extraction code does not support VTLN. + float vtln_warp = 1.0; + + std::vector window; + bool need_raw_log_energy = computer_.NeedRawLogEnergy(); + + for (int32_t frame = num_frames_old; frame < num_frames_new; ++frame) { + std::fill(window.begin(), window.end(), 0); + float raw_log_energy = 0.0; + ExtractWindow(waveform_offset_, waveform_remainder_.data(), waveform_remainder_.size(), + frame, frame_opts, window_function_, &window, + need_raw_log_energy ? 
&raw_log_energy : nullptr); + + std::vector this_feature(computer_.Dim()); + + computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature.data()); + features_.PushBack(std::move(this_feature)); + } + + // OK, we will now discard any portion of the signal that will not be + // necessary to compute frames in the future. + int64_t first_sample_of_next_frame = + FirstSampleOfFrame(num_frames_new, frame_opts); + + int32_t samples_to_discard = first_sample_of_next_frame - waveform_offset_; + + if (samples_to_discard > 0) { + // discard the leftmost part of the waveform that we no longer need. + int32_t new_num_samples = + static_cast(waveform_remainder_.size()) - samples_to_discard; + + if (new_num_samples <= 0) { + // odd, but we'll try to handle it. + waveform_offset_ += waveform_remainder_.size(); + waveform_remainder_.resize(0); + } else { + std::vector new_remainder(new_num_samples); + + std::copy(waveform_remainder_.begin() + samples_to_discard, + waveform_remainder_.end(), new_remainder.begin()); + waveform_offset_ += samples_to_discard; + + waveform_remainder_.swap(new_remainder); + } + } +} + +template class OnlineGenericBaseFeature; + +} // namespace knf diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/online-feature.h b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/online-feature.h new file mode 100644 index 0000000..e281fe1 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/online-feature.h @@ -0,0 +1,148 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The content in this file is copied/modified from +// This file is copied/modified from kaldi/src/feat/online-feature.h +#ifndef KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_ +#define KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_ + +#include +#include +#include + +#include "feature-fbank.h" + +namespace knf { + +/// This class serves as a storage for feature vectors with an option to limit +/// the memory usage by removing old elements. The deleted frames indices are +/// "remembered" so that regardless of the MAX_ITEMS setting, the user always +/// provides the indices as if no deletion was being performed. +/// This is useful when processing very long recordings which would otherwise +/// cause the memory to eventually blow up when the features are not being +/// removed. +class RecyclingVector { + public: + /// By default it does not remove any elements. + explicit RecyclingVector(int32_t items_to_hold = -1); + + ~RecyclingVector() = default; + RecyclingVector(const RecyclingVector &) = delete; + RecyclingVector &operator=(const RecyclingVector &) = delete; + + // The pointer is owned by RecyclingVector + // Users should not free it + const float *At(int32_t index) const; + + void PushBack(std::vector item); + + /// This method returns the size as if no "recycling" had happened, + /// i.e. 
equivalent to the number of times the PushBack method has been + /// called. + int32_t Size() const; + + // discard the first n frames + void Pop(int32_t n); + + private: + std::deque> items_; + int32_t items_to_hold_; + int32_t first_available_index_; +}; + +/// This is a templated class for online feature extraction; +/// it's templated on a class like MfccComputer or PlpComputer +/// that does the basic feature extraction. +template +class OnlineGenericBaseFeature { + public: + // Constructor from options class + explicit OnlineGenericBaseFeature(const typename C::Options &opts); + + int32_t Dim() const { return computer_.Dim(); } + + float FrameShiftInSeconds() const { + return computer_.GetFrameOptions().frame_shift_ms / 1000.0f; + } + + int32_t NumFramesReady() const { return features_.Size(); } + + // Note: IsLastFrame() will only ever return true if you have called + // InputFinished() (and this frame is the last frame). + bool IsLastFrame(int32_t frame) const { + return input_finished_ && frame == NumFramesReady() - 1; + } + + const float *GetFrame(int32_t frame) const { return features_.At(frame); } + + // This would be called from the application, when you get + // more wave data. Note: the sampling_rate is only provided so + // the code can assert that it matches the sampling rate + // expected in the options. + // + // @param sampling_rate The sampling_rate of the input waveform + // @param waveform Pointer to a 1-D array of size n + // @param n Number of entries in waveform + void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n); + + // InputFinished() tells the class you won't be providing any + // more waveform. This will help flush out the last frame or two + // of features, in the case where snip-edges == false; it also + // affects the return value of IsLastFrame(). + void InputFinished(); + + // discard the first n frames + void Pop(int32_t n) { features_.Pop(n); } + + private: + // This function computes any additional feature frames that it is possible to + // compute from 'waveform_remainder_', which at this point may contain more + // than just a remainder-sized quantity (because AcceptWaveform() appends to + // waveform_remainder_ before calling this function). It adds these feature + // frames to features_, and shifts off any now-unneeded samples of input from + // waveform_remainder_ while incrementing waveform_offset_ by the same amount. + void ComputeFeatures(); + + C computer_; // class that does the MFCC or PLP or filterbank computation + + FeatureWindowFunction window_function_; + + // features_ is the Mfcc or Plp or Fbank features that we have already + // computed. + + RecyclingVector features_; + + // True if the user has called "InputFinished()" + bool input_finished_; + + // waveform_offset_ is the number of samples of waveform that we have + // already discarded, i.e. that were prior to 'waveform_remainder_'. + int64_t waveform_offset_; + + // waveform_remainder_ is a short piece of waveform that we may need to keep + // after extracting all the whole frames we can (whatever length of feature + // will be required for the next phase of computation). 
+ // It is a 1-D tensor + std::vector waveform_remainder_; +}; + +using OnlineFbank = OnlineGenericBaseFeature; + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_ diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/rfft.cc b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/rfft.cc new file mode 100644 index 0000000..2ada947 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/rfft.cc @@ -0,0 +1,67 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "rfft.h" + +#include +#include +#include + +#include "log.h" + +// see fftsg.c +#ifdef __cplusplus +extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w); +#else +void rdft(int n, int isgn, double *a, int *ip, double *w); +#endif + +namespace knf { +class Rfft::RfftImpl { + public: + explicit RfftImpl(int32_t n) : n_(n), ip_(2 + std::sqrt(n / 2)), w_(n / 2) { + KNF_CHECK_EQ(n & (n - 1), 0); + } + + void Compute(float *in_out) { + std::vector d(in_out, in_out + n_); + + Compute(d.data()); + + std::copy(d.begin(), d.end(), in_out); + } + + void Compute(double *in_out) { + // 1 means forward fft + rdft(n_, 1, in_out, ip_.data(), w_.data()); + } + + private: + int32_t n_; + std::vector ip_; + std::vector w_; +}; + +Rfft::Rfft(int32_t n) : impl_(std::make_unique(n)) {} + +Rfft::~Rfft() = default; + +void Rfft::Compute(float *in_out) { impl_->Compute(in_out); } +void Rfft::Compute(double *in_out) { impl_->Compute(in_out); } + +} // namespace knf diff --git a/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/rfft.h b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/rfft.h new file mode 100644 index 0000000..c8cb9f8 --- /dev/null +++ b/seamless_communication/ggml/examples/kaldi-native-fbank/csrc/rfft.h @@ -0,0 +1,56 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef KALDI_NATIVE_FBANK_CSRC_RFFT_H_ +#define KALDI_NATIVE_FBANK_CSRC_RFFT_H_ + +#include + +namespace knf { + +// n-point Real discrete Fourier transform +// where n is a power of 2. 
n >= 2 +// +// R[k] = sum_j=0^n-1 in[j]*cos(2*pi*j*k/n), 0<=k<=n/2 +// I[k] = sum_j=0^n-1 in[j]*sin(2*pi*j*k/n), 0 impl_; +}; + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_RFFT_H_ diff --git a/seamless_communication/ggml/examples/python/README.md b/seamless_communication/ggml/examples/python/README.md new file mode 100644 index 0000000..480920f --- /dev/null +++ b/seamless_communication/ggml/examples/python/README.md @@ -0,0 +1,115 @@ +# Simple autogenerated Python bindings for ggml + +This folder contains: + +- Scripts to generate full Python bindings from ggml headers (+ stubs for autocompletion in IDEs) +- Some barebones utils (see [ggml/utils.py](./ggml/utils.py)): + - `ggml.utils.init` builds a context that's freed automatically when the pointer gets GC'd + - `ggml.utils.copy` **copies between same-shaped tensors (numpy or ggml), w/ automatic (de/re)quantization** + - `ggml.utils.numpy` returns a numpy view over a ggml tensor; if it's quantized, it returns a copy (requires `allow_copy=True`) +- Very basic examples (anyone wants to port [llama2.c](https://github.com/karpathy/llama2.c)?) + +Provided you set `GGML_LIBRARY=.../path/to/libggml_shared.so` (see instructions below), it's trivial to do some operations on quantized tensors: + +```python +# Make sure libllama.so is in your [DY]LD_LIBRARY_PATH, or set GGML_LIBRARY=.../libggml_shared.so + +from ggml import lib, ffi +from ggml.utils import init, copy, numpy +import numpy as np + +ctx = init(mem_size=12*1024*1024) +n = 256 +n_threads = 4 + +a = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n) +b = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n) # Can't both be quantized +sum = lib.ggml_add(ctx, a, b) # all zeroes for now. Will be quantized too! + +gf = ffi.new('struct ggml_cgraph*') +lib.ggml_build_forward_expand(gf, sum) + +copy(np.array([i for i in range(n)], np.float32), a) +copy(np.array([i*100 for i in range(n)], np.float32), b) + +lib.ggml_graph_compute_with_ctx(ctx, gf, n_threads) + +print(numpy(a, allow_copy=True)) +# 0. 1.0439453 2.0878906 3.131836 4.1757812 5.2197266. ... +print(numpy(b)) +# 0. 100. 200. 300. 400. 500. ... +print(numpy(sum, allow_copy=True)) +# 0. 105.4375 210.875 316.3125 421.75 527.1875 ... +``` + +### Prerequisites + +You'll need a shared library of ggml to use the bindings. + +#### Build libggml_shared.so or libllama.so + +As of this writing the best is to use [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp)'s generated `libggml_shared.so` or `libllama.so`, which you can build as follows: + +```bash +git clone https://github.com/ggerganov/llama.cpp +# On a CUDA-enabled system add -DLLAMA_CUBLAS=1 +# On a Mac add -DLLAMA_METAL=1 +cmake llama.cpp \ + -B llama_build \ + -DCMAKE_C_FLAGS=-Ofast \ + -DLLAMA_NATIVE=1 \ + -DLLAMA_LTO=1 \ + -DBUILD_SHARED_LIBS=1 \ + -DLLAMA_MPI=1 \ + -DLLAMA_BUILD_TESTS=0 \ + -DLLAMA_BUILD_EXAMPLES=0 +( cd llama_build && make -j ) + +# On Mac, this will be libggml_shared.dylib instead +export GGML_LIBRARY=$PWD/llama_build/libggml_shared.so +# Alternatively, you can just copy it to your system's lib dir, e.g /usr/local/lib +``` + +#### (Optional) Regenerate the bindings and stubs + +If you added or changed any signatures of the C API, you'll want to regenerate the bindings ([ggml/cffi.py](./ggml/cffi.py)) and stubs ([ggml/__init__.pyi](./ggml/__init__.pyi)). 
+ +Luckily it's a one-liner using [regenerate.py](./regenerate.py): + +```bash +pip install -q cffi + +python regenerate.py +``` + +By default it assumes `llama.cpp` was cloned in ../../../llama.cpp (alongside the ggml folder). You can override this with: + +```bash +C_INCLUDE_DIR=$LLAMA_CPP_DIR python regenerate.py +``` + +You can also edit [api.h](./api.h) to control which files should be included in the generated bindings (defaults to `llama.cpp/ggml*.h`) + +In fact, if you wanted to only generate bindings for the current version of the `ggml` repo itself (instead of `llama.cpp`; you'd loose support for k-quants), you could run: + +```bash +API=../../include/ggml/ggml.h python regenerate.py +``` + +## Develop + +Run tests: + +```bash +pytest +``` + +### Alternatives + +This example's goal is to showcase [cffi](https://cffi.readthedocs.io/)-generated bindings that are trivial to use and update, but there are already alternatives in the wild: + +- https://github.com/abetlen/ggml-python: these bindings seem to be hand-written and use [ctypes](https://docs.python.org/3/library/ctypes.html). It has [high-quality API reference docs](https://ggml-python.readthedocs.io/en/latest/api-reference/#ggml.ggml) that can be used with these bindings too, but it doesn't expose Metal, CUDA, MPI or OpenCL calls, doesn't support transparent (de/re)quantization like this example does (see [ggml.utils](./ggml/utils.py) module), and won't pick up your local changes. + +- https://github.com/abetlen/llama-cpp-python: these expose the C++ `llama.cpp` interface, which this example cannot easily be extended to support (`cffi` only generates bindings of C libraries) + +- [pybind11](https://github.com/pybind/pybind11) and [nanobind](https://github.com/wjakob/nanobind) are two alternatives to cffi that support binding C++ libraries, but it doesn't seem either of them have an automatic generator (writing bindings is rather time-consuming). diff --git a/seamless_communication/ggml/examples/python/api.h b/seamless_communication/ggml/examples/python/api.h new file mode 100644 index 0000000..8d565bd --- /dev/null +++ b/seamless_communication/ggml/examples/python/api.h @@ -0,0 +1,14 @@ +/* + List here all the headers you want to expose in the Python bindings, + then run `python regenerate.py` (see details in README.md) +*/ + +#include "ggml.h" +#include "ggml-metal.h" +#include "ggml-opencl.h" + +// Headers below are currently only present in the llama.cpp repository, comment them out if you don't have them. +#include "k_quants.h" +#include "ggml-alloc.h" +#include "ggml-cuda.h" +#include "ggml-mpi.h" \ No newline at end of file diff --git a/seamless_communication/ggml/examples/python/example_add_quant.py b/seamless_communication/ggml/examples/python/example_add_quant.py new file mode 100644 index 0000000..cecb44e --- /dev/null +++ b/seamless_communication/ggml/examples/python/example_add_quant.py @@ -0,0 +1,25 @@ +from ggml import lib, ffi +from ggml.utils import init, copy, numpy +import numpy as np + +ctx = init(mem_size=12*1024*1024) # automatically freed when pointer is GC'd +n = 256 +n_threads = 4 + +a = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n) +b = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n) # can't both be quantized +sum = lib.ggml_add(ctx, a, b) # all zeroes for now. Will be quantized too! + +# See cffi's doc on how to allocate native memory: it's very simple! 
+# https://cffi.readthedocs.io/en/latest/ref.html#ffi-interface +gf = ffi.new('struct ggml_cgraph*') +lib.ggml_build_forward_expand(gf, sum) + +copy(np.array([i for i in range(n)], np.float32), a) +copy(np.array([i*100 for i in range(n)], np.float32), b) + +lib.ggml_graph_compute_with_ctx(ctx, gf, n_threads) + +print(numpy(a, allow_copy=True)) +print(numpy(b)) +print(numpy(sum, allow_copy=True)) \ No newline at end of file diff --git a/seamless_communication/ggml/examples/python/example_test_all_quants.py b/seamless_communication/ggml/examples/python/example_test_all_quants.py new file mode 100644 index 0000000..8d3c966 --- /dev/null +++ b/seamless_communication/ggml/examples/python/example_test_all_quants.py @@ -0,0 +1,68 @@ +from ggml import ffi, lib +from ggml.utils import init, numpy, copy +import numpy as np +from math import pi, cos, sin, ceil + +import matplotlib.pyplot as plt + +ctx = init(mem_size=100*1024*1024) # Will be auto-GC'd +n = 256 + +orig = np.array([ + [ + cos(j * 2 * pi / n) * (sin(i * 2 * pi / n)) + for j in range(n) + ] + for i in range(n) +], np.float32) +orig_tensor = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, n, n) +copy(orig, orig_tensor) + +quants = [ + type for type in range(lib.GGML_TYPE_COUNT) + if lib.ggml_is_quantized(type) and + type not in [lib.GGML_TYPE_Q8_1, lib.GGML_TYPE_Q8_K] # Apparently not supported +] +# quants = [lib.GGML_TYPE_Q2_K] # Test a single one + +def get_name(type): + name = lib.ggml_type_name(type) + return ffi.string(name).decode('utf-8') if name else '?' + +quants.sort(key=get_name) +quants.insert(0, None) +print(quants) + +ncols=4 +nrows = ceil(len(quants) / ncols) + +plt.figure(figsize=(ncols * 5, nrows * 5), layout='tight') + +for i, type in enumerate(quants): + plt.subplot(nrows, ncols, i + 1) + try: + if type == None: + plt.title('Original') + plt.imshow(orig) + else: + quantized_tensor = lib.ggml_new_tensor_2d(ctx, type, n, n) + copy(orig_tensor, quantized_tensor) + quantized = numpy(quantized_tensor, allow_copy=True) + d = quantized - orig + results = { + "l2": np.linalg.norm(d, 2), + "linf": np.linalg.norm(d, np.inf), + "compression": + round(lib.ggml_nbytes(orig_tensor) / + lib.ggml_nbytes(quantized_tensor), 1) + } + name = get_name(type) + print(f'{name}: {results}') + + plt.title(f'{name} ({results["compression"]}x smaller)') + plt.imshow(quantized, interpolation='nearest') + + except Exception as e: + print(f'Error: {e}') + +plt.show() \ No newline at end of file diff --git a/seamless_communication/ggml/examples/python/ggml/__init__.py b/seamless_communication/ggml/examples/python/ggml/__init__.py new file mode 100644 index 0000000..31a1910 --- /dev/null +++ b/seamless_communication/ggml/examples/python/ggml/__init__.py @@ -0,0 +1,58 @@ +""" + Python bindings for the ggml library. + + Usage example: + + from ggml import lib, ffi + from ggml.utils import init, copy, numpy + import numpy as np + + ctx = init(mem_size=10*1024*1024) + n = 1024 + n_threads = 4 + + a = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n) + b = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n) + sum = lib.ggml_add(ctx, a, b) + + gf = ffi.new('struct ggml_cgraph*') + lib.ggml_build_forward_expand(gf, sum) + + copy(np.array([i for i in range(n)], np.float32), a) + copy(np.array([i*100 for i in range(n)], np.float32), b) + lib.ggml_graph_compute_with_ctx(ctx, gf, n_threads) + + print(numpy(sum, allow_copy=True)) + + See https://cffi.readthedocs.io/en/latest/cdef.html for more on cffi. 
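+ +    The shared library is located via the GGML_LIBRARY environment variable when it is set; +    otherwise platform-specific names are tried (libggml_shared.so / libllama.so, or the +    .dll / .dylib equivalents), as implemented in the loading code below.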
+""" + +try: + from ggml.cffi import ffi as ffi +except ImportError as e: + raise ImportError(f"Couldn't find ggml bindings ({e}). Run `python regenerate.py` or check your PYTHONPATH.") + +import os, platform + +__exact_library = os.environ.get("GGML_LIBRARY") +if __exact_library: + __candidates = [__exact_library] +elif platform.system() == "Windows": + __candidates = ["ggml_shared.dll", "llama.dll"] +else: + __candidates = ["libggml_shared.so", "libllama.so"] + if platform.system() == "Darwin": + __candidates += ["libggml_shared.dylib", "libllama.dylib"] + +for i, name in enumerate(__candidates): + try: + # This is where all the functions, enums and constants are defined + lib = ffi.dlopen(name) + except OSError: + if i < len(__candidates) - 1: + continue + raise OSError(f"Couldn't find ggml's shared library (tried names: {__candidates}). Add its directory to DYLD_LIBRARY_PATH (on Mac) or LD_LIBRARY_PATH, or define GGML_LIBRARY.") + +# This contains the cffi helpers such as new, cast, string, etc. +# https://cffi.readthedocs.io/en/latest/ref.html#ffi-interface +ffi = ffi diff --git a/seamless_communication/ggml/examples/python/ggml/__init__.pyi b/seamless_communication/ggml/examples/python/ggml/__init__.pyi new file mode 100644 index 0000000..1a764b0 --- /dev/null +++ b/seamless_communication/ggml/examples/python/ggml/__init__.pyi @@ -0,0 +1,2431 @@ +# auto-generated file +import ggml.ffi as ffi +import numpy as np +class lib: + @property + def GGML_BACKEND_CPU(self) -> int: ... + @property + def GGML_BACKEND_GPU(self) -> int: ... + @property + def GGML_BACKEND_GPU_SPLIT(self) -> int: ... + @property + def GGML_FTYPE_ALL_F32(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_F16(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q2_K(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q3_K(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q4_0(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q4_1(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q4_1_SOME_F16(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q4_K(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q5_0(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q5_1(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q5_K(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q6_K(self) -> int: ... + @property + def GGML_FTYPE_MOSTLY_Q8_0(self) -> int: ... + @property + def GGML_FTYPE_UNKNOWN(self) -> int: ... + @property + def GGML_LINESEARCH_BACKTRACKING_ARMIJO(self) -> int: ... + @property + def GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE(self) -> int: ... + @property + def GGML_LINESEARCH_BACKTRACKING_WOLFE(self) -> int: ... + @property + def GGML_LINESEARCH_DEFAULT(self) -> int: ... + @property + def GGML_LINESEARCH_FAIL(self) -> int: ... + @property + def GGML_LINESEARCH_INVALID_PARAMETERS(self) -> int: ... + @property + def GGML_LINESEARCH_MAXIMUM_ITERATIONS(self) -> int: ... + @property + def GGML_LINESEARCH_MAXIMUM_STEP(self) -> int: ... + @property + def GGML_LINESEARCH_MINIMUM_STEP(self) -> int: ... + @property + def GGML_OBJECT_GRAPH(self) -> int: ... + @property + def GGML_OBJECT_TENSOR(self) -> int: ... + @property + def GGML_OBJECT_WORK_BUFFER(self) -> int: ... + @property + def GGML_OPT_ADAM(self) -> int: ... + @property + def GGML_OPT_DID_NOT_CONVERGE(self) -> int: ... + @property + def GGML_OPT_FAIL(self) -> int: ... + @property + def GGML_OPT_INVALID_WOLFE(self) -> int: ... + @property + def GGML_OPT_LBFGS(self) -> int: ... + @property + def GGML_OPT_NO_CONTEXT(self) -> int: ... 
+ @property + def GGML_OPT_OK(self) -> int: ... + @property + def GGML_OP_ACC(self) -> int: ... + @property + def GGML_OP_ADD(self) -> int: ... + @property + def GGML_OP_ADD1(self) -> int: ... + @property + def GGML_OP_ALIBI(self) -> int: ... + @property + def GGML_OP_ARGMAX(self) -> int: ... + @property + def GGML_OP_CLAMP(self) -> int: ... + @property + def GGML_OP_CONT(self) -> int: ... + @property + def GGML_OP_CONV_1D(self) -> int: ... + @property + def GGML_OP_CONV_2D(self) -> int: ... + @property + def GGML_OP_COUNT(self) -> int: ... + @property + def GGML_OP_CPY(self) -> int: ... + @property + def GGML_OP_CROSS_ENTROPY_LOSS(self) -> int: ... + @property + def GGML_OP_CROSS_ENTROPY_LOSS_BACK(self) -> int: ... + @property + def GGML_OP_DIAG(self) -> int: ... + @property + def GGML_OP_DIAG_MASK_INF(self) -> int: ... + @property + def GGML_OP_DIAG_MASK_ZERO(self) -> int: ... + @property + def GGML_OP_DIV(self) -> int: ... + @property + def GGML_OP_DUP(self) -> int: ... + @property + def GGML_OP_FLASH_ATTN(self) -> int: ... + @property + def GGML_OP_FLASH_ATTN_BACK(self) -> int: ... + @property + def GGML_OP_FLASH_FF(self) -> int: ... + @property + def GGML_OP_GET_ROWS(self) -> int: ... + @property + def GGML_OP_GET_ROWS_BACK(self) -> int: ... + @property + def GGML_OP_LOG(self) -> int: ... + @property + def GGML_OP_MAP_BINARY(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM1(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM1_F32(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM2(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM2_F32(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM3(self) -> int: ... + @property + def GGML_OP_MAP_CUSTOM3_F32(self) -> int: ... + @property + def GGML_OP_MAP_UNARY(self) -> int: ... + @property + def GGML_OP_MEAN(self) -> int: ... + @property + def GGML_OP_MUL(self) -> int: ... + @property + def GGML_OP_MUL_MAT(self) -> int: ... + @property + def GGML_OP_NONE(self) -> int: ... + @property + def GGML_OP_NORM(self) -> int: ... + @property + def GGML_OP_OUT_PROD(self) -> int: ... + @property + def GGML_OP_PERMUTE(self) -> int: ... + @property + def GGML_OP_POOL_1D(self) -> int: ... + @property + def GGML_OP_POOL_2D(self) -> int: ... + @property + def GGML_OP_POOL_AVG(self) -> int: ... + @property + def GGML_OP_POOL_COUNT(self) -> int: ... + @property + def GGML_OP_POOL_MAX(self) -> int: ... + @property + def GGML_OP_REPEAT(self) -> int: ... + @property + def GGML_OP_REPEAT_BACK(self) -> int: ... + @property + def GGML_OP_RESHAPE(self) -> int: ... + @property + def GGML_OP_RMS_NORM(self) -> int: ... + @property + def GGML_OP_RMS_NORM_BACK(self) -> int: ... + @property + def GGML_OP_ROPE(self) -> int: ... + @property + def GGML_OP_ROPE_BACK(self) -> int: ... + @property + def GGML_OP_SCALE(self) -> int: ... + @property + def GGML_OP_SET(self) -> int: ... + @property + def GGML_OP_SILU_BACK(self) -> int: ... + @property + def GGML_OP_SOFT_MAX(self) -> int: ... + @property + def GGML_OP_SOFT_MAX_BACK(self) -> int: ... + @property + def GGML_OP_SQR(self) -> int: ... + @property + def GGML_OP_SQRT(self) -> int: ... + @property + def GGML_OP_SUB(self) -> int: ... + @property + def GGML_OP_SUM(self) -> int: ... + @property + def GGML_OP_SUM_ROWS(self) -> int: ... + @property + def GGML_OP_TRANSPOSE(self) -> int: ... + @property + def GGML_OP_UNARY(self) -> int: ... + @property + def GGML_OP_VIEW(self) -> int: ... + @property + def GGML_OP_WIN_PART(self) -> int: ... + @property + def GGML_OP_WIN_UNPART(self) -> int: ... 
+ @property + def GGML_TASK_COMPUTE(self) -> int: ... + @property + def GGML_TASK_FINALIZE(self) -> int: ... + @property + def GGML_TASK_INIT(self) -> int: ... + @property + def GGML_TYPE_COUNT(self) -> int: ... + @property + def GGML_TYPE_F16(self) -> int: ... + @property + def GGML_TYPE_F32(self) -> int: ... + @property + def GGML_TYPE_I16(self) -> int: ... + @property + def GGML_TYPE_I32(self) -> int: ... + @property + def GGML_TYPE_I8(self) -> int: ... + @property + def GGML_TYPE_Q2_K(self) -> int: ... + @property + def GGML_TYPE_Q3_K(self) -> int: ... + @property + def GGML_TYPE_Q4_0(self) -> int: ... + @property + def GGML_TYPE_Q4_1(self) -> int: ... + @property + def GGML_TYPE_Q4_K(self) -> int: ... + @property + def GGML_TYPE_Q5_0(self) -> int: ... + @property + def GGML_TYPE_Q5_1(self) -> int: ... + @property + def GGML_TYPE_Q5_K(self) -> int: ... + @property + def GGML_TYPE_Q6_K(self) -> int: ... + @property + def GGML_TYPE_Q8_0(self) -> int: ... + @property + def GGML_TYPE_Q8_1(self) -> int: ... + @property + def GGML_TYPE_Q8_K(self) -> int: ... + @property + def GGML_UNARY_OP_ABS(self) -> int: ... + @property + def GGML_UNARY_OP_ELU(self) -> int: ... + @property + def GGML_UNARY_OP_GELU(self) -> int: ... + @property + def GGML_UNARY_OP_GELU_QUICK(self) -> int: ... + @property + def GGML_UNARY_OP_NEG(self) -> int: ... + @property + def GGML_UNARY_OP_RELU(self) -> int: ... + @property + def GGML_UNARY_OP_SGN(self) -> int: ... + @property + def GGML_UNARY_OP_SILU(self) -> int: ... + @property + def GGML_UNARY_OP_STEP(self) -> int: ... + @property + def GGML_UNARY_OP_TANH(self) -> int: ... + @property + def GGUF_TYPE_ARRAY(self) -> int: ... + @property + def GGUF_TYPE_BOOL(self) -> int: ... + @property + def GGUF_TYPE_COUNT(self) -> int: ... + @property + def GGUF_TYPE_FLOAT32(self) -> int: ... + @property + def GGUF_TYPE_INT16(self) -> int: ... + @property + def GGUF_TYPE_INT32(self) -> int: ... + @property + def GGUF_TYPE_INT8(self) -> int: ... + @property + def GGUF_TYPE_STRING(self) -> int: ... + @property + def GGUF_TYPE_UINT16(self) -> int: ... + @property + def GGUF_TYPE_UINT32(self) -> int: ... + @property + def GGUF_TYPE_UINT8(self) -> int: ... + def abort_callback(data: ffi.CData) -> bool: + """ + abort ggml_graph_compute when true + + bool (*abort_callback)(void * data); + """ + ... + def dequantize_row_q2_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """ + Dequantization + + void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); + """ + ... + def dequantize_row_q3_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);""" + ... + def dequantize_row_q4_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);""" + ... + def dequantize_row_q5_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);""" + ... + def dequantize_row_q6_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);""" + ... + def dequantize_row_q8_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);""" + ... 
+ def ggml_abs(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_abs_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_acc(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + """ + ... + def ggml_acc_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + """ + ... + def ggml_add(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_add1(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_add1_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_add_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_alibi(ctx: ffi.CData, a: ffi.CData, n_past: int, n_head: int, bias_max: float) -> ffi.CData: + """ + alibi position embedding + in-place, returns view(a) + + struct ggml_tensor * ggml_alibi( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_head, + float bias_max); + """ + ... + def ggml_allocr_alloc(alloc: ffi.CData, tensor: ffi.CData) -> None: + """GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);""" + ... + def ggml_allocr_alloc_graph(alloc: ffi.CData, graph: ffi.CData) -> int: + """GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);""" + ... + def ggml_allocr_free(alloc: ffi.CData) -> None: + """GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);""" + ... + def ggml_allocr_is_measure(alloc: ffi.CData) -> bool: + """GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);""" + ... + def ggml_allocr_new(data: ffi.CData, size: int, alignment: int) -> ffi.CData: + """GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);""" + ... + def ggml_allocr_new_measure(alignment: int) -> ffi.CData: + """GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);""" + ... + def ggml_allocr_reset(alloc: ffi.CData) -> None: + """GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);""" + ... 
+ def ggml_allocr_set_parse_seq(alloc: ffi.CData, list: ffi.CData, n: int) -> None: + """ + tell the allocator to parse nodes following the order described in the list + you should call this if your graph are optimized to execute out-of-order + + GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n); + """ + ... + def ggml_are_same_shape(t0: ffi.CData, t1: ffi.CData) -> bool: + """ GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);""" + ... + def ggml_argmax(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + argmax along rows + + GGML_API struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_blck_size(type: int) -> int: + """ GGML_API int ggml_blck_size (enum ggml_type type);""" + ... + def ggml_build_backward(ctx: ffi.CData, gf: ffi.CData, keep: bool) -> ffi.CData: + """ GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);""" + ... + def ggml_build_forward(tensor: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);""" + ... + def ggml_build_forward_ctx(ctx: ffi.CData, tensor: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);""" + ... + def ggml_build_forward_expand(cgraph: ffi.CData, tensor: ffi.CData) -> None: + """ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);""" + ... + def ggml_cl_can_mul_mat(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> bool: + """bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);""" + ... + def ggml_cl_free_data(tensor: ffi.CData) -> None: + """void ggml_cl_free_data(const struct ggml_tensor* tensor);""" + ... + def ggml_cl_host_free(ptr: ffi.CData) -> None: + """void ggml_cl_host_free(void * ptr);""" + ... + def ggml_cl_host_malloc(size: int) -> ffi.CData: + """void * ggml_cl_host_malloc(size_t size);""" + ... + def ggml_cl_init() -> None: + """void ggml_cl_init(void);""" + ... + def ggml_cl_mul(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> None: + """void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);""" + ... + def ggml_cl_mul_mat(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData, wdata: ffi.CData, wsize: int) -> None: + """void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);""" + ... + def ggml_cl_mul_mat_get_wsize(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> int: + """size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);""" + ... + def ggml_cl_transform_tensor(data: ffi.CData, tensor: ffi.CData) -> None: + """void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);""" + ... + def ggml_clamp(ctx: ffi.CData, a: ffi.CData, min: float, max: float) -> ffi.CData: + """ + clamp + in-place, returns view(a) + + struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, + float min, + float max); + """ + ... + def ggml_cont(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + make contiguous + + GGML_API struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... 
+ def ggml_cont_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + make contiguous, in-place + + GGML_API struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_conv_1d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s0: int, p0: int, d0: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, // stride + int p0, // padding + int d0); // dilation + """ + ... + def ggml_conv_1d_ph(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s: int, d: int) -> ffi.CData: + """ + conv_1d with padding = half + alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + + GGML_API struct ggml_tensor * ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d); + """ + ... + def ggml_conv_2d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s0: int, s1: int, p0: int, p1: int, d0: int, d1: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); + """ + ... + def ggml_cpu_has_arm_fma() -> int: + """ GGML_API int ggml_cpu_has_arm_fma (void);""" + ... + def ggml_cpu_has_avx() -> int: + """ GGML_API int ggml_cpu_has_avx (void);""" + ... + def ggml_cpu_has_avx2() -> int: + """ GGML_API int ggml_cpu_has_avx2 (void);""" + ... + def ggml_cpu_has_avx512() -> int: + """ GGML_API int ggml_cpu_has_avx512 (void);""" + ... + def ggml_cpu_has_avx512_vbmi() -> int: + """ GGML_API int ggml_cpu_has_avx512_vbmi(void);""" + ... + def ggml_cpu_has_avx512_vnni() -> int: + """ GGML_API int ggml_cpu_has_avx512_vnni(void);""" + ... + def ggml_cpu_has_blas() -> int: + """ GGML_API int ggml_cpu_has_blas (void);""" + ... + def ggml_cpu_has_clblast() -> int: + """ GGML_API int ggml_cpu_has_clblast (void);""" + ... + def ggml_cpu_has_cublas() -> int: + """ GGML_API int ggml_cpu_has_cublas (void);""" + ... + def ggml_cpu_has_f16c() -> int: + """ GGML_API int ggml_cpu_has_f16c (void);""" + ... + def ggml_cpu_has_fma() -> int: + """ GGML_API int ggml_cpu_has_fma (void);""" + ... + def ggml_cpu_has_fp16_va() -> int: + """ GGML_API int ggml_cpu_has_fp16_va (void);""" + ... + def ggml_cpu_has_gpublas() -> int: + """ GGML_API int ggml_cpu_has_gpublas (void);""" + ... + def ggml_cpu_has_neon() -> int: + """ GGML_API int ggml_cpu_has_neon (void);""" + ... + def ggml_cpu_has_sse3() -> int: + """ GGML_API int ggml_cpu_has_sse3 (void);""" + ... + def ggml_cpu_has_vsx() -> int: + """ GGML_API int ggml_cpu_has_vsx (void);""" + ... + def ggml_cpu_has_wasm_simd() -> int: + """ GGML_API int ggml_cpu_has_wasm_simd (void);""" + ... + def ggml_cpy(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + a -> b, return view(b) + + GGML_API struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_cpy_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + a -> b, in-place, return view(b) + + GGML_API struct ggml_tensor * ggml_cpy_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_cross_entropy_loss(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... 
+ def ggml_cross_entropy_loss_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + """ + ... + def ggml_cuda_assign_buffers(tensor: ffi.CData) -> None: + """GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);""" + ... + def ggml_cuda_assign_buffers_force_inplace(tensor: ffi.CData) -> None: + """GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);""" + ... + def ggml_cuda_assign_buffers_no_scratch(tensor: ffi.CData) -> None: + """GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);""" + ... + def ggml_cuda_can_mul_mat(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> bool: + """GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);""" + ... + def ggml_cuda_compute_forward(params: ffi.CData, tensor: ffi.CData) -> bool: + """GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);""" + ... + def ggml_cuda_free_data(tensor: ffi.CData) -> None: + """GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);""" + ... + def ggml_cuda_free_scratch() -> None: + """GGML_API void ggml_cuda_free_scratch(void);""" + ... + def ggml_cuda_get_device_count() -> int: + """GGML_API int ggml_cuda_get_device_count(void);""" + ... + def ggml_cuda_get_device_description(device: int, description: ffi.CData, description_size: int) -> None: + """GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);""" + ... + def ggml_cuda_host_free(ptr: ffi.CData) -> None: + """GGML_API void ggml_cuda_host_free(void * ptr);""" + ... + def ggml_cuda_host_malloc(size: int) -> ffi.CData: + """GGML_API void * ggml_cuda_host_malloc(size_t size);""" + ... + def ggml_cuda_set_main_device(main_device: int) -> None: + """GGML_API void ggml_cuda_set_main_device(int main_device);""" + ... + def ggml_cuda_set_mul_mat_q(mul_mat_q: bool) -> None: + """GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);""" + ... + def ggml_cuda_set_scratch_size(scratch_size: int) -> None: + """GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);""" + ... + def ggml_cuda_set_tensor_split(tensor_split: ffi.CData) -> None: + """GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);""" + ... + def ggml_cuda_transform_tensor(data: ffi.CData, tensor: ffi.CData) -> None: + """GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);""" + ... + def ggml_cycles() -> int: + """ GGML_API int64_t ggml_cycles(void);""" + ... + def ggml_cycles_per_ms() -> int: + """ GGML_API int64_t ggml_cycles_per_ms(void);""" + ... + def ggml_diag(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_diag_mask_inf(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData: + """ + set elements above the diagonal to -INF + + GGML_API struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + """ + ... 
+ def ggml_diag_mask_inf_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + """ + ... + def ggml_diag_mask_zero(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData: + """ + set elements above the diagonal to 0 + + GGML_API struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + """ + ... + def ggml_diag_mask_zero_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + """ + ... + def ggml_div(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_div_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_dup(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_dup_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_dup_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_dup_tensor(ctx: ffi.CData, src: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);""" + ... + def ggml_element_size(tensor: ffi.CData) -> int: + """ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);""" + ... + def ggml_elu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_elu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_flash_attn(ctx: ffi.CData, q: ffi.CData, k: ffi.CData, v: ffi.CData, masked: bool) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_flash_attn( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + bool masked); + """ + ... + def ggml_flash_attn_back(ctx: ffi.CData, q: ffi.CData, k: ffi.CData, v: ffi.CData, d: ffi.CData, masked: bool) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked); + """ + ... + def ggml_flash_ff(ctx: ffi.CData, a: ffi.CData, b0: ffi.CData, b1: ffi.CData, c0: ffi.CData, c1: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_flash_ff( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b0, + struct ggml_tensor * b1, + struct ggml_tensor * c0, + struct ggml_tensor * c1); + """ + ... + def ggml_format_name(tensor: ffi.CData, fmt: ffi.CData, *args2) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);""" + ... 
+ def ggml_fp16_to_fp32(x: np.float16) -> float: + """ + convert FP16 <-> FP32 + + GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); + """ + ... + def ggml_fp16_to_fp32_row(x: ffi.CData, y: ffi.CData, n: int) -> None: + """ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);""" + ... + def ggml_fp32_to_fp16(x: float) -> np.float16: + """ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);""" + ... + def ggml_fp32_to_fp16_row(x: ffi.CData, y: ffi.CData, n: int) -> None: + """ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);""" + ... + def ggml_free(ctx: ffi.CData) -> None: + """ GGML_API void ggml_free(struct ggml_context * ctx);""" + ... + def ggml_ftype_to_ggml_type(ftype: int) -> int: + """ + TODO: temporary until model loading of ggml examples is refactored + + GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + """ + ... + def ggml_gelu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + TODO: double-check this computation is correct + + GGML_API struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_gelu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_gelu_quick(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_gelu_quick_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_get_data(tensor: ffi.CData) -> ffi.CData: + """ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);""" + ... + def ggml_get_data_f32(tensor: ffi.CData) -> ffi.CData: + """ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);""" + ... + def ggml_get_f32_1d(tensor: ffi.CData, i: int) -> float: + """ GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);""" + ... + def ggml_get_i32_1d(tensor: ffi.CData, i: int) -> int: + """ GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);""" + ... + def ggml_get_max_tensor_size(ctx: ffi.CData) -> int: + """ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);""" + ... + def ggml_get_mem_buffer(ctx: ffi.CData) -> ffi.CData: + """ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);""" + ... + def ggml_get_mem_size(ctx: ffi.CData) -> int: + """ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);""" + ... + def ggml_get_name(tensor: ffi.CData) -> ffi.CData: + """ GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);""" + ... + def ggml_get_no_alloc(ctx: ffi.CData) -> bool: + """ GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);""" + ... + def ggml_get_rows(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_get_rows_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_get_rows_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + """ + ... 
+ def ggml_get_tensor(ctx: ffi.CData, name: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);""" + ... + def ggml_get_unary_op(tensor: ffi.CData) -> int: + """ GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);""" + ... + def ggml_graph_compute(cgraph: ffi.CData, cplan: ffi.CData) -> int: + """ GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);""" + ... + def ggml_graph_compute_with_ctx(ctx: ffi.CData, cgraph: ffi.CData, n_threads: int) -> None: + """ + same as ggml_graph_compute() but the work data is allocated as a part of the context + note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + """ + ... + def ggml_graph_dump_dot(gb: ffi.CData, gf: ffi.CData, filename: ffi.CData) -> None: + """ + dump the graph into a file using the dot format + + GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); + """ + ... + def ggml_graph_export(cgraph: ffi.CData, fname: ffi.CData) -> None: + """ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);""" + ... + def ggml_graph_get_tensor(cgraph: ffi.CData, name: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);""" + ... + def ggml_graph_import(fname: ffi.CData, ctx_data: ffi.CData, ctx_eval: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);""" + ... + def ggml_graph_overhead() -> int: + """ GGML_API size_t ggml_graph_overhead(void);""" + ... + def ggml_graph_plan(cgraph: ffi.CData, n_threads: int) -> ffi.CData: + """ + ggml_graph_plan() has to be called before ggml_graph_compute() + when plan.work_size > 0, caller must allocate memory for plan.work_data + + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + """ + ... + def ggml_graph_print(cgraph: ffi.CData) -> None: + """ + print info and performance information for the graph + + GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); + """ + ... + def ggml_graph_reset(cgraph: ffi.CData) -> None: + """ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);""" + ... + def ggml_init(params: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);""" + ... + def ggml_init_cublas() -> None: + """GGML_API void ggml_init_cublas(void);""" + ... + def ggml_internal_get_type_traits(type: int) -> ffi.CData: + """ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);""" + ... + def ggml_is_contiguous(tensor: ffi.CData) -> bool: + """ GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);""" + ... + def ggml_is_numa() -> bool: + """ GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node""" + ... + def ggml_is_permuted(tensor: ffi.CData) -> bool: + """ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);""" + ... + def ggml_is_quantized(type: int) -> bool: + """ GGML_API bool ggml_is_quantized(enum ggml_type type);""" + ... 
+ def ggml_is_transposed(tensor: ffi.CData) -> bool: + """ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);""" + ... + def ggml_log(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_log_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_map_binary_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use ggml_map_custom2 instead"); + """ + ... + def ggml_map_binary_inplace_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + """ + ... + def ggml_map_custom1(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + """ + ... + def ggml_map_custom1_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1 instead"); + """ + ... + def ggml_map_custom1_inplace(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + """ + ... + def ggml_map_custom1_inplace_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + """ + ... + def ggml_map_custom2(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + """ + ... + def ggml_map_custom2_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2 instead"); + """ + ... + def ggml_map_custom2_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + """ + ... 
+ def ggml_map_custom2_inplace_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + """ + ... + def ggml_map_custom3(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + """ + ... + def ggml_map_custom3_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3 instead"); + """ + ... + def ggml_map_custom3_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + """ + ... + def ggml_map_custom3_inplace_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3_inplace instead"); + """ + ... + def ggml_map_unary_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1 instead"); + """ + ... + def ggml_map_unary_inplace_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData: + """ + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + """ + ... + def ggml_mean(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + mean along rows + + GGML_API struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_metal_add_buffer(ctx: ffi.CData, name: ffi.CData, data: ffi.CData, size: int, max_size: int) -> bool: + """ + creates a mapping between a host memory buffer and a device memory buffer + - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute + - the mapping is used during computation to determine the arguments of the compute kernels + - you don't need to keep the host memory buffer allocated as it is never accessed by Metal + - max_size specifies the maximum size of a tensor and is used to create shared views such + that it is guaranteed that the tensor will fit in at least one of the views + + + bool ggml_metal_add_buffer( + struct ggml_metal_context * ctx, + const char * name, + void * data, + size_t size, + size_t max_size); + """ + ... 
+ def ggml_metal_free(ctx: ffi.CData) -> None: + """void ggml_metal_free(struct ggml_metal_context * ctx);""" + ... + def ggml_metal_get_concur_list(ctx: ffi.CData) -> ffi.CData: + """ + output the concur_list for ggml_alloc + + int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); + """ + ... + def ggml_metal_get_tensor(ctx: ffi.CData, t: ffi.CData) -> None: + """ + get data from the device into host memory + + void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); + """ + ... + def ggml_metal_graph_compute(ctx: ffi.CData, gf: ffi.CData) -> None: + """ + same as ggml_graph_compute but uses Metal + creates gf->n_threads command buffers in parallel + + void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); + """ + ... + def ggml_metal_graph_find_concurrency(ctx: ffi.CData, gf: ffi.CData, check_mem: bool) -> None: + """ + try to find operations that can be run concurrently in the graph + you should run it again if the topology of your graph changes + + void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem); + """ + ... + def ggml_metal_host_free(data: ffi.CData) -> None: + """void ggml_metal_host_free (void * data);""" + ... + def ggml_metal_host_malloc(n: int) -> ffi.CData: + """void * ggml_metal_host_malloc(size_t n);""" + ... + def ggml_metal_if_optimized(ctx: ffi.CData) -> int: + """ + if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized + + int ggml_metal_if_optimized(struct ggml_metal_context * ctx); + """ + ... + def ggml_metal_init(n_cb: int) -> ffi.CData: + """ + number of command buffers to use + + struct ggml_metal_context * ggml_metal_init(int n_cb); + """ + ... + def ggml_metal_set_n_cb(ctx: ffi.CData, n_cb: int) -> None: + """ + set the number of command buffers to use + + void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); + """ + ... + def ggml_metal_set_tensor(ctx: ffi.CData, t: ffi.CData) -> None: + """ + set data from host memory into the device + + void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); + """ + ... + def ggml_mpi_backend_free() -> None: + """void ggml_mpi_backend_free(void);""" + ... + def ggml_mpi_backend_init() -> None: + """void ggml_mpi_backend_init(void);""" + ... + def ggml_mpi_eval_init(ctx_mpi: ffi.CData, n_tokens: ffi.CData, n_past: ffi.CData, n_threads: ffi.CData) -> None: + """ + void ggml_mpi_eval_init( + struct ggml_mpi_context * ctx_mpi, + int * n_tokens, + int * n_past, + int * n_threads); + """ + ... + def ggml_mpi_free(ctx: ffi.CData) -> None: + """void ggml_mpi_free(struct ggml_mpi_context * ctx);""" + ... + def ggml_mpi_graph_compute_post(ctx_mpi: ffi.CData, gf: ffi.CData, n_layers: int) -> None: + """ + void ggml_mpi_graph_compute_post( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers); + """ + ... + def ggml_mpi_graph_compute_pre(ctx_mpi: ffi.CData, gf: ffi.CData, n_layers: int) -> None: + """ + void ggml_mpi_graph_compute_pre( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers); + """ + ... + def ggml_mpi_init() -> ffi.CData: + """struct ggml_mpi_context * ggml_mpi_init(void);""" + ... + def ggml_mpi_rank(ctx: ffi.CData) -> int: + """int ggml_mpi_rank(struct ggml_mpi_context * ctx);""" + ... 
+ def ggml_mul(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_mul_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_mul_mat(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + A: n columns, m rows + B: n columns, p rows (i.e. we transpose it internally) + result is m columns, p rows + + GGML_API struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_nbytes(tensor: ffi.CData) -> int: + """ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);""" + ... + def ggml_nbytes_pad(tensor: ffi.CData) -> int: + """ GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN""" + ... + def ggml_nbytes_split(tensor: ffi.CData, nrows_split: int) -> int: + """ GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);""" + ... + def ggml_neg(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_neg_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_nelements(tensor: ffi.CData) -> int: + """ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);""" + ... + def ggml_new_f32(ctx: ffi.CData, value: float) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);""" + ... + def ggml_new_graph(ctx: ffi.CData) -> ffi.CData: + """ + graph allocation in a context + + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); + """ + ... + def ggml_new_i32(ctx: ffi.CData, value: int) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);""" + ... + def ggml_new_tensor(ctx: ffi.CData, type: int, n_dims: int, ne: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t *ne); + """ + ... + def ggml_new_tensor_1d(ctx: ffi.CData, type: int, ne0: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0); + """ + ... + def ggml_new_tensor_2d(ctx: ffi.CData, type: int, ne0: int, ne1: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1); + """ + ... + def ggml_new_tensor_3d(ctx: ffi.CData, type: int, ne0: int, ne1: int, ne2: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2); + """ + ... + def ggml_new_tensor_4d(ctx: ffi.CData, type: int, ne0: int, ne1: int, ne2: int, ne3: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + """ + ... 
+ def ggml_norm(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + normalize along rows + TODO: eps is hardcoded to 1e-5 for now + + GGML_API struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_norm_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_nrows(tensor: ffi.CData) -> int: + """ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);""" + ... + def ggml_numa_init() -> None: + """ GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems""" + ... + def ggml_op_name(op: int) -> ffi.CData: + """ GGML_API const char * ggml_op_name (enum ggml_op op);""" + ... + def ggml_op_symbol(op: int) -> ffi.CData: + """ GGML_API const char * ggml_op_symbol(enum ggml_op op);""" + ... + def ggml_opt(ctx: ffi.CData, params: ffi.CData, f: ffi.CData) -> int: + """ + optimize the function defined by the tensor f + + GGML_API enum ggml_opt_result ggml_opt( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f); + """ + ... + def ggml_opt_default_params(type: int) -> ffi.CData: + """ GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);""" + ... + def ggml_opt_init(ctx: ffi.CData, opt: ffi.CData, params: ffi.CData, nx: int) -> None: + """ + initialize optimizer context + + GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx); + """ + ... + def ggml_opt_resume(ctx: ffi.CData, opt: ffi.CData, f: ffi.CData) -> int: + """ + continue optimizing the function defined by the tensor f + + GGML_API enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f); + """ + ... + def ggml_opt_resume_g(ctx: ffi.CData, opt: ffi.CData, f: ffi.CData, gf: ffi.CData, gb: ffi.CData) -> int: + """ + continue optimizing the function defined by the tensor f + + GGML_API enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb); + """ + ... + def ggml_out_prod(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + A: m columns, n rows, + B: p columns, n rows, + result is m columns, p rows + + GGML_API struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_permute(ctx: ffi.CData, a: ffi.CData, axis0: int, axis1: int, axis2: int, axis3: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3); + """ + ... + def ggml_pool_1d(ctx: ffi.CData, a: ffi.CData, op: int, k0: int, s0: int, p0: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, // kernel size + int s0, // stride + int p0); // padding + """ + ... + def ggml_pool_2d(ctx: ffi.CData, a: ffi.CData, op: int, k0: int, k1: int, s0: int, s1: int, p0: int, p1: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1); + """ + ... 
+ def ggml_print_object(obj: ffi.CData) -> None: + """ GGML_API void ggml_print_object (const struct ggml_object * obj);""" + ... + def ggml_print_objects(ctx: ffi.CData) -> None: + """ GGML_API void ggml_print_objects(const struct ggml_context * ctx);""" + ... + def ggml_quantize_chunk(type: int, src: ffi.CData, dst: ffi.CData, start: int, n: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);""" + ... + def ggml_quantize_q2_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ + Quantization with histogram collection + + size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); + """ + ... + def ggml_quantize_q3_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q4_0(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q4_1(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q4_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q5_0(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q5_1(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q5_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q6_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_quantize_q8_0(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int: + """ GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);""" + ... + def ggml_relu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_relu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_repeat(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + if a is the same shape as b, and a is not parameter, return a + otherwise, return a new tensor: repeat(a) to fit in b + + GGML_API struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... 
+ def ggml_repeat_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_reshape(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + return view(a), b specifies the new shape + TODO: when we start computing gradient, make a copy instead of view + + GGML_API struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_reshape_1d(ctx: ffi.CData, a: ffi.CData, ne0: int) -> ffi.CData: + """ + return view(a) + TODO: when we start computing gradient, make a copy instead of view + + GGML_API struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + """ + ... + def ggml_reshape_2d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1); + """ + ... + def ggml_reshape_3d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int) -> ffi.CData: + """ + return view(a) + TODO: when we start computing gradient, make a copy instead of view + + GGML_API struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + """ + ... + def ggml_reshape_4d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int, ne3: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + """ + ... + def ggml_rms_norm(ctx: ffi.CData, a: ffi.CData, eps: float) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_rms_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + """ + ... + def ggml_rms_norm_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + a - x + b - dy + TODO: update with configurable eps + + GGML_API struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_rms_norm_inplace(ctx: ffi.CData, a: ffi.CData, eps: float) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_rms_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + """ + ... + def ggml_rope(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int) -> ffi.CData: + """ + rotary position embedding + if mode & 1 == 1, skip n_past elements + if mode & 2 == 1, GPT-NeoX style + if mode & 4 == 1, ChatGLM style + TODO: avoid creating a new tensor every time + + GGML_API struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx); + """ + ... + def ggml_rope_back(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int) -> ffi.CData: + """ + rotary position embedding backward, i.e compute dx from dy + a - dy + + GGML_API struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx); + """ + ... 
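The reshape family above is documented as returning `view(a)` rather than a copy. A hedged sketch (same assumed `ggml.ffi`/`ggml.lib` import as earlier) that should make the shared storage visible by comparing data pointers:

from ggml import ffi, lib  # assumed import path for these generated bindings

params = ffi.new("struct ggml_init_params*")
params.mem_size = 1 * 1024 * 1024
ctx = lib.ggml_init(params[0])

flat = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 6)
grid = lib.ggml_reshape_2d(ctx, flat, 3, 2)   # view of `flat` with ne = [3, 2]
print(grid.data == flat.data)                 # same underlying buffer, so True
print([grid.ne[i] for i in range(2)])         # -> [3, 2]

lib.ggml_free(ctx)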
+ def ggml_rope_custom(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int, freq_base: float, freq_scale: float) -> ffi.CData: + """ + custom RoPE + + GGML_API struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale); + """ + ... + def ggml_rope_custom_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int, freq_base: float, freq_scale: float) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale); + """ + ... + def ggml_rope_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx); + """ + ... + def ggml_scale(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_scale_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_set(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData: + """ + b -> view(a,offset,nb1,nb2,3), return modified a + + GGML_API struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + """ + ... + def ggml_set_1d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + """ + ... + def ggml_set_1d_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + """ + ... + def ggml_set_2d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, offset: int) -> ffi.CData: + """ + b -> view(a,offset,nb1,nb2,3), return modified a + + GGML_API struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + """ + ... + def ggml_set_2d_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, offset: int) -> ffi.CData: + """ + b -> view(a,offset,nb1,nb2,3), return view(a) + + GGML_API struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + """ + ... + def ggml_set_f32(tensor: ffi.CData, value: float) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);""" + ... + def ggml_set_f32_1d(tensor: ffi.CData, i: int, value: float) -> None: + """ GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);""" + ... 
+ def ggml_set_i32(tensor: ffi.CData, value: int) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);""" + ... + def ggml_set_i32_1d(tensor: ffi.CData, i: int, value: int) -> None: + """ GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);""" + ... + def ggml_set_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData: + """ + b -> view(a,offset,nb1,nb2,3), return view(a) + + GGML_API struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + """ + ... + def ggml_set_name(tensor: ffi.CData, name: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);""" + ... + def ggml_set_no_alloc(ctx: ffi.CData, no_alloc: bool) -> None: + """ GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);""" + ... + def ggml_set_param(ctx: ffi.CData, tensor: ffi.CData) -> None: + """ + GGML_API void ggml_set_param( + struct ggml_context * ctx, + struct ggml_tensor * tensor); + """ + ... + def ggml_set_scratch(ctx: ffi.CData, scratch: ffi.CData) -> int: + """ GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);""" + ... + def ggml_set_zero(tensor: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);""" + ... + def ggml_sgn(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sgn_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_silu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_silu( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_silu_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + a - x + b - dy + + GGML_API struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_silu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_silu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_soft_max(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_soft_max_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_soft_max_back_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_soft_max_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + in-place, returns view(a) + + GGML_API struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... 
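Two different "set" families appear above: `ggml_set`/`ggml_set_1d`/`ggml_set_2d` build graph nodes that only take effect when a graph is computed, whereas `ggml_set_f32`, `ggml_set_i32`, `ggml_set_f32_1d`, and `ggml_set_i32_1d` write into an existing tensor's buffer immediately. A hedged sketch of the eager variants (same assumed import path; `ggml_get_f32_1d` is declared elsewhere in the header):

from ggml import ffi, lib  # assumed import path for these generated bindings

params = ffi.new("struct ggml_init_params*")
params.mem_size = 1 * 1024 * 1024
ctx = lib.ggml_init(params[0])

t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 4)
lib.ggml_set_f32(t, 1.5)           # eager: fill every element
lib.ggml_set_f32_1d(t, 2, -1.0)    # eager: overwrite a single element
lib.ggml_set_name(t, b"bias")      # attach a debug name to the tensor
print([lib.ggml_get_f32_1d(t, i) for i in range(4)])   # -> [1.5, 1.5, -1.0, 1.5]

lib.ggml_free(ctx)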
+ def ggml_sqr(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sqr_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sqrt(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sqrt_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_step(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_step_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sub(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_sub_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + """ + ... + def ggml_sum(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + return scalar + + GGML_API struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_sum_rows(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] + + GGML_API struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_tanh(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_tanh_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_tensor_overhead() -> int: + """ + use this to compute the memory overhead of a tensor + + GGML_API size_t ggml_tensor_overhead(void); + """ + ... + def ggml_time_init() -> None: + """ GGML_API void ggml_time_init(void); // call this once at the beginning of the program""" + ... + def ggml_time_ms() -> int: + """ GGML_API int64_t ggml_time_ms(void);""" + ... + def ggml_time_us() -> int: + """ GGML_API int64_t ggml_time_us(void);""" + ... + def ggml_transpose(ctx: ffi.CData, a: ffi.CData) -> ffi.CData: + """ + alias for ggml_permute(ctx, a, 1, 0, 2, 3) + + GGML_API struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a); + """ + ... + def ggml_type_name(type: int) -> ffi.CData: + """ GGML_API const char * ggml_type_name(enum ggml_type type);""" + ... + def ggml_type_size(type: int) -> int: + """ GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block""" + ... + def ggml_type_sizef(type: int) -> float: + """ GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float""" + ... 
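Most of the operators above only build graph nodes; values appear once the graph is run. A hedged end-to-end sketch using `ggml_sum_rows`: the graph helpers (`ggml_build_forward_expand`, `ggml_graph_compute_with_ctx`, `ggml_get_f32_1d`) are declared elsewhere in this header and assumed here, as is the `ggml.ffi`/`ggml.lib` import path.

from ggml import ffi, lib  # assumed import path for these generated bindings

params = ffi.new("struct ggml_init_params*")
params.mem_size = 16 * 1024 * 1024
ctx = lib.ggml_init(params[0])

a = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, 3, 2)   # ne = [3, 2]
for i in range(6):
    lib.ggml_set_f32_1d(a, i, float(i))                    # rows: [0,1,2] and [3,4,5]

rows = lib.ggml_sum_rows(ctx, a)                           # shape [3, 2] -> [1, 2]

gf = ffi.new("struct ggml_cgraph*")                        # zero-initialized graph struct
lib.ggml_build_forward_expand(gf, rows)
lib.ggml_graph_compute_with_ctx(ctx, gf, 1)                # n_threads = 1

print([lib.ggml_get_f32_1d(rows, i) for i in range(2)])    # -> [3.0, 12.0]
lib.ggml_free(ctx)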
+ def ggml_unary(ctx: ffi.CData, a: ffi.CData, op: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_unary( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + """ + ... + def ggml_unary_inplace(ctx: ffi.CData, a: ffi.CData, op: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_unary_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + """ + ... + def ggml_used_mem(ctx: ffi.CData) -> int: + """ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);""" + ... + def ggml_vec_dot_q2_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None: + """ + Dot product + + void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); + """ + ... + def ggml_vec_dot_q3_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None: + """void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);""" + ... + def ggml_vec_dot_q4_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None: + """void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);""" + ... + def ggml_vec_dot_q5_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None: + """void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);""" + ... + def ggml_vec_dot_q6_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None: + """void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);""" + ... + def ggml_view_1d(ctx: ffi.CData, a: ffi.CData, ne0: int, offset: int) -> ffi.CData: + """ + offset in bytes + + GGML_API struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + size_t offset); + """ + ... + def ggml_view_2d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, nb1: int, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, // row stride in bytes + size_t offset); + """ + ... + def ggml_view_3d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int, nb1: int, nb2: int, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + """ + ... + def ggml_view_4d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int, ne3: int, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData: + """ + GGML_API struct ggml_tensor * ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t nb3, + size_t offset); + """ + ... + def ggml_view_tensor(ctx: ffi.CData, src: ffi.CData) -> ffi.CData: + """ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);""" + ... 
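The view constructors above take strides and offsets in bytes, so in practice they are derived from the parent tensor's `nb[]` array. A hedged sketch selecting the bottom-right 2x2 block of a 4x4 f32 tensor (same assumed import path as the earlier sketches):

from ggml import ffi, lib  # assumed import path for these generated bindings

params = ffi.new("struct ggml_init_params*")
params.mem_size = 1 * 1024 * 1024
ctx = lib.ggml_init(params[0])

a = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, 4, 4)
# Skip 2 columns (2 * nb[0] bytes) and 2 rows (2 * nb[1] bytes), and keep the
# parent row stride so the view walks the original rows.
v = lib.ggml_view_2d(ctx, a, 2, 2, a.nb[1], 2 * a.nb[0] + 2 * a.nb[1])
print([v.ne[i] for i in range(2)])   # -> [2, 2]

lib.ggml_free(ctx)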
+ def ggml_win_part(ctx: ffi.CData, a: ffi.CData, w: int) -> ffi.CData: + """ + partition into non-overlapping windows with padding if needed + example: + a: 768 64 64 1 + w: 14 + res: 768 14 14 25 + used in sam + + GGML_API struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w); + """ + ... + def ggml_win_unpart(ctx: ffi.CData, a: ffi.CData, w0: int, h0: int, w: int) -> ffi.CData: + """ + reverse of ggml_win_part + used in sam + + GGML_API struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w0, + int h0, + int w); + """ + ... + def gguf_add_tensor(ctx: ffi.CData, tensor: ffi.CData) -> None: + """ + manage tensor info + + GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); + """ + ... + def gguf_find_key(ctx: ffi.CData, key: ffi.CData) -> int: + """ GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);""" + ... + def gguf_find_tensor(ctx: ffi.CData, name: ffi.CData) -> int: + """ GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);""" + ... + def gguf_free(ctx: ffi.CData) -> None: + """ GGML_API void gguf_free(struct gguf_context * ctx);""" + ... + def gguf_get_alignment(ctx: ffi.CData) -> int: + """ GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);""" + ... + def gguf_get_arr_data(ctx: ffi.CData, i: int) -> ffi.CData: + """ GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);""" + ... + def gguf_get_arr_n(ctx: ffi.CData, i: int) -> int: + """ GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);""" + ... + def gguf_get_arr_str(ctx: ffi.CData, key_id: int, i: int) -> ffi.CData: + """ GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);""" + ... + def gguf_get_arr_type(ctx: ffi.CData, i: int) -> int: + """ GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);""" + ... + def gguf_get_data(ctx: ffi.CData) -> ffi.CData: + """ GGML_API void * gguf_get_data (struct gguf_context * ctx);""" + ... + def gguf_get_data_offset(ctx: ffi.CData) -> int: + """ GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);""" + ... + def gguf_get_key(ctx: ffi.CData, i: int) -> ffi.CData: + """ GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);""" + ... + def gguf_get_kv_type(ctx: ffi.CData, i: int) -> int: + """ GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);""" + ... + def gguf_get_meta_data(ctx: ffi.CData, data: ffi.CData) -> None: + """ GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);""" + ... + def gguf_get_meta_size(ctx: ffi.CData) -> int: + """ + get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + + GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx); + """ + ... + def gguf_get_n_kv(ctx: ffi.CData) -> int: + """ GGML_API int gguf_get_n_kv(struct gguf_context * ctx);""" + ... + def gguf_get_n_tensors(ctx: ffi.CData) -> int: + """ GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);""" + ... + def gguf_get_tensor_name(ctx: ffi.CData, i: int) -> ffi.CData: + """ GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);""" + ... + def gguf_get_tensor_offset(ctx: ffi.CData, i: int) -> int: + """ GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);""" + ... 
+ def gguf_get_val_bool(ctx: ffi.CData, i: int) -> bool: + """ GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_f32(ctx: ffi.CData, i: int) -> float: + """ GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_i16(ctx: ffi.CData, i: int) -> int: + """ GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_i32(ctx: ffi.CData, i: int) -> int: + """ GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_i8(ctx: ffi.CData, i: int) -> int: + """ GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_str(ctx: ffi.CData, i: int) -> ffi.CData: + """ GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_u16(ctx: ffi.CData, i: int) -> int: + """ GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_u32(ctx: ffi.CData, i: int) -> int: + """ GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);""" + ... + def gguf_get_val_u8(ctx: ffi.CData, i: int) -> int: + """ + results are undefined if the wrong type is used for the key + + GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i); + """ + ... + def gguf_get_version(ctx: ffi.CData) -> int: + """ GGML_API int gguf_get_version (struct gguf_context * ctx);""" + ... + def gguf_init_empty() -> ffi.CData: + """ GGML_API struct gguf_context * gguf_init_empty(void);""" + ... + def gguf_init_from_file(fname: ffi.CData, params: ffi.CData) -> ffi.CData: + """ GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);""" + ... + def gguf_set_arr_data(ctx: ffi.CData, key: ffi.CData, type: int, data: ffi.CData, n: int) -> None: + """ GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);""" + ... + def gguf_set_arr_str(ctx: ffi.CData, key: ffi.CData, data: ffi.CData, n: int) -> None: + """ GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);""" + ... + def gguf_set_kv(ctx: ffi.CData, src: ffi.CData) -> None: + """ + set or add KV pairs from another context + + GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src); + """ + ... + def gguf_set_tensor_data(ctx: ffi.CData, name: ffi.CData, data: ffi.CData, size: int) -> None: + """ GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);""" + ... + def gguf_set_tensor_type(ctx: ffi.CData, name: ffi.CData, type: int) -> None: + """ GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);""" + ... + def gguf_set_val_bool(ctx: ffi.CData, key: ffi.CData, val: bool) -> None: + """ GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);""" + ... + def gguf_set_val_f32(ctx: ffi.CData, key: ffi.CData, val: float) -> None: + """ GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);""" + ... + def gguf_set_val_i16(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);""" + ... 
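A hedged sketch of walking a GGUF file's metadata with the readers above. The path is a placeholder, `gguf_init_params` is assumed to carry the `no_alloc`/`ctx` fields of this ggml revision, and the import path is the same assumption as before.

from ggml import ffi, lib  # assumed import path for these generated bindings

params = ffi.new("struct gguf_init_params*")
params.no_alloc = True        # metadata only; do not allocate tensor data
params.ctx = ffi.NULL

g = lib.gguf_init_from_file(b"model.gguf", params[0])   # placeholder path
if g == ffi.NULL:
    raise RuntimeError("failed to open model.gguf")

print("gguf version:", lib.gguf_get_version(g))
for i in range(lib.gguf_get_n_kv(g)):
    key = ffi.string(lib.gguf_get_key(g, i)).decode()
    print(" kv", i, key, "type", lib.gguf_get_kv_type(g, i))
for i in range(lib.gguf_get_n_tensors(g)):
    name = ffi.string(lib.gguf_get_tensor_name(g, i)).decode()
    print(" tensor", name, "at offset", lib.gguf_get_tensor_offset(g, i))

lib.gguf_free(g)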
+ def gguf_set_val_i32(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);""" + ... + def gguf_set_val_i8(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);""" + ... + def gguf_set_val_str(ctx: ffi.CData, key: ffi.CData, val: ffi.CData) -> None: + """ GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);""" + ... + def gguf_set_val_u16(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);""" + ... + def gguf_set_val_u32(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);""" + ... + def gguf_set_val_u8(ctx: ffi.CData, key: ffi.CData, val: int) -> None: + """ + overrides existing values or adds a new one + + GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); + """ + ... + def gguf_type_name(type: int) -> ffi.CData: + """ GGML_API const char * gguf_type_name(enum gguf_type type);""" + ... + def gguf_write_to_file(ctx: ffi.CData, fname: ffi.CData, only_meta: bool) -> None: + """ + write the entire context to a binary file + + GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta); + """ + ... + def quantize_row_q2_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);""" + ... + def quantize_row_q2_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """ + Quantization + + void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); + """ + ... + def quantize_row_q3_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);""" + ... + def quantize_row_q3_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);""" + ... + def quantize_row_q4_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);""" + ... + def quantize_row_q4_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);""" + ... + def quantize_row_q5_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);""" + ... + def quantize_row_q5_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);""" + ... + def quantize_row_q6_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);""" + ... + def quantize_row_q6_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);""" + ... + def quantize_row_q8_K(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);""" + ... 
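And the write side of the same API, as a hedged sketch: an empty gguf context, a couple of made-up key/value pairs, and `gguf_write_to_file` with `only_meta=True` so no tensor data is required.

from ggml import lib  # assumed import path for these generated bindings

g = lib.gguf_init_empty()
lib.gguf_set_val_str(g, b"general.name", b"demo")     # keys here are purely illustrative
lib.gguf_set_val_u32(g, b"demo.block_count", 12)
lib.gguf_write_to_file(g, b"demo-meta.gguf", True)    # only_meta=True: header + kv pairs only
lib.gguf_free(g)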
+ def quantize_row_q8_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None: + """void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);""" + ... \ No newline at end of file diff --git a/seamless_communication/ggml/examples/python/ggml/cffi.py b/seamless_communication/ggml/examples/python/ggml/cffi.py new file mode 100644 index 0000000..7b65ff6 --- /dev/null +++ b/seamless_communication/ggml/examples/python/ggml/cffi.py @@ -0,0 +1,11 @@ +# auto-generated file +import _cffi_backend + +ffi = _cffi_backend.FFI('ggml.cffi', + _version = 0x2601, + _types = b'\x00\x00\xB6\x0D\x00\x00\x09\x0B\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x2F\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x31\x03\x00\x04\x3D\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x32\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x34\x03\x00\x03\xFE\x03\x00\x04\x53\x03\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x3D\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x3E\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x00\x0F\x00\x02\xD0\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x04\x0B\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x0B\x0B\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x16\x0D\x00\x00\x0B\x11\x00\x04\x38\x03\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x16\x0D\x00\x00\x0B\x11\x00\x00\x44\x11\x00\x00\x08\x11\x00\x04\x30\x03\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x04\x16\x0D\x00\x00\x0B\x11\x00\x00\x20\x09\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x01\x0D\x00\x00\x01\x0B\x00\x00\x00\x0F\x00\x01\x14\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x34\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x02\x7E\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x06\x01\x00\x00\x00\x0F\x00\x04\x18\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x02\xE9\x0D\x00\x00\x0E\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x4B\x11\x00\x04\x33\x03\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x0E\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x04\x35\x03\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x00\x0F\x00\x00\xDB\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\xDB\x0D\x00\x00\x00\x0F\x00\x03\xB0\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x03\xB5\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x04\x0D\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x04\x0D\x00\x00\x10\x11\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x4B\x0D\x00\x00\x0B\x11\x00\x00\x00\x0F\x00\x00\x4B\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x30\x0D\x00\x00\x0F\x11\x00\x00\x0B\x03\x00\x00\xB0\x11\x00\x00\x00\x0F\x00\x04\x30\x0D\x00\x00\x0B\x11\x00\x00\x4B\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x30
\x0D\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x0B\x0D\x00\x00\x1B\x09\x00\x00\x00\x0F\x00\x04\x33\x0D\x00\x00\x4B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x0E\x0D\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x7F\x0D\x00\x00\x00\x0F\x00\x00\x50\x0D\x00\x00\x07\x0B\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x4B\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x07\x01\x00\x00\xDB\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x05\x0B\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x01\x01\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0A\x0B\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0D\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x0D\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x0
1\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x5C\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x62\x03\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x02\xD8\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x4F\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x54\x03\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x02\xD3\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x03\x44\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x03\x48\x03\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x0F\x11\x00\x00\x01\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x21\x0D\x00\x00\x0F\x11\x00\x00\x24\x09\x00\x00\x00\x0F\x00\x00\x21\x0D\x00\x00\x00\x0F\x00\x03\xBA\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x03\xBF\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x01\x11\x00\x00\xF4\x03\x00\x00\x10\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\xDB\x03\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x02\x35\x11\x00\x00\x10\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x02\x39\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x04\x11\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x0B\x11\x00\x00\x21\x09\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x04\x32\x03\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x00\x0F\x00\x00\x6C\x0D\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x6C\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x02\x4B\x11\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x00\x0A\x01\x00\x00\x00\x0F\x
00\x02\xE1\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xF8\x03\x00\x00\xF4\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xF9\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFA\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFB\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFC\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFD\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0F\x11\x00\x00\x0F\x11\x00\x00\x07\x01\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xF8\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xF9\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFA\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFB\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFC\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFD\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x00\x6C\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x00\x10\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x03\xFE\x03\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x02\x7E\x11\x00\x02\x35\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x02\x7E\x11\x00\x02\x35\x11\x00\x02\x35\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x02\x7E\x11\x00\x04\x53\x03\x00\x02\xE1\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x04\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x04\x11\x00\x00\x22\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x04\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x4B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x04\x30\x03\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\xF8\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\xF8\x11\x00\x02\xF8\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x4B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x44\x11\x00\x00\x50\x11\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x4B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x4B\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x7F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x7F\x11\x00\x02\xE9\x11\x00\x02\xE9\x11\x00\x02\xE9\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x7F\x11\x00\x00\x4B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x04\x37\x03\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\
x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x10\x11\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x0F\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x34\x11\x00\x02\xE1\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x05\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x03\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x04\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x08\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x06\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x02\xE1\x11\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x6C\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x10\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\xE1\x11\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x00\x0F\x00\x00\x24\x03\x00\x00\x0D\x09\x00\x00\x0E\x09\x00\x00\x0F\x09\x00\x00\x10\x09\x00\x00\x11\x09\x00\x00\x12\x09\x00\x00\x13\x09\x00\x00\x14\x09\x00\x00\x04\x09\x00\x00\x05\x09\x00\x00\x06\x09\x00\x00\x07\x09\x00\x00\x08\x09\x00\x00\x09\x09\x00\x00\x0A\x09\x00\x00\x02\x01\x00\x03\xFE\x05\x00\x00\x00\x80\x00\x03\xFE\x05\x00\x00\x00\x10\x00\x03\xFE\x05\x00\x00\x00\xC0\x00\x03\xFE\x05\x00\x00\x00\x25\x00\x03\xFE\x05\x00\x00\x00\x28\x00\x03\xFE\x05\x00\x00\x00\x04\x00\x03\xFE\x05\x00\x00\x00\x38\x00\x03\xFE\x05\x00\x00\x00\x40\x00\x03\xFE\x05\x00\x00\x1F\xF0\x00\x03\xFE\x05\x00\x00\x00\x08\x00\x00\x00\x0B\x00\x00\x02\x0B\x00\x00\x03\x0B\x00\x00\x06\x0B\x00\x00\x08\x0B\x00\x00\x0B\x09\x00\x00\x22\x05\x00\x00\x10\x00\x00\x00\x22\x05\x00\x00\x00\x08\x00\x00\x0F\x01\x00\x00\xDB\x05\x00\x00\x00\x04\x00\x00\x09\x01\x00\x03\xB0\x05\x00\x00\x00\x10\x00\x03\xB5\x05\x00\x00\x00\x10\x00\x03\xB5\x05\x00\x00\x01\x00\x00\x00\x00\x09\x00\x00\x01\x09\x00\x00\x02\x09\x00\x00\x03\x09\x00\x04\x2C\x03\x00\x00\x0C\x09\x00\x04\x2E\x03\x00\x00\x15\x09\x00\x00\x16\x09\x00\x00\x17\x09\x00\x00
\x18\x09\x00\x00\x19\x09\x00\x00\x1A\x09\x00\x00\x1C\x09\x00\x00\x1D\x09\x00\x04\x37\x03\x00\x00\x1E\x09\x00\x00\x1F\x09\x00\x00\x08\x05\x00\x00\x10\x00\x00\x00\x08\x05\x00\x00\x00\x06\x00\x00\x22\x09\x00\x00\x23\x09\x00\x03\xBA\x03\x00\x03\xBA\x05\x00\x00\x00\x80\x00\x03\xBA\x05\x00\x00\x00\x0C\x00\x03\xBA\x05\x00\x00\x00\x10\x00\x03\xBA\x05\x00\x00\x00\x20\x00\x03\xBA\x05\x00\x00\x00\x40\x00\x00\x0C\x01\x00\x00\x11\x05\x00\x00\x00\x04\x00\x00\x10\x05\x00\x00\x20\x51\x00\x02\xC6\x03\x00\x02\xDE\x03\x00\x03\xE0\x03\x00\x03\xE7\x03\x00\x00\x00\x01', + _globals = (b'\xFF\xFF\xFF\x0BGGML_BACKEND_CPU',0,b'\xFF\xFF\xFF\x0BGGML_BACKEND_GPU',10,b'\xFF\xFF\xFF\x0BGGML_BACKEND_GPU_SPLIT',20,b'\xFF\xFF\xFF\x0BGGML_FTYPE_ALL_F32',0,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_F16',1,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q2_K',10,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q3_K',11,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_0',2,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_1',3,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_1_SOME_F16',4,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_K',12,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q5_0',8,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q5_1',9,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q5_K',13,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q6_K',14,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q8_0',7,b'\xFF\xFF\xFF\x0BGGML_FTYPE_UNKNOWN',-1,b'\xFF\xFF\xFF\x1FGGML_GRAPH_SIZE',164520,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_BACKTRACKING_ARMIJO',0,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE',2,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_BACKTRACKING_WOLFE',1,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_DEFAULT',1,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_FAIL',-128,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_INVALID_PARAMETERS',-124,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_MAXIMUM_ITERATIONS',-125,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_MAXIMUM_STEP',-126,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_MINIMUM_STEP',-127,b'\xFF\xFF\xFF\x0BGGML_OBJECT_GRAPH',1,b'\xFF\xFF\xFF\x1FGGML_OBJECT_SIZE',32,b'\xFF\xFF\xFF\x0BGGML_OBJECT_TENSOR',0,b'\xFF\xFF\xFF\x0BGGML_OBJECT_WORK_BUFFER',2,b'\xFF\xFF\xFF\x0BGGML_OPT_ADAM',0,b'\xFF\xFF\xFF\x0BGGML_OPT_DID_NOT_CONVERGE',1,b'\xFF\xFF\xFF\x0BGGML_OPT_FAIL',4,b'\xFF\xFF\xFF\x0BGGML_OPT_INVALID_WOLFE',3,b'\xFF\xFF\xFF\x0BGGML_OPT_LBFGS',1,b'\xFF\xFF\xFF\x0BGGML_OPT_NO_CONTEXT',2,b'\xFF\xFF\xFF\x0BGGML_OPT_OK',0,b'\xFF\xFF\xFF\x0BGGML_OP_ACC',4,b'\xFF\xFF\xFF\x0BGGML_OP_ADD',2,b'\xFF\xFF\xFF\x0BGGML_OP_ADD1',3,b'\xFF\xFF\xFF\x0BGGML_OP_ALIBI',40,b'\xFF\xFF\xFF\x0BGGML_OP_ARGMAX',14,b'\xFF\xFF\xFF\x0BGGML_OP_CLAMP',41,b'\xFF\xFF\xFF\x0BGGML_OP_CONT',26,b'\xFF\xFF\xFF\x0BGGML_OP_CONV_1D',42,b'\xFF\xFF\xFF\x0BGGML_OP_CONV_2D',43,b'\xFF\xFF\xFF\x0BGGML_OP_COUNT',62,b'\xFF\xFF\xFF\x0BGGML_OP_CPY',25,b'\xFF\xFF\xFF\x0BGGML_OP_CROSS_ENTROPY_LOSS',60,b'\xFF\xFF\xFF\x0BGGML_OP_CROSS_ENTROPY_LOSS_BACK',61,b'\xFF\xFF\xFF\x0BGGML_OP_DIAG',33,b'\xFF\xFF\xFF\x0BGGML_OP_DIAG_MASK_INF',34,b'\xFF\xFF\xFF\x0BGGML_OP_DIAG_MASK_ZERO',35,b'\xFF\xFF\xFF\x0BGGML_OP_DIV',7,b'\xFF\xFF\xFF\x0BGGML_OP_DUP',1,b'\xFF\xFF\xFF\x0BGGML_OP_FLASH_ATTN',46,b'\xFF\xFF\xFF\x0BGGML_OP_FLASH_ATTN_BACK',48,b'\xFF\xFF\xFF\x0BGGML_OP_FLASH_FF',47,b'\xFF\xFF\xFF\x0BGGML_OP_GET_ROWS',31,b'\xFF\xFF\xFF\x0BGGML_OP_GET_ROWS_BACK',32,b'\xFF\xFF\xFF\x0BGGML_OP_LOG',10,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_BINARY',53,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM1',57,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM1_F32',54,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM2',58,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM2_F32',55,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM3',59,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM3_F32',56,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_UNARY',52,b'\xF
F\xFF\xFF\x0BGGML_OP_MEAN',13,b'\xFF\xFF\xFF\x0BGGML_OP_MUL',6,b'\xFF\xFF\xFF\x0BGGML_OP_MUL_MAT',21,b'\xFF\xFF\xFF\x0BGGML_OP_NONE',0,b'\xFF\xFF\xFF\x0BGGML_OP_NORM',18,b'\xFF\xFF\xFF\x0BGGML_OP_OUT_PROD',22,b'\xFF\xFF\xFF\x0BGGML_OP_PERMUTE',29,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_1D',44,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_2D',45,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_AVG',1,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_COUNT',2,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_MAX',0,b'\xFF\xFF\xFF\x0BGGML_OP_REPEAT',15,b'\xFF\xFF\xFF\x0BGGML_OP_REPEAT_BACK',16,b'\xFF\xFF\xFF\x0BGGML_OP_RESHAPE',27,b'\xFF\xFF\xFF\x0BGGML_OP_RMS_NORM',19,b'\xFF\xFF\xFF\x0BGGML_OP_RMS_NORM_BACK',20,b'\xFF\xFF\xFF\x0BGGML_OP_ROPE',38,b'\xFF\xFF\xFF\x0BGGML_OP_ROPE_BACK',39,b'\xFF\xFF\xFF\x0BGGML_OP_SCALE',23,b'\xFF\xFF\xFF\x0BGGML_OP_SET',24,b'\xFF\xFF\xFF\x0BGGML_OP_SILU_BACK',17,b'\xFF\xFF\xFF\x0BGGML_OP_SOFT_MAX',36,b'\xFF\xFF\xFF\x0BGGML_OP_SOFT_MAX_BACK',37,b'\xFF\xFF\xFF\x0BGGML_OP_SQR',8,b'\xFF\xFF\xFF\x0BGGML_OP_SQRT',9,b'\xFF\xFF\xFF\x0BGGML_OP_SUB',5,b'\xFF\xFF\xFF\x0BGGML_OP_SUM',11,b'\xFF\xFF\xFF\x0BGGML_OP_SUM_ROWS',12,b'\xFF\xFF\xFF\x0BGGML_OP_TRANSPOSE',30,b'\xFF\xFF\xFF\x0BGGML_OP_UNARY',51,b'\xFF\xFF\xFF\x0BGGML_OP_VIEW',28,b'\xFF\xFF\xFF\x0BGGML_OP_WIN_PART',49,b'\xFF\xFF\xFF\x0BGGML_OP_WIN_UNPART',50,b'\xFF\xFF\xFF\x0BGGML_TASK_COMPUTE',1,b'\xFF\xFF\xFF\x0BGGML_TASK_FINALIZE',2,b'\xFF\xFF\xFF\x0BGGML_TASK_INIT',0,b'\xFF\xFF\xFF\x1FGGML_TENSOR_SIZE',288,b'\xFF\xFF\xFF\x0BGGML_TYPE_COUNT',19,b'\xFF\xFF\xFF\x0BGGML_TYPE_F16',1,b'\xFF\xFF\xFF\x0BGGML_TYPE_F32',0,b'\xFF\xFF\xFF\x0BGGML_TYPE_I16',17,b'\xFF\xFF\xFF\x0BGGML_TYPE_I32',18,b'\xFF\xFF\xFF\x0BGGML_TYPE_I8',16,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q2_K',10,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q3_K',11,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q4_0',2,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q4_1',3,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q4_K',12,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q5_0',6,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q5_1',7,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q5_K',13,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q6_K',14,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q8_0',8,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q8_1',9,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q8_K',15,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_ABS',0,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_ELU',5,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_GELU',7,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_GELU_QUICK',8,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_NEG',2,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_RELU',6,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_SGN',1,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_SILU',9,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_STEP',3,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_TANH',4,b'\xFF\xFF\xFF\x0BGGUF_TYPE_ARRAY',9,b'\xFF\xFF\xFF\x0BGGUF_TYPE_BOOL',7,b'\xFF\xFF\xFF\x0BGGUF_TYPE_COUNT',10,b'\xFF\xFF\xFF\x0BGGUF_TYPE_FLOAT32',6,b'\xFF\xFF\xFF\x0BGGUF_TYPE_INT16',3,b'\xFF\xFF\xFF\x0BGGUF_TYPE_INT32',5,b'\xFF\xFF\xFF\x0BGGUF_TYPE_INT8',1,b'\xFF\xFF\xFF\x0BGGUF_TYPE_STRING',8,b'\xFF\xFF\xFF\x0BGGUF_TYPE_UINT16',2,b'\xFF\xFF\xFF\x0BGGUF_TYPE_UINT32',4,b'\xFF\xFF\xFF\x0BGGUF_TYPE_UINT8',0,b'\x00\x02\x9A\x23__assert_rtn',0,b'\x00\x02\x7C\x23dequantize_row_q2_K',0,b'\x00\x02\x81\x23dequantize_row_q3_K',0,b'\x00\x02\x86\x23dequantize_row_q4_K',0,b'\x00\x02\x8B\x23dequantize_row_q5_K',0,b'\x00\x02\x90\x23dequantize_row_q6_K',0,b'\x00\x02\x95\x23dequantize_row_q8_K',0,b'\x00\x00\xFA\x23ggml_abs',0,b'\x00\x00\xFA\x23ggml_abs_inplace',0,b'\x00\x01\xDD\x23ggml_acc',0,b'\x00\x01\xDD\x23ggml_acc_inplace',0,b'\x00\x01\x84\x23ggml_add',0,b'\x00\x01\x84\x23ggml_add1',0,b'\x00\x01\x84\x23ggml_add1_inplace',0,b'\x00\x01\x84\x23ggml_add_inplace',0,b'\x00\x01\x26\x23ggml_alibi',0,b'\x00\x02\xEC\x23ggml_allocr_alloc',0,b'\x00\x02\x42\x23ggml_allocr_alloc_gr
aph',0,b'\x00\x02\xE4\x23ggml_allocr_free',0,b'\x00\x00\x03\x23ggml_allocr_is_measure',0,b'\x00\x00\xA2\x23ggml_allocr_new',0,b'\x00\x00\x9F\x23ggml_allocr_new_measure',0,b'\x00\x02\xE4\x23ggml_allocr_reset',0,b'\x00\x02\xE7\x23ggml_allocr_set_parse_seq',0,b'\x00\x00\x17\x23ggml_are_same_shape',0,b'\x00\x00\xFA\x23ggml_argmax',0,b'\x00\x00\x74\x23ggml_blck_size',0,b'\x00\x00\xB3\x23ggml_build_backward',0,b'\x00\x00\xB8\x23ggml_build_forward',0,b'\x00\x00\xAA\x23ggml_build_forward_ctx',0,b'\x00\x02\xF3\x23ggml_build_forward_expand',0,b'\x00\x00\x1B\x23ggml_cl_can_mul_mat',0,b'\x00\x03\x6B\x23ggml_cl_free_data',0,b'\x00\x03\xE0\x23ggml_cl_host_free',0,b'\x00\x02\x72\x23ggml_cl_host_malloc',0,b'\x00\x03\xEC\x23ggml_cl_init',0,b'\x00\x03\x78\x23ggml_cl_mul',0,b'\x00\x03\x7D\x23ggml_cl_mul_mat',0,b'\x00\x02\x54\x23ggml_cl_mul_mat_get_wsize',0,b'\x00\x03\xE3\x23ggml_cl_transform_tensor',0,b'\x00\x01\x1B\x23ggml_clamp',0,b'\x00\x00\xFA\x23ggml_cont',0,b'\x00\x00\xFA\x23ggml_cont_inplace',0,b'\x00\x01\x90\x23ggml_conv_1d',0,b'\x00\x01\x89\x23ggml_conv_1d_ph',0,b'\x00\x01\x98\x23ggml_conv_2d',0,b'\x00\x00\x90\x23ggml_cpu_has_arm_fma',0,b'\x00\x00\x90\x23ggml_cpu_has_avx',0,b'\x00\x00\x90\x23ggml_cpu_has_avx2',0,b'\x00\x00\x90\x23ggml_cpu_has_avx512',0,b'\x00\x00\x90\x23ggml_cpu_has_avx512_vbmi',0,b'\x00\x00\x90\x23ggml_cpu_has_avx512_vnni',0,b'\x00\x00\x90\x23ggml_cpu_has_blas',0,b'\x00\x00\x90\x23ggml_cpu_has_clblast',0,b'\x00\x00\x90\x23ggml_cpu_has_cublas',0,b'\x00\x00\x90\x23ggml_cpu_has_f16c',0,b'\x00\x00\x90\x23ggml_cpu_has_fma',0,b'\x00\x00\x90\x23ggml_cpu_has_fp16_va',0,b'\x00\x00\x90\x23ggml_cpu_has_gpublas',0,b'\x00\x00\x90\x23ggml_cpu_has_neon',0,b'\x00\x00\x90\x23ggml_cpu_has_sse3',0,b'\x00\x00\x90\x23ggml_cpu_has_vsx',0,b'\x00\x00\x90\x23ggml_cpu_has_wasm_simd',0,b'\x00\x01\x84\x23ggml_cpy',0,b'\x00\x01\x84\x23ggml_cpy_inplace',0,b'\x00\x01\x84\x23ggml_cross_entropy_loss',0,b'\x00\x01\xA3\x23ggml_cross_entropy_loss_back',0,b'\x00\x03\x41\x23ggml_cuda_assign_buffers',0,b'\x00\x03\x41\x23ggml_cuda_assign_buffers_force_inplace',0,b'\x00\x03\x41\x23ggml_cuda_assign_buffers_no_scratch',0,b'\x00\x00\x1B\x23ggml_cuda_can_mul_mat',0,b'\x00\x00\x06\x23ggml_cuda_compute_forward',0,b'\x00\x03\x41\x23ggml_cuda_free_data',0,b'\x00\x03\xEC\x23ggml_cuda_free_scratch',0,b'\x00\x00\x90\x23ggml_cuda_get_device_count',0,b'\x00\x02\xCE\x23ggml_cuda_get_device_description',0,b'\x00\x03\xE0\x23ggml_cuda_host_free',0,b'\x00\x02\x72\x23ggml_cuda_host_malloc',0,b'\x00\x02\xCB\x23ggml_cuda_set_main_device',0,b'\x00\x02\x79\x23ggml_cuda_set_mul_mat_q',0,b'\x00\x03\xD8\x23ggml_cuda_set_scratch_size',0,b'\x00\x02\xA0\x23ggml_cuda_set_tensor_split',0,b'\x00\x03\xE3\x23ggml_cuda_transform_tensor',0,b'\x00\x00\x95\x23ggml_cycles',0,b'\x00\x00\x95\x23ggml_cycles_per_ms',0,b'\x00\x00\xFA\x23ggml_diag',0,b'\x00\x01\x21\x23ggml_diag_mask_inf',0,b'\x00\x01\x21\x23ggml_diag_mask_inf_inplace',0,b'\x00\x01\x21\x23ggml_diag_mask_zero',0,b'\x00\x01\x21\x23ggml_diag_mask_zero_inplace',0,b'\x00\x01\x84\x23ggml_div',0,b'\x00\x01\x84\x23ggml_div_inplace',0,b'\x00\x00\xFA\x23ggml_dup',0,b'\x00\x00\xFA\x23ggml_dup_inplace',0,b'\x00\x02\x0B\x23ggml_dup_tensor',0,b'\x00\x02\x4D\x23ggml_element_size',0,b'\x00\x00\xFA\x23ggml_elu',0,b'\x00\x00\xFA\x23ggml_elu_inplace',0,b'\x00\x01\xA9\x23ggml_flash_attn',0,b'\x00\x01\xB0\x23ggml_flash_attn_back',0,b'\x00\x01\xB8\x23ggml_flash_ff',0,b'\x00\x02\x16\x23ggml_format_name',0,b'\x00\x00\x6B\x23ggml_fp16_to_fp32',0,b'\x00\x03\xDB\x23ggml_fp16_to_fp32_row',0,b'\x00\x02\x62\x23ggml_fp32_to_fp16',0
,b'\x00\x02\xC1\x23ggml_fp32_to_fp16_row',0,b'\x00\x03\x03\x23ggml_free',0,b'\x00\x00\x53\x23ggml_ftype_to_ggml_type',0,b'\x00\x00\xFA\x23ggml_gelu',0,b'\x00\x00\xFA\x23ggml_gelu_inplace',0,b'\x00\x00\xFA\x23ggml_gelu_quick',0,b'\x00\x00\xFA\x23ggml_gelu_quick_inplace',0,b'\x00\x02\x6C\x23ggml_get_data',0,b'\x00\x00\x5D\x23ggml_get_data_f32',0,b'\x00\x00\x63\x23ggml_get_f32_1d',0,b'\x00\x00\x81\x23ggml_get_i32_1d',0,b'\x00\x02\x4A\x23ggml_get_max_tensor_size',0,b'\x00\x02\x69\x23ggml_get_mem_buffer',0,b'\x00\x02\x4A\x23ggml_get_mem_size',0,b'\x00\x00\x36\x23ggml_get_name',0,b'\x00\x00\x0A\x23ggml_get_no_alloc',0,b'\x00\x01\x84\x23ggml_get_rows',0,b'\x00\x01\xA3\x23ggml_get_rows_back',0,b'\x00\x00\xCE\x23ggml_get_tensor',0,b'\x00\x00\x56\x23ggml_get_unary_op',0,b'\x00\x00\x77\x23ggml_graph_compute',0,b'\x00\x03\x0A\x23ggml_graph_compute_with_ctx',0,b'\x00\x02\xFE\x23ggml_graph_dump_dot',0,b'\x00\x02\xFA\x23ggml_graph_export',0,b'\x00\x00\xCA\x23ggml_graph_get_tensor',0,b'\x00\x00\xAE\x23ggml_graph_import',0,b'\x00\x02\x60\x23ggml_graph_overhead',0,b'\x00\x00\xBE\x23ggml_graph_plan',0,b'\x00\x02\xF7\x23ggml_graph_print',0,b'\x00\x02\xF0\x23ggml_graph_reset',0,b'\x00\x00\xBB\x23ggml_init',0,b'\x00\x03\xEC\x23ggml_init_cublas',0,b'\x00\x00\x6E\x23ggml_internal_get_type_traits',0,b'\x00\x00\x14\x23ggml_is_contiguous',0,b'\x00\x00\x27\x23ggml_is_numa',0,b'\x00\x00\x14\x23ggml_is_permuted',0,b'\x00\x00\x00\x23ggml_is_quantized',0,b'\x00\x00\x14\x23ggml_is_transposed',0,b'\x00\x00\xFA\x23ggml_log',0,b'\x00\x00\xFA\x23ggml_log_inplace',0,b'\x00\x01\xE6\x23ggml_map_binary_f32',0,b'\x00\x01\xE6\x23ggml_map_binary_inplace_f32',0,b'\x00\x02\x04\x23ggml_map_custom1',0,b'\x00\x01\xFF\x23ggml_map_custom1_f32',0,b'\x00\x02\x04\x23ggml_map_custom1_inplace',0,b'\x00\x01\xFF\x23ggml_map_custom1_inplace_f32',0,b'\x00\x01\xF2\x23ggml_map_custom2',0,b'\x00\x01\xEC\x23ggml_map_custom2_f32',0,b'\x00\x01\xF2\x23ggml_map_custom2_inplace',0,b'\x00\x01\xEC\x23ggml_map_custom2_inplace_f32',0,b'\x00\x01\xC7\x23ggml_map_custom3',0,b'\x00\x01\xC0\x23ggml_map_custom3_f32',0,b'\x00\x01\xC7\x23ggml_map_custom3_inplace',0,b'\x00\x01\xC0\x23ggml_map_custom3_inplace_f32',0,b'\x00\x01\xFA\x23ggml_map_unary_f32',0,b'\x00\x01\xFA\x23ggml_map_unary_inplace_f32',0,b'\x00\x00\xFA\x23ggml_mean',0,b'\x00\x00\x0D\x23ggml_metal_add_buffer',0,b'\x00\x03\x1C\x23ggml_metal_free',0,b'\x00\x00\x71\x23ggml_metal_get_concur_list',0,b'\x00\x03\x2C\x23ggml_metal_get_tensor',0,b'\x00\x03\x23\x23ggml_metal_graph_compute',0,b'\x00\x03\x27\x23ggml_metal_graph_find_concurrency',0,b'\x00\x03\xE0\x23ggml_metal_host_free',0,b'\x00\x02\x72\x23ggml_metal_host_malloc',0,b'\x00\x00\x7B\x23ggml_metal_if_optimized',0,b'\x00\x00\xC2\x23ggml_metal_init',0,b'\x00\x03\x1F\x23ggml_metal_set_n_cb',0,b'\x00\x03\x2C\x23ggml_metal_set_tensor',0,b'\x00\x03\xEC\x23ggml_mpi_backend_free',0,b'\x00\x03\xEC\x23ggml_mpi_backend_init',0,b'\x00\x03\x33\x23ggml_mpi_eval_init',0,b'\x00\x03\x30\x23ggml_mpi_free',0,b'\x00\x03\x39\x23ggml_mpi_graph_compute_post',0,b'\x00\x03\x39\x23ggml_mpi_graph_compute_pre',0,b'\x00\x00\xC5\x23ggml_mpi_init',0,b'\x00\x00\x7E\x23ggml_mpi_rank',0,b'\x00\x01\x84\x23ggml_mul',0,b'\x00\x01\x84\x23ggml_mul_inplace',0,b'\x00\x01\x84\x23ggml_mul_mat',0,b'\x00\x02\x4D\x23ggml_nbytes',0,b'\x00\x02\x4D\x23ggml_nbytes_pad',0,b'\x00\x02\x50\x23ggml_nbytes_split',0,b'\x00\x00\xFA\x23ggml_neg',0,b'\x00\x00\xFA\x23ggml_neg_inplace',0,b'\x00\x00\x92\x23ggml_nelements',0,b'\x00\x00\xF2\x23ggml_new_f32',0,b'\x00\x00\xA7\x23ggml_new_graph',0,b'\x00\x00\xF6\x23ggml_new
_i32',0,b'\x00\x00\xD2\x23ggml_new_tensor',0,b'\x00\x00\xD8\x23ggml_new_tensor_1d',0,b'\x00\x00\xDD\x23ggml_new_tensor_2d',0,b'\x00\x00\xE3\x23ggml_new_tensor_3d',0,b'\x00\x00\xEA\x23ggml_new_tensor_4d',0,b'\x00\x00\xFA\x23ggml_norm',0,b'\x00\x00\xFA\x23ggml_norm_inplace',0,b'\x00\x00\x92\x23ggml_nrows',0,b'\x00\x03\xEC\x23ggml_numa_init',0,b'\x00\x00\x2D\x23ggml_op_name',0,b'\x00\x00\x2D\x23ggml_op_symbol',0,b'\x00\x00\x4E\x23ggml_opt',0,b'\x00\x00\xC7\x23ggml_opt_default_params',0,b'\x00\x03\x0F\x23ggml_opt_init',0,b'\x00\x00\x42\x23ggml_opt_resume',0,b'\x00\x00\x47\x23ggml_opt_resume_g',0,b'\x00\x01\x84\x23ggml_out_prod',0,b'\x00\x01\x34\x23ggml_permute',0,b'\x00\x00\xFE\x23ggml_pool_1d',0,b'\x00\x01\x06\x23ggml_pool_2d',0,b'\x00\x03\x3E\x23ggml_print_object',0,b'\x00\x03\x19\x23ggml_print_objects',0,b'\x00\x02\x33\x23ggml_quantize_chunk',0,b'\x00\x02\x3B\x23ggml_quantize_q2_K',0,b'\x00\x02\x3B\x23ggml_quantize_q3_K',0,b'\x00\x02\x3B\x23ggml_quantize_q4_0',0,b'\x00\x02\x3B\x23ggml_quantize_q4_1',0,b'\x00\x02\x3B\x23ggml_quantize_q4_K',0,b'\x00\x02\x3B\x23ggml_quantize_q5_0',0,b'\x00\x02\x3B\x23ggml_quantize_q5_1',0,b'\x00\x02\x3B\x23ggml_quantize_q5_K',0,b'\x00\x02\x3B\x23ggml_quantize_q6_K',0,b'\x00\x02\x3B\x23ggml_quantize_q8_0',0,b'\x00\x00\xFA\x23ggml_relu',0,b'\x00\x00\xFA\x23ggml_relu_inplace',0,b'\x00\x01\x84\x23ggml_repeat',0,b'\x00\x01\x84\x23ggml_repeat_back',0,b'\x00\x01\x84\x23ggml_reshape',0,b'\x00\x01\x46\x23ggml_reshape_1d',0,b'\x00\x01\x4B\x23ggml_reshape_2d',0,b'\x00\x01\x51\x23ggml_reshape_3d',0,b'\x00\x01\x58\x23ggml_reshape_4d',0,b'\x00\x01\x16\x23ggml_rms_norm',0,b'\x00\x01\x84\x23ggml_rms_norm_back',0,b'\x00\x01\x16\x23ggml_rms_norm_inplace',0,b'\x00\x01\x34\x23ggml_rope',0,b'\x00\x01\x34\x23ggml_rope_back',0,b'\x00\x01\x3C\x23ggml_rope_custom',0,b'\x00\x01\x3C\x23ggml_rope_custom_inplace',0,b'\x00\x01\x34\x23ggml_rope_inplace',0,b'\x00\x01\x84\x23ggml_scale',0,b'\x00\x01\x84\x23ggml_scale_inplace',0,b'\x00\x01\xDD\x23ggml_set',0,b'\x00\x01\xD0\x23ggml_set_1d',0,b'\x00\x01\xD0\x23ggml_set_1d_inplace',0,b'\x00\x01\xD6\x23ggml_set_2d',0,b'\x00\x01\xD6\x23ggml_set_2d_inplace',0,b'\x00\x02\x1A\x23ggml_set_f32',0,b'\x00\x03\x6E\x23ggml_set_f32_1d',0,b'\x00\x02\x1E\x23ggml_set_i32',0,b'\x00\x03\x73\x23ggml_set_i32_1d',0,b'\x00\x01\xDD\x23ggml_set_inplace',0,b'\x00\x02\x12\x23ggml_set_name',0,b'\x00\x03\x06\x23ggml_set_no_alloc',0,b'\x00\x03\x15\x23ggml_set_param',0,b'\x00\x02\x46\x23ggml_set_scratch',0,b'\x00\x02\x0F\x23ggml_set_zero',0,b'\x00\x00\xFA\x23ggml_sgn',0,b'\x00\x00\xFA\x23ggml_sgn_inplace',0,b'\x00\x00\xFA\x23ggml_silu',0,b'\x00\x01\x84\x23ggml_silu_back',0,b'\x00\x00\xFA\x23ggml_silu_inplace',0,b'\x00\x00\xFA\x23ggml_soft_max',0,b'\x00\x01\x84\x23ggml_soft_max_back',0,b'\x00\x01\x84\x23ggml_soft_max_back_inplace',0,b'\x00\x00\xFA\x23ggml_soft_max_inplace',0,b'\x00\x00\xFA\x23ggml_sqr',0,b'\x00\x00\xFA\x23ggml_sqr_inplace',0,b'\x00\x00\xFA\x23ggml_sqrt',0,b'\x00\x00\xFA\x23ggml_sqrt_inplace',0,b'\x00\x00\xFA\x23ggml_step',0,b'\x00\x00\xFA\x23ggml_step_inplace',0,b'\x00\x01\x84\x23ggml_sub',0,b'\x00\x01\x84\x23ggml_sub_inplace',0,b'\x00\x00\xFA\x23ggml_sum',0,b'\x00\x00\xFA\x23ggml_sum_rows',0,b'\x00\x00\xFA\x23ggml_tanh',0,b'\x00\x00\xFA\x23ggml_tanh_inplace',0,b'\x00\x02\x60\x23ggml_tensor_overhead',0,b'\x00\x03\xEC\x23ggml_time_init',0,b'\x00\x00\x95\x23ggml_time_ms',0,b'\x00\x00\x95\x23ggml_time_us',0,b'\x00\x00\xFA\x23ggml_transpose',0,b'\x00\x00\x30\x23ggml_type_name',0,b'\x00\x02\x30\x23ggml_type_size',0,b'\x00\x00\x60\x23ggml_type_sizef',0,b'\x00\x01\x1
1\x23ggml_unary',0,b'\x00\x01\x11\x23ggml_unary_inplace',0,b'\x00\x02\x4A\x23ggml_used_mem',0,b'\x00\x02\xDE\x23ggml_vec_dot_q2_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q3_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q4_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q5_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q6_K_q8_K',0,b'\x00\x01\x7E\x23ggml_view_1d',0,b'\x00\x01\x76\x23ggml_view_2d',0,b'\x00\x01\x6C\x23ggml_view_3d',0,b'\x00\x01\x60\x23ggml_view_4d',0,b'\x00\x02\x0B\x23ggml_view_tensor',0,b'\x00\x01\x21\x23ggml_win_part',0,b'\x00\x01\x2D\x23ggml_win_unpart',0,b'\x00\x03\xCC\x23gguf_add_tensor',0,b'\x00\x00\x88\x23gguf_find_key',0,b'\x00\x00\x88\x23gguf_find_tensor',0,b'\x00\x03\x84\x23gguf_free',0,b'\x00\x02\x59\x23gguf_get_alignment',0,b'\x00\x02\x75\x23gguf_get_arr_data',0,b'\x00\x00\x8C\x23gguf_get_arr_n',0,b'\x00\x00\x3D\x23gguf_get_arr_str',0,b'\x00\x00\x59\x23gguf_get_arr_type',0,b'\x00\x02\x6F\x23gguf_get_data',0,b'\x00\x02\x59\x23gguf_get_data_offset',0,b'\x00\x00\x39\x23gguf_get_key',0,b'\x00\x00\x59\x23gguf_get_kv_type',0,b'\x00\x03\xD4\x23gguf_get_meta_data',0,b'\x00\x02\x59\x23gguf_get_meta_size',0,b'\x00\x00\x85\x23gguf_get_n_kv',0,b'\x00\x00\x85\x23gguf_get_n_tensors',0,b'\x00\x00\x29\x23gguf_get_tensor_name',0,b'\x00\x02\x5C\x23gguf_get_tensor_offset',0,b'\x00\x00\x20\x23gguf_get_val_bool',0,b'\x00\x00\x67\x23gguf_get_val_f32',0,b'\x00\x00\x97\x23gguf_get_val_i16',0,b'\x00\x00\x8C\x23gguf_get_val_i32',0,b'\x00\x00\x9B\x23gguf_get_val_i8',0,b'\x00\x00\x39\x23gguf_get_val_str',0,b'\x00\x02\x65\x23gguf_get_val_u16',0,b'\x00\x02\x2C\x23gguf_get_val_u32',0,b'\x00\x02\x28\x23gguf_get_val_u8',0,b'\x00\x00\x85\x23gguf_get_version',0,b'\x00\x02\x26\x23gguf_init_empty',0,b'\x00\x02\x22\x23gguf_init_from_file',0,b'\x00\x03\x9C\x23gguf_set_arr_data',0,b'\x00\x03\x8C\x23gguf_set_arr_str',0,b'\x00\x03\xD0\x23gguf_set_kv',0,b'\x00\x03\xC6\x23gguf_set_tensor_data',0,b'\x00\x03\x97\x23gguf_set_tensor_type',0,b'\x00\x03\x87\x23gguf_set_val_bool',0,b'\x00\x03\xA3\x23gguf_set_val_f32',0,b'\x00\x03\xAD\x23gguf_set_val_i16',0,b'\x00\x03\xA8\x23gguf_set_val_i32',0,b'\x00\x03\xB2\x23gguf_set_val_i8',0,b'\x00\x03\x92\x23gguf_set_val_str',0,b'\x00\x03\xC1\x23gguf_set_val_u16',0,b'\x00\x03\xBC\x23gguf_set_val_u32',0,b'\x00\x03\xB7\x23gguf_set_val_u8',0,b'\x00\x00\x33\x23gguf_type_name',0,b'\x00\x03\x87\x23gguf_write_to_file',0,b'\x00\x02\xC6\x23quantize_row_q2_K',0,b'\x00\x02\xA3\x23quantize_row_q2_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q3_K',0,b'\x00\x02\xA8\x23quantize_row_q3_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q4_K',0,b'\x00\x02\xAD\x23quantize_row_q4_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q5_K',0,b'\x00\x02\xB2\x23quantize_row_q5_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q6_K',0,b'\x00\x02\xB7\x23quantize_row_q6_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q8_K',0,b'\x00\x02\xBC\x23quantize_row_q8_K_reference',0), + _struct_unions = 
((b'\x00\x00\x04\x27\x00\x00\x00\x02$1',b'\x00\x00\x22\x11n_iter',b'\x00\x00\xF4\x11sched',b'\x00\x00\xF4\x11decay',b'\x00\x00\xF4\x11alpha',b'\x00\x00\xF4\x11beta1',b'\x00\x00\xF4\x11beta2',b'\x00\x00\xF4\x11eps',b'\x00\x00\xF4\x11eps_f',b'\x00\x00\xF4\x11eps_g'),(b'\x00\x00\x04\x28\x00\x00\x00\x02$2',b'\x00\x00\x22\x11m',b'\x00\x00\x22\x11n_iter',b'\x00\x00\x22\x11max_linesearch',b'\x00\x00\xF4\x11eps',b'\x00\x00\xF4\x11ftol',b'\x00\x00\xF4\x11wolfe',b'\x00\x00\xF4\x11min_step',b'\x00\x00\xF4\x11max_step',b'\x00\x04\x14\x11linesearch'),(b'\x00\x00\x04\x29\x00\x00\x00\x02$3',b'\x00\x00\x08\x11x',b'\x00\x00\x08\x11g1',b'\x00\x00\x08\x11g2',b'\x00\x00\x08\x11m',b'\x00\x00\x08\x11v',b'\x00\x00\x08\x11mh',b'\x00\x00\x08\x11vh',b'\x00\x00\x08\x11pf',b'\x00\x00\xF4\x11fx_best',b'\x00\x00\xF4\x11fx_prev',b'\x00\x00\x22\x11n_no_improvement'),(b'\x00\x00\x04\x2A\x00\x00\x00\x02$4',b'\x00\x00\x08\x11x',b'\x00\x00\x08\x11xp',b'\x00\x00\x08\x11g',b'\x00\x00\x08\x11gp',b'\x00\x00\x08\x11d',b'\x00\x00\x08\x11pf',b'\x00\x00\x08\x11lmal',b'\x00\x00\x08\x11lmys',b'\x00\x00\x08\x11lms',b'\x00\x00\x08\x11lmy',b'\x00\x00\xF4\x11fx_best',b'\x00\x00\xF4\x11step',b'\x00\x00\x22\x11j',b'\x00\x00\x22\x11k',b'\x00\x00\x22\x11end',b'\x00\x00\x22\x11n_no_improvement'),(b'\x00\x00\x03\xF7\x00\x00\x00\x03$__mbstate_t',b'\x00\x03\xFF\x11__mbstate8',b'\x00\x00\xDB\x11_mbstateL'),(b'\x00\x00\x03\xF8\x00\x00\x00\x02$block_q2_K',b'\x00\x04\x44\x11scales',b'\x00\x04\x48\x11qs',b'\x00\x00\x6C\x11d',b'\x00\x00\x6C\x11dmin'),(b'\x00\x00\x03\xF9\x00\x00\x00\x02$block_q3_K',b'\x00\x04\x46\x11hmask',b'\x00\x04\x48\x11qs',b'\x00\x04\x42\x11scales',b'\x00\x00\x6C\x11d'),(b'\x00\x00\x03\xFA\x00\x00\x00\x02$block_q4_K',b'\x00\x00\x6C\x11d',b'\x00\x00\x6C\x11dmin',b'\x00\x04\x42\x11scales',b'\x00\x04\x40\x11qs'),(b'\x00\x00\x03\xFB\x00\x00\x00\x02$block_q5_K',b'\x00\x00\x6C\x11d',b'\x00\x00\x6C\x11dmin',b'\x00\x04\x42\x11scales',b'\x00\x04\x46\x11qh',b'\x00\x04\x40\x11qs'),(b'\x00\x00\x03\xFC\x00\x00\x00\x02$block_q6_K',b'\x00\x04\x40\x11ql',b'\x00\x04\x48\x11qh',b'\x00\x04\x23\x11scales',b'\x00\x00\x6C\x11d'),(b'\x00\x00\x03\xFD\x00\x00\x00\x02$block_q8_K',b'\x00\x00\xF4\x11d',b'\x00\x04\x25\x11qs',b'\x00\x04\x21\x11bsums'),(b'\x00\x00\x04\x18\x00\x00\x00\x02$ggml_type_traits_t',b'\x00\x00\x0F\x11type_name',b'\x00\x00\x22\x11blck_size',b'\x00\x00\x11\x11type_size',b'\x00\x00\xB6\x11is_quantized',b'\x00\x04\x52\x11to_float',b'\x00\x04\x4F\x11from_float',b'\x00\x04\x4F\x11from_float_reference',b'\x00\x04\x50\x11vec_dot',b'\x00\x00\x01\x11vec_dot_type'),(b'\x00\x00\x04\x2C\x00\x00\x00\x02__darwin_pthread_handler_rec',b'\x00\x04\x51\x11__routine',b'\x00\x00\x10\x11__arg',b'\x00\x04\x2B\x11__next'),(b'\x00\x00\x03\xEF\x00\x00\x00\x02_opaque_pthread_attr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x0B\x11__opaque'),(b'\x00\x00\x03\xF0\x00\x00\x00\x02_opaque_pthread_cond_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x07\x11__opaque'),(b'\x00\x00\x03\xF1\x00\x00\x00\x02_opaque_pthread_condattr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x11\x11__opaque'),(b'\x00\x00\x03\xF2\x00\x00\x00\x02_opaque_pthread_mutex_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x0B\x11__opaque'),(b'\x00\x00\x03\xF3\x00\x00\x00\x02_opaque_pthread_mutexattr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x11\x11__opaque'),(b'\x00\x00\x03\xF4\x00\x00\x00\x02_opaque_pthread_once_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x11\x11__opaque'),(b'\x00\x00\x03\xF5\x00\x00\x00\x02_opaque_pthread_rwlock_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x03\x11__opaque'),(b'\x00\x00\x03\xF6\x00\x00\x00\x02_opaque_pthread_rwl
ockattr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x01\x11__opaque'),(b'\x00\x00\x04\x2E\x00\x00\x00\x02_opaque_pthread_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x2B\x11__cleanup_stack',b'\x00\x04\x0F\x11__opaque'),(b'\x00\x00\x04\x2F\x00\x00\x00\x10ggml_allocr',),(b'\x00\x00\x04\x30\x00\x00\x00\x02ggml_cgraph',b'\x00\x00\x22\x11n_nodes',b'\x00\x00\x22\x11n_leafs',b'\x00\x04\x39\x11nodes',b'\x00\x04\x39\x11grads',b'\x00\x04\x39\x11leafs',b'\x00\x04\x4D\x11visited_hash_table',b'\x00\x00\x22\x11perf_runs',b'\x00\x00\xDB\x11perf_cycles',b'\x00\x00\xDB\x11perf_time_us'),(b'\x00\x00\x04\x31\x00\x00\x00\x02ggml_compute_params',b'\x00\x04\x17\x11type',b'\x00\x00\x22\x11ith',b'\x00\x00\x22\x11nth',b'\x00\x00\x11\x11wsize',b'\x00\x00\x10\x11wdata'),(b'\x00\x00\x04\x32\x00\x00\x00\x10ggml_context',),(b'\x00\x00\x04\x33\x00\x00\x00\x02ggml_cplan',b'\x00\x00\x11\x11work_size',b'\x00\x04\x3F\x11work_data',b'\x00\x00\x22\x11n_threads',b'\x00\x04\x19\x11n_tasks',b'\x00\x03\xEE\x11abort_callback',b'\x00\x00\x10\x11abort_callback_data'),(b'\x00\x00\x00\xBC\x00\x00\x00\x02ggml_init_params',b'\x00\x00\x11\x11mem_size',b'\x00\x00\x10\x11mem_buffer',b'\x00\x00\xB6\x11no_alloc'),(b'\x00\x00\x04\x34\x00\x00\x00\x10ggml_metal_context',),(b'\x00\x00\x04\x35\x00\x00\x00\x10ggml_mpi_context',),(b'\x00\x00\x04\x37\x00\x00\x00\x02ggml_object',b'\x00\x00\x11\x11offs',b'\x00\x00\x11\x11size',b'\x00\x04\x36\x11next',b'\x00\x04\x15\x11type',b'\x00\x04\x09\x11padding'),(b'\x00\x00\x04\x38\x00\x00\x00\x02ggml_opt_context',b'\x00\x00\x0B\x11ctx',b'\x00\x00\x50\x11params',b'\x00\x00\x22\x11iter',b'\x00\x00\xDB\x11nx',b'\x00\x00\xB6\x11just_initialized',b'\x00\x04\x29\x11adam',b'\x00\x04\x2A\x11lbfgs'),(b'\x00\x00\x00\x50\x00\x00\x00\x02ggml_opt_params',b'\x00\x00\xC8\x11type',b'\x00\x00\x22\x11n_threads',b'\x00\x00\x22\x11past',b'\x00\x00\xF4\x11delta',b'\x00\x00\x22\x11max_no_improvement',b'\x00\x00\xB6\x11print_forward_graph',b'\x00\x00\xB6\x11print_backward_graph',b'\x00\x04\x27\x11adam',b'\x00\x04\x28\x11lbfgs'),(b'\x00\x00\x02\x48\x00\x00\x00\x02ggml_scratch',b'\x00\x00\x11\x11offs',b'\x00\x00\x11\x11size',b'\x00\x00\x10\x11data'),(b'\x00\x00\x04\x3D\x00\x00\x00\x02ggml_tensor',b'\x00\x00\x01\x11type',b'\x00\x04\x13\x11backend',b'\x00\x00\x22\x11n_dims',b'\x00\x04\x1E\x11ne',b'\x00\x04\x4B\x11nb',b'\x00\x00\x2E\x11op',b'\x00\x04\x1B\x11op_params',b'\x00\x00\xB6\x11is_param',b'\x00\x00\x08\x11grad',b'\x00\x04\x3B\x11src',b'\x00\x00\x22\x11perf_runs',b'\x00\x00\xDB\x11perf_cycles',b'\x00\x00\xDB\x11perf_time_us',b'\x00\x00\x10\x11data',b'\x00\x04\x0D\x11name',b'\x00\x00\x10\x11extra',b'\x00\x04\x09\x11padding'),(b'\x00\x00\x04\x3E\x00\x00\x00\x10gguf_context',),(b'\x00\x00\x02\x24\x00\x00\x00\x02gguf_init_params',b'\x00\x00\xB6\x11no_alloc',b'\x00\x00\xB0\x11ctx')), + _enums = 
(b'\x00\x00\x04\x13\x00\x00\x00\x16ggml_backend\x00GGML_BACKEND_CPU,GGML_BACKEND_GPU,GGML_BACKEND_GPU_SPLIT',b'\x00\x00\x00\x54\x00\x00\x00\x15ggml_ftype\x00GGML_FTYPE_UNKNOWN,GGML_FTYPE_ALL_F32,GGML_FTYPE_MOSTLY_F16,GGML_FTYPE_MOSTLY_Q4_0,GGML_FTYPE_MOSTLY_Q4_1,GGML_FTYPE_MOSTLY_Q4_1_SOME_F16,GGML_FTYPE_MOSTLY_Q8_0,GGML_FTYPE_MOSTLY_Q5_0,GGML_FTYPE_MOSTLY_Q5_1,GGML_FTYPE_MOSTLY_Q2_K,GGML_FTYPE_MOSTLY_Q3_K,GGML_FTYPE_MOSTLY_Q4_K,GGML_FTYPE_MOSTLY_Q5_K,GGML_FTYPE_MOSTLY_Q6_K',b'\x00\x00\x04\x14\x00\x00\x00\x16ggml_linesearch\x00GGML_LINESEARCH_DEFAULT,GGML_LINESEARCH_BACKTRACKING_ARMIJO,GGML_LINESEARCH_BACKTRACKING_WOLFE,GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE',b'\x00\x00\x04\x15\x00\x00\x00\x16ggml_object_type\x00GGML_OBJECT_TENSOR,GGML_OBJECT_GRAPH,GGML_OBJECT_WORK_BUFFER',b'\x00\x00\x00\x2E\x00\x00\x00\x16ggml_op\x00GGML_OP_NONE,GGML_OP_DUP,GGML_OP_ADD,GGML_OP_ADD1,GGML_OP_ACC,GGML_OP_SUB,GGML_OP_MUL,GGML_OP_DIV,GGML_OP_SQR,GGML_OP_SQRT,GGML_OP_LOG,GGML_OP_SUM,GGML_OP_SUM_ROWS,GGML_OP_MEAN,GGML_OP_ARGMAX,GGML_OP_REPEAT,GGML_OP_REPEAT_BACK,GGML_OP_SILU_BACK,GGML_OP_NORM,GGML_OP_RMS_NORM,GGML_OP_RMS_NORM_BACK,GGML_OP_MUL_MAT,GGML_OP_OUT_PROD,GGML_OP_SCALE,GGML_OP_SET,GGML_OP_CPY,GGML_OP_CONT,GGML_OP_RESHAPE,GGML_OP_VIEW,GGML_OP_PERMUTE,GGML_OP_TRANSPOSE,GGML_OP_GET_ROWS,GGML_OP_GET_ROWS_BACK,GGML_OP_DIAG,GGML_OP_DIAG_MASK_INF,GGML_OP_DIAG_MASK_ZERO,GGML_OP_SOFT_MAX,GGML_OP_SOFT_MAX_BACK,GGML_OP_ROPE,GGML_OP_ROPE_BACK,GGML_OP_ALIBI,GGML_OP_CLAMP,GGML_OP_CONV_1D,GGML_OP_CONV_2D,GGML_OP_POOL_1D,GGML_OP_POOL_2D,GGML_OP_FLASH_ATTN,GGML_OP_FLASH_FF,GGML_OP_FLASH_ATTN_BACK,GGML_OP_WIN_PART,GGML_OP_WIN_UNPART,GGML_OP_UNARY,GGML_OP_MAP_UNARY,GGML_OP_MAP_BINARY,GGML_OP_MAP_CUSTOM1_F32,GGML_OP_MAP_CUSTOM2_F32,GGML_OP_MAP_CUSTOM3_F32,GGML_OP_MAP_CUSTOM1,GGML_OP_MAP_CUSTOM2,GGML_OP_MAP_CUSTOM3,GGML_OP_CROSS_ENTROPY_LOSS,GGML_OP_CROSS_ENTROPY_LOSS_BACK,GGML_OP_COUNT',b'\x00\x00\x01\x01\x00\x00\x00\x16ggml_op_pool\x00GGML_OP_POOL_MAX,GGML_OP_POOL_AVG,GGML_OP_POOL_COUNT',b'\x00\x00\x04\x16\x00\x00\x00\x15ggml_opt_result\x00GGML_OPT_OK,GGML_OPT_DID_NOT_CONVERGE,GGML_OPT_NO_CONTEXT,GGML_OPT_INVALID_WOLFE,GGML_OPT_FAIL,GGML_LINESEARCH_FAIL,GGML_LINESEARCH_MINIMUM_STEP,GGML_LINESEARCH_MAXIMUM_STEP,GGML_LINESEARCH_MAXIMUM_ITERATIONS,GGML_LINESEARCH_INVALID_PARAMETERS',b'\x00\x00\x00\xC8\x00\x00\x00\x16ggml_opt_type\x00GGML_OPT_ADAM,GGML_OPT_LBFGS',b'\x00\x00\x04\x17\x00\x00\x00\x16ggml_task_type\x00GGML_TASK_INIT,GGML_TASK_COMPUTE,GGML_TASK_FINALIZE',b'\x00\x00\x00\x01\x00\x00\x00\x16ggml_type\x00GGML_TYPE_F32,GGML_TYPE_F16,GGML_TYPE_Q4_0,GGML_TYPE_Q4_1,GGML_TYPE_Q5_0,GGML_TYPE_Q5_1,GGML_TYPE_Q8_0,GGML_TYPE_Q8_1,GGML_TYPE_Q2_K,GGML_TYPE_Q3_K,GGML_TYPE_Q4_K,GGML_TYPE_Q5_K,GGML_TYPE_Q6_K,GGML_TYPE_Q8_K,GGML_TYPE_I8,GGML_TYPE_I16,GGML_TYPE_I32,GGML_TYPE_COUNT',b'\x00\x00\x01\x14\x00\x00\x00\x16ggml_unary_op\x00GGML_UNARY_OP_ABS,GGML_UNARY_OP_SGN,GGML_UNARY_OP_NEG,GGML_UNARY_OP_STEP,GGML_UNARY_OP_TANH,GGML_UNARY_OP_ELU,GGML_UNARY_OP_RELU,GGML_UNARY_OP_GELU,GGML_UNARY_OP_GELU_QUICK,GGML_UNARY_OP_SILU',b'\x00\x00\x00\x34\x00\x00\x00\x16gguf_type\x00GGUF_TYPE_UINT8,GGUF_TYPE_INT8,GGUF_TYPE_UINT16,GGUF_TYPE_INT16,GGUF_TYPE_UINT32,GGUF_TYPE_INT32,GGUF_TYPE_FLOAT32,GGUF_TYPE_BOOL,GGUF_TYPE_STRING,GGUF_TYPE_ARRAY,GGUF_TYPE_COUNT'), + _typenames = 
(b'\x00\x00\x00\xDB__darwin_blkcnt_t',b'\x00\x00\x00\x22__darwin_blksize_t',b'\x00\x00\x00\x11__darwin_clock_t',b'\x00\x00\x00\x22__darwin_ct_rune_t',b'\x00\x00\x00\x22__darwin_dev_t',b'\x00\x00\x03\xBF__darwin_fsblkcnt_t',b'\x00\x00\x03\xBF__darwin_fsfilcnt_t',b'\x00\x00\x03\xBF__darwin_gid_t',b'\x00\x00\x03\xBF__darwin_id_t',b'\x00\x00\x04\x4A__darwin_ino64_t',b'\x00\x00\x04\x4A__darwin_ino_t',b'\x00\x00\x04\x20__darwin_intptr_t',b'\x00\x00\x03\xBF__darwin_mach_port_name_t',b'\x00\x00\x03\xBF__darwin_mach_port_t',b'\x00\x00\x03\xF7__darwin_mbstate_t',b'\x00\x00\x00\x6C__darwin_mode_t',b'\x00\x00\x03\xBF__darwin_natural_t',b'\x00\x00\x00\xDB__darwin_off_t',b'\x00\x00\x00\x22__darwin_pid_t',b'\x00\x00\x03\xEF__darwin_pthread_attr_t',b'\x00\x00\x03\xF0__darwin_pthread_cond_t',b'\x00\x00\x03\xF1__darwin_pthread_condattr_t',b'\x00\x00\x00\x11__darwin_pthread_key_t',b'\x00\x00\x03\xF2__darwin_pthread_mutex_t',b'\x00\x00\x03\xF3__darwin_pthread_mutexattr_t',b'\x00\x00\x03\xF4__darwin_pthread_once_t',b'\x00\x00\x03\xF5__darwin_pthread_rwlock_t',b'\x00\x00\x03\xF6__darwin_pthread_rwlockattr_t',b'\x00\x00\x04\x2D__darwin_pthread_t',b'\x00\x00\x04\x20__darwin_ptrdiff_t',b'\x00\x00\x00\x22__darwin_rune_t',b'\x00\x00\x03\xBF__darwin_sigset_t',b'\x00\x00\x00\x11__darwin_size_t',b'\x00\x00\x03\xBF__darwin_socklen_t',b'\x00\x00\x04\x20__darwin_ssize_t',b'\x00\x00\x00\x22__darwin_suseconds_t',b'\x00\x00\x04\x20__darwin_time_t',b'\x00\x00\x03\xBF__darwin_uid_t',b'\x00\x00\x03\xBF__darwin_useconds_t',b'\x00\x00\x04\x05__darwin_uuid_string_t',b'\x00\x00\x04\x44__darwin_uuid_t',b'\x00\x00\x00\x22__darwin_wchar_t',b'\x00\x00\x00\x22__darwin_wint_t',b'\x00\x00\x03\xB0__int16_t',b'\x00\x00\x00\x22__int32_t',b'\x00\x00\x00\xDB__int64_t',b'\x00\x00\x03\xB5__int8_t',b'\x00\x00\x03\xF7__mbstate_t',b'\x00\x00\x00\x6C__uint16_t',b'\x00\x00\x03\xBF__uint32_t',b'\x00\x00\x04\x4A__uint64_t',b'\x00\x00\x03\xBA__uint8_t',b'\x00\x00\x03\xF8block_q2_K',b'\x00\x00\x03\xF9block_q3_K',b'\x00\x00\x03\xFAblock_q4_K',b'\x00\x00\x03\xFBblock_q5_K',b'\x00\x00\x03\xFCblock_q6_K',b'\x00\x00\x03\xFDblock_q8_K',b'\x00\x00\x01\xEAggml_binary_op_f32_t',b'\x00\x00\x02\x02ggml_custom1_op_f32_t',b'\x00\x00\x02\x07ggml_custom1_op_t',b'\x00\x00\x01\xF0ggml_custom2_op_f32_t',b'\x00\x00\x01\xF6ggml_custom2_op_t',b'\x00\x00\x01\xC5ggml_custom3_op_f32_t',b'\x00\x00\x01\xCCggml_custom3_op_t',b'\x00\x00\x00\x6Cggml_fp16_t',b'\x00\x00\x04\x4Fggml_from_float_t',b'\x00\x00\x04\x52ggml_to_float_t',b'\x00\x00\x04\x18ggml_type_traits_t',b'\x00\x00\x01\xFDggml_unary_op_f32_t',b'\x00\x00\x04\x50ggml_vec_dot_t',b'\x00\x00\x03\xB0int16_t',b'\x00\x00\x00\x22int32_t',b'\x00\x00\x00\xDBint64_t',b'\x00\x00\x03\xB5int8_t',b'\x00\x00\x03\xB0int_fast16_t',b'\x00\x00\x00\x22int_fast32_t',b'\x00\x00\x00\xDBint_fast64_t',b'\x00\x00\x03\xB5int_fast8_t',b'\x00\x00\x03\xB0int_least16_t',b'\x00\x00\x00\x22int_least32_t',b'\x00\x00\x00\xDBint_least64_t',b'\x00\x00\x03\xB5int_least8_t',b'\x00\x00\x04\x20intmax_t',b'\x00\x00\x04\x20intptr_t',b'\x00\x00\x04\x1Dmax_align_t',b'\x00\x00\x04\x20ptrdiff_t',b'\x00\x00\x00\xDBregister_t',b'\x00\x00\x00\x11rsize_t',b'\x00\x00\x00\x11size_t',b'\x00\x00\x04\x4Asyscall_arg_t',b'\x00\x00\x00\x6Cu_int16_t',b'\x00\x00\x03\xBFu_int32_t',b'\x00\x00\x04\x4Au_int64_t',b'\x00\x00\x03\xBAu_int8_t',b'\x00\x00\x00\x6Cuint16_t',b'\x00\x00\x03\xBFuint32_t',b'\x00\x00\x04\x4Auint64_t',b'\x00\x00\x03\xBAuint8_t',b'\x00\x00\x00\x6Cuint_fast16_t',b'\x00\x00\x03\xBFuint_fast32_t',b'\x00\x00\x04\x4Auint_fast64_t',b'\x00\x00\x03\xBAuint_fast8_t',b'\x00\x00\
x00\x6Cuint_least16_t',b'\x00\x00\x03\xBFuint_least32_t',b'\x00\x00\x04\x4Auint_least64_t',b'\x00\x00\x03\xBAuint_least8_t',b'\x00\x00\x00\x11uintmax_t',b'\x00\x00\x00\x11uintptr_t',b'\x00\x00\x04\x4Auser_addr_t',b'\x00\x00\x00\xDBuser_long_t',b'\x00\x00\x00\xDBuser_off_t',b'\x00\x00\x04\x4Auser_size_t',b'\x00\x00\x00\xDBuser_ssize_t',b'\x00\x00\x00\xDBuser_time_t',b'\x00\x00\x04\x4Auser_ulong_t',b'\x00\x00\x00\x22wchar_t'), +) diff --git a/seamless_communication/ggml/examples/python/ggml/ffi/__init__.pyi b/seamless_communication/ggml/examples/python/ggml/ffi/__init__.pyi new file mode 100644 index 0000000..73117a1 --- /dev/null +++ b/seamless_communication/ggml/examples/python/ggml/ffi/__init__.pyi @@ -0,0 +1,7 @@ +# Phony stubs. + +class CData: + pass + +class CType: + pass \ No newline at end of file diff --git a/seamless_communication/ggml/examples/python/ggml/utils.py b/seamless_communication/ggml/examples/python/ggml/utils.py new file mode 100644 index 0000000..7cea2bf --- /dev/null +++ b/seamless_communication/ggml/examples/python/ggml/utils.py @@ -0,0 +1,182 @@ +""" + Common helpers for working with ggml + numpy +""" +from ggml import ffi, lib +from typing import Union, Optional +import numpy as np + +def init(mem_size: int, mem_buffer: ffi.CData = ffi.NULL, no_alloc: bool = False) -> ffi.CData: + """ + Initialize a ggml context, which will be freed automatically when the pointer is garbage collected. + """ + params = ffi.new('struct ggml_init_params*') + params.mem_size = mem_size + params.mem_buffer = mem_buffer + params.no_alloc = no_alloc + return ffi.gc(lib.ggml_init(params[0]), lib.ggml_free) + +TensorLike = Union[ffi.CData, np.ndarray] + +def copy(from_tensor: TensorLike, to_tensor: TensorLike, allow_requantize: bool = True): + """ + Copy the contents of one tensor to another, doing any necessary (de/re)quantization transparently. + Works across numpy & ggml tensors, but they must have the same shape (and be contiguous). + + Parameters + ---------- + from_tensor : TensorLike + The tensor to copy from (a numpy array or possibly-quantized ggml tensor) + to_tensor : TensorLike + The tensor to copy to (a numpy array or possibly-quantized ggml tensor) + allow_requantize : bool + If False, will throw an error if requantization is required (i.e. both from_tensor + and to_tensor are quantized with different quantization types) + """ + if id(from_tensor) == id(to_tensor): + return + + __expect_same_layout("source", from_tensor, "destination", to_tensor) + __check_shape_consistent_with_type(from_tensor) + __check_shape_consistent_with_type(to_tensor) + + from_type = __get_type(from_tensor) + to_type = __get_type(to_tensor) + + if from_type == to_type: + ffi.memmove(__get_data(to_tensor), __get_data(from_tensor), __get_nbytes(from_tensor)) + else: + assert allow_requantize or not lib.ggml_is_quantized(from_type) or not lib.ggml_is_quantized(to_type), \ + f"Requantizing from {__type_name(from_type)} to {__type_name(to_type)} is disabled. Force with allow_requantize=True" + + __set_floats(to_tensor, __get_floats(from_tensor)) + +def numpy(tensor: ffi.CData, allow_copy: Union[bool, np.ndarray] = False, allow_requantize=False) -> np.ndarray: + """ + Convert a ggml tensor to a numpy array. + If the tensor isn't quantized, the returned numpy array will be a view over its data. 
+ + If it is quantized (and allow_copy is True), the copy will involve dequantization and the returned array will + be a copy of the original tensor (any changes to the numpy array won't then be reflected back to the tensor). + + Parameters + ---------- + tensor : ffi.CData + The tensor to convert to a numpy array + allow_copy : bool or np.ndarray + If False, will throw an error if the tensor is quantized (since dequantization requires extra memory). + If True, will dequantize the tensor and return a copy of the data in a new float32 numpy array. + If an np.ndarray, will copy the data into the given array (which must be the same shape as the tensor) when dequantization is needed + allow_requantize : bool + If allow_copy is a tensor with a different quantization type than the source tensor, will throw an error unless allow_requantize is True. + """ + shape = __get_shape(tensor) + + if lib.ggml_is_quantized(tensor.type): + if allow_copy == False: + raise ValueError(f"{__describe(tensor)} is quantized, conversion to numpy requires a copy (pass allow_copy=True; changes to the numpy array won't affect the original).") + elif isinstance(allow_copy, np.ndarray): + __expect_same_layout("source tensor", tensor, "dequantization output tensor", allow_copy) + destination = allow_copy + else: + destination = np.empty(shape, dtype=np.float32) + + copy(tensor, destination, allow_requantize=allow_requantize) + return destination + else: + dtype = __type_to_dtype(tensor.type) + if not dtype: + raise NotImplementedError(f'Cannot convert {__describe(tensor)} to numpy') + + assert __is_contiguous(tensor), f"Cannot convert {__describe(tensor)} to numpy (support contiguous tensors only)" + nbytes = lib.ggml_nelements(tensor) * lib.ggml_type_size(tensor.type) + array = np.frombuffer(ffi.buffer(lib.ggml_get_data(tensor), nbytes), dtype=dtype) + array.shape = shape + return array + +def __type_name(type: int) -> str: + name = lib.ggml_type_name(type) + return ffi.string(name).decode('utf-8') if name else None + +__k_quant_types = set([ + lib.GGML_TYPE_Q2_K, + lib.GGML_TYPE_Q3_K, + lib.GGML_TYPE_Q4_K, + lib.GGML_TYPE_Q5_K, + lib.GGML_TYPE_Q6_K, + lib.GGML_TYPE_Q8_K, +]) + +__type_to_dtype_dict = { + lib.GGML_TYPE_I8: np.int8, + lib.GGML_TYPE_I16: np.int16, + lib.GGML_TYPE_I32: np.int32, + lib.GGML_TYPE_F16: np.float16, + lib.GGML_TYPE_F32: np.float32, +} + +def __type_to_dtype(type: int) -> Optional[np.dtype]: return __type_to_dtype_dict.get(type) +def __dtype_to_type(dtype: np.dtype): + if dtype == np.float32: return lib.GGML_TYPE_F32 + elif dtype == np.float16: return lib.GGML_TYPE_F16 + elif dtype == np.int32: return lib.GGML_TYPE_I32 + elif dtype == np.int16: return lib.GGML_TYPE_I16 + elif dtype == np.int8: return lib.GGML_TYPE_I8 + else: raise ValueError(f"Unsupported dtype: {dtype}") + +def __describe(tensor: ffi.CType): return f'Tensor[{__type_name(__get_type(tensor))}, {__get_shape(tensor)}]' +def __get_type(tensor: TensorLike): return __dtype_to_type(tensor.dtype) if isinstance(tensor, np.ndarray) else tensor.type +def __get_shape(x: TensorLike): return x.shape if isinstance(x, np.ndarray) else tuple([x.ne[i] for i in range(x.n_dims)]) +def __get_strides(x: TensorLike): return x.strides if isinstance(x, np.ndarray) else tuple([x.nb[i] for i in range(x.n_dims)]) +def __get_data(x: TensorLike) -> ffi.CData: return ffi.from_buffer(x) if isinstance(x, np.ndarray) else lib.ggml_get_data(x) +def __get_nbytes(tensor: TensorLike): return tensor.nbytes if isinstance(tensor, np.ndarray) else lib.ggml_nbytes(tensor) 
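+# A minimal usage sketch of the helpers defined above (init / copy / numpy), assuming
+# the cffi bindings have been generated with regenerate.py and the native library is
+# importable as `ggml.lib`:
+#
+#     import numpy as np
+#     from ggml import lib
+#     from ggml.utils import init, copy, numpy as to_numpy
+#
+#     ctx = init(mem_size=16 * 1024 * 1024)                  # freed when ctx is garbage collected
+#     t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F16, 8)
+#     copy(np.arange(8, dtype=np.float32), t)                # f32 -> f16 conversion handled by copy()
+#     view = to_numpy(t)                                     # float16 view over the tensor's data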
+def __get_nelements(tensor: TensorLike): return tensor.size if isinstance(tensor, np.ndarray) else lib.ggml_nelements(tensor) +def __is_contiguous(tensor: TensorLike): return tensor.flags['C_CONTIGUOUS'] if isinstance(tensor, np.ndarray) else lib.ggml_is_contiguous(tensor) + +def __get_floats(tensor: TensorLike) -> ffi.CData: + data, type = __get_data(tensor), __get_type(tensor) + if type == lib.GGML_TYPE_F32: + return ffi.cast('float*', data) + else: + nelements = __get_nelements(tensor) + floats = ffi.new('float[]', nelements) + if type == lib.GGML_TYPE_F16: + lib.ggml_fp16_to_fp32_row(ffi.cast('uint16_t*', data), floats, nelements) + elif lib.ggml_is_quantized(type): + qtype = lib.ggml_internal_get_type_traits(type) + assert qtype.to_float, f"Type {__type_name(type)} is not supported by ggml" + qtype.to_float(data, floats, nelements) + else: + raise NotImplementedError(f'Cannot read floats from {__describe(tensor)}') + return floats + +def __set_floats(tensor: TensorLike, f32_data: ffi.CData) -> None: + data, type, nbytes = __get_data(tensor), __get_type(tensor), __get_nbytes(tensor) + if type == lib.GGML_TYPE_F32: + ffi.memmove(data, f32_data, nbytes) + else: + nelements = __get_nelements(tensor) + if type == lib.GGML_TYPE_F16: + lib.ggml_fp32_to_fp16_row(f32_data, ffi.cast('uint16_t*', data), nelements) + elif lib.ggml_is_quantized(type): + qtype = lib.ggml_internal_get_type_traits(type) + assert qtype.from_float, f"Type {__type_name(type)} is not supported by ggml" + qtype.from_float(f32_data, data, nelements) + else: + raise NotImplementedError(f'Cannot write floats to {__describe(tensor)}') + +def __expect_same_layout(name1: str, tensor1: TensorLike, name2: str, tensor2: TensorLike): + shape1, shape2 = __get_shape(tensor1), __get_shape(tensor2) + assert shape1 == shape2, f"Shape mismatch: {name1} has {shape1} but {name2} has {shape2}" + assert __is_contiguous(tensor1) and __is_contiguous(tensor2), f"Only contiguous tensors are supported (got {name1} with strides {__get_strides(tensor1)} and {name2} with strides {__get_strides(tensor2)})" + +def __check_shape_consistent_with_type(tensor: TensorLike): + type = __get_type(tensor) + if not lib.ggml_is_quantized(type): + return + shape = __get_shape(tensor) + + block_size = lib.ggml_blck_size(type) + assert not (block_size == 0 and type in __k_quant_types), f"Can't quantize, native library was not compiled with USE_K_QUANTS!" + assert block_size > 0, f"Invalid block size {block_size} for type {__type_name(type)}" + for i, d in enumerate(shape): + assert d % block_size == 0, f"Dimension {i} of {__describe(tensor)} is not divisible by {block_size}, required for quantization." diff --git a/seamless_communication/ggml/examples/python/regenerate.py b/seamless_communication/ggml/examples/python/regenerate.py new file mode 100644 index 0000000..08d84c0 --- /dev/null +++ b/seamless_communication/ggml/examples/python/regenerate.py @@ -0,0 +1,42 @@ +# Generates bindings for the ggml library. +# +# cffi requires prior C preprocessing of the headers, and it uses pycparser which chokes on a couple of things +# so we help it a bit (e.g. replace sizeof expressions with their value, remove exotic syntax found in Darwin headers). 
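+# Illustration of that constant-folding step (the declaration below is hypothetical,
+# not necessarily present in ggml's headers): a preprocessed field such as
+#
+#     char padding[sizeof(int64_t) * 4];
+#
+# still has an expression inside the brackets, so the loop further down compiles a tiny
+# program that prints (size_t)(sizeof(int64_t) * 4) and replaces the expression in the
+# header with the resulting literal (typically 32) before it is handed to cffi's cdef().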
+import os, sys, re, subprocess +import cffi +from stubs import generate_stubs + +API = os.environ.get('API', 'api.h') +CC = os.environ.get('CC') or 'gcc' +C_INCLUDE_DIR = os.environ.get('C_INCLUDE_DIR', '../../../llama.cpp') +CPPFLAGS = [ + "-I", C_INCLUDE_DIR, + '-D__fp16=uint16_t', # pycparser doesn't support __fp16 + '-D__attribute__(x)=', + '-D_Static_assert(x, m)=', +] + [x for x in os.environ.get('CPPFLAGS', '').split(' ') if x != ''] + +try: header = subprocess.run([CC, "-E", *CPPFLAGS, API], capture_output=True, text=True, check=True).stdout +except subprocess.CalledProcessError as e: print(f'{e.stderr}\n{e}', file=sys.stderr); raise + +header = '\n'.join([l for l in header.split('\n') if '__darwin_va_list' not in l]) # pycparser hates this + +# Replace constant size expressions w/ their value (compile & run a mini exe for each, because why not). +# First, extract anyting *inside* square brackets and anything that looks like a sizeof call. +for expr in set(re.findall(f'(?<=\\[)[^\\]]+(?=])|sizeof\\s*\\([^()]+\\)', header)): + if re.match(r'^(\d+|\s*)$', expr): continue # skip constants and empty bracket contents + subprocess.run([CC, "-o", "eval_size_expr", *CPPFLAGS, "-x", "c", "-"], text=True, check=True, + input=f'''#include + #include "{API}" + int main() {{ printf("%lu", (size_t)({expr})); }}''') + size = subprocess.run(["./eval_size_expr"], capture_output=True, text=True, check=True).stdout + print(f'Computed constexpr {expr} = {size}') + header = header.replace(expr, size) + +ffibuilder = cffi.FFI() +ffibuilder.cdef(header) +ffibuilder.set_source(f'ggml.cffi', None) # we're not compiling a native extension, as this quickly gets hairy +ffibuilder.compile(verbose=True) + +with open("ggml/__init__.pyi", "wt") as f: + f.write(generate_stubs(header)) \ No newline at end of file diff --git a/seamless_communication/ggml/examples/python/stubs.py b/seamless_communication/ggml/examples/python/stubs.py new file mode 100644 index 0000000..adf3d6c --- /dev/null +++ b/seamless_communication/ggml/examples/python/stubs.py @@ -0,0 +1,128 @@ +""" + This generates .pyi stubs for the cffi Python bindings generated by regenerate.py +""" +import sys, re, itertools +sys.path.extend(['.', '..']) # for pycparser + +from pycparser import c_ast, parse_file, CParser +import pycparser.plyparser +from pycparser.c_ast import PtrDecl, TypeDecl, FuncDecl, EllipsisParam, IdentifierType, Struct, Enum, Typedef +from typing import Tuple + +__c_type_to_python_type = { + 'void': 'None', '_Bool': 'bool', + 'char': 'int', 'short': 'int', 'int': 'int', 'long': 'int', + 'ptrdiff_t': 'int', 'size_t': 'int', + 'int8_t': 'int', 'uint8_t': 'int', + 'int16_t': 'int', 'uint16_t': 'int', + 'int32_t': 'int', 'uint32_t': 'int', + 'int64_t': 'int', 'uint64_t': 'int', + 'float': 'float', 'double': 'float', + 'ggml_fp16_t': 'np.float16', +} + +def format_type(t: TypeDecl): + if isinstance(t, PtrDecl) or isinstance(t, Struct): + return 'ffi.CData' + if isinstance(t, Enum): + return 'int' + if isinstance(t, TypeDecl): + return format_type(t.type) + if isinstance(t, IdentifierType): + assert len(t.names) == 1, f'Expected a single name, got {t.names}' + return __c_type_to_python_type.get(t.names[0]) or 'ffi.CData' + return t.name + +class PythonStubFuncDeclVisitor(c_ast.NodeVisitor): + def __init__(self): + self.sigs = {} + self.sources = {} + + def get_source_snippet_lines(self, coord: pycparser.plyparser.Coord) -> Tuple[list[str], list[str]]: + if coord.file not in self.sources: + with open(coord.file, 'rt') as f: + 
self.sources[coord.file] = f.readlines() + source_lines = self.sources[coord.file] + ncomment_lines = len(list(itertools.takewhile(lambda i: re.search(r'^\s*(//|/\*)', source_lines[i]), range(coord.line - 2, -1, -1)))) + comment_lines = [l.strip() for l in source_lines[coord.line - 1 - ncomment_lines:coord.line - 1]] + decl_lines = [] + for line in source_lines[coord.line - 1:]: + decl_lines.append(line.rstrip()) + if (';' in line) or ('{' in line): break + return (comment_lines, decl_lines) + + def visit_Enum(self, node: Enum): + if node.values is not None: + for e in node.values.enumerators: + self.sigs[e.name] = f' @property\n def {e.name}(self) -> int: ...' + + def visit_Typedef(self, node: Typedef): + pass + + def visit_FuncDecl(self, node: FuncDecl): + ret_type = node.type + is_ptr = False + while isinstance(ret_type, PtrDecl): + ret_type = ret_type.type + is_ptr = True + + fun_name = ret_type.declname + if fun_name.startswith('__'): + return + + args = [] + argnames = [] + def gen_name(stem): + i = 1 + while True: + new_name = stem if i == 1 else f'{stem}{i}' + if new_name not in argnames: return new_name + i += 1 + + for a in node.args.params: + if isinstance(a, EllipsisParam): + arg_name = gen_name('args') + argnames.append(arg_name) + args.append('*' + gen_name('args')) + elif format_type(a.type) == 'None': + continue + else: + arg_name = a.name or gen_name('arg') + argnames.append(arg_name) + args.append(f'{arg_name}: {format_type(a.type)}') + + ret = format_type(ret_type if not is_ptr else node.type) + + comment_lines, decl_lines = self.get_source_snippet_lines(node.coord) + + lines = [f' def {fun_name}({", ".join(args)}) -> {ret}:'] + if len(comment_lines) == 0 and len(decl_lines) == 1: + lines += [f' """{decl_lines[0]}"""'] + else: + lines += [' """'] + lines += [f' {c.lstrip("/* ")}' for c in comment_lines] + if len(comment_lines) > 0: + lines += [''] + lines += [f' {d}' for d in decl_lines] + lines += [' """'] + lines += [' ...'] + self.sigs[fun_name] = '\n'.join(lines) + +def generate_stubs(header: str): + """ + Generates a .pyi Python stub file for the GGML API using C header files. + """ + + v = PythonStubFuncDeclVisitor() + v.visit(CParser().parse(header, "")) + + keys = list(v.sigs.keys()) + keys.sort() + + return '\n'.join([ + '# auto-generated file', + 'import ggml.ffi as ffi', + 'import numpy as np', + 'class lib:', + *[v.sigs[k] for k in keys] + ]) diff --git a/seamless_communication/ggml/examples/python/test_tensor.py b/seamless_communication/ggml/examples/python/test_tensor.py new file mode 100644 index 0000000..1a365fa --- /dev/null +++ b/seamless_communication/ggml/examples/python/test_tensor.py @@ -0,0 +1,258 @@ +import pytest +from pytest import raises + +from ggml import lib, ffi +from ggml.utils import init, copy, numpy +import numpy as np +import numpy.testing as npt + +@pytest.fixture() +def ctx(): + print("setup") + yield init(mem_size=10*1024*1024) + print("teardown") + +class TestNumPy: + + # Single element + + def test_set_get_single_i32(self, ctx): + i = lib.ggml_new_i32(ctx, 42) + assert lib.ggml_get_i32_1d(i, 0) == 42 + assert numpy(i) == np.array([42], dtype=np.int32) + + def test_set_get_single_f32(self, ctx): + i = lib.ggml_new_f32(ctx, 4.2) + + epsilon = 0.000001 # Not sure why so large a difference?? 
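+        # Likely explanation: 4.2 has no exact binary representation; the nearest float32
+        # is ~4.1999998, so a difference on the order of 2e-7 against the Python double 4.2
+        # is expected once the value round-trips through the tensor's f32 storage.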
+ pytest.approx(lib.ggml_get_f32_1d(i, 0), 4.2, epsilon) + pytest.approx(numpy(i), np.array([4.2], dtype=np.float32), epsilon) + + def _test_copy_np_to_ggml(self, a: np.ndarray, t: ffi.CData): + a2 = a.copy() # Clone original + copy(a, t) + npt.assert_array_equal(numpy(t), a2) + + # I32 + + def test_copy_np_to_ggml_1d_i32(self, ctx): + t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_I32, 10) + a = np.arange(10, dtype=np.int32) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_2d_i32(self, ctx): + t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_I32, 2, 3) + a = np.arange(2 * 3, dtype=np.int32).reshape((2, 3)) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_3d_i32(self, ctx): + t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_I32, 2, 3, 4) + a = np.arange(2 * 3 * 4, dtype=np.int32).reshape((2, 3, 4)) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_4d_i32(self, ctx): + t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_I32, 2, 3, 4, 5) + a = np.arange(2 * 3 * 4 * 5, dtype=np.int32).reshape((2, 3, 4, 5)) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_4d_n_i32(self, ctx): + dims = [2, 3, 4, 5] # GGML_MAX_DIMS is 4, going beyond would crash + pdims = ffi.new('int64_t[]', len(dims)) + for i, d in enumerate(dims): pdims[i] = d + t = lib.ggml_new_tensor(ctx, lib.GGML_TYPE_I32, len(dims), pdims) + a = np.arange(np.prod(dims), dtype=np.int32).reshape(tuple(pdims)) + self._test_copy_np_to_ggml(a, t) + + # F32 + + def test_copy_np_to_ggml_1d_f32(self, ctx): + t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10) + a = np.arange(10, dtype=np.float32) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_2d_f32(self, ctx): + t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, 2, 3) + a = np.arange(2 * 3, dtype=np.float32).reshape((2, 3)) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_3d_f32(self, ctx): + t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_F32, 2, 3, 4) + a = np.arange(2 * 3 * 4, dtype=np.float32).reshape((2, 3, 4)) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_4d_f32(self, ctx): + t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_F32, 2, 3, 4, 5) + a = np.arange(2 * 3 * 4 * 5, dtype=np.float32).reshape((2, 3, 4, 5)) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_4d_n_f32(self, ctx): + dims = [2, 3, 4, 5] # GGML_MAX_DIMS is 4, going beyond would crash + pdims = ffi.new('int64_t[]', len(dims)) + for i, d in enumerate(dims): pdims[i] = d + t = lib.ggml_new_tensor(ctx, lib.GGML_TYPE_F32, len(dims), pdims) + a = np.arange(np.prod(dims), dtype=np.float32).reshape(tuple(pdims)) + self._test_copy_np_to_ggml(a, t) + + # F16 + + def test_copy_np_to_ggml_1d_f16(self, ctx): + t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F16, 10) + a = np.arange(10, dtype=np.float16) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_2d_f16(self, ctx): + t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F16, 2, 3) + a = np.arange(2 * 3, dtype=np.float16).reshape((2, 3)) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_3d_f16(self, ctx): + t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_F16, 2, 3, 4) + a = np.arange(2 * 3 * 4, dtype=np.float16).reshape((2, 3, 4)) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_4d_f16(self, ctx): + t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_F16, 2, 3, 4, 5) + a = np.arange(2 * 3 * 4 * 5, dtype=np.float16).reshape((2, 3, 4, 5)) + self._test_copy_np_to_ggml(a, t) + + def test_copy_np_to_ggml_4d_n_f16(self, ctx): + dims = [2, 3, 4, 5] # 
GGML_MAX_DIMS is 4, going beyond would crash + pdims = ffi.new('int64_t[]', len(dims)) + for i, d in enumerate(dims): pdims[i] = d + t = lib.ggml_new_tensor(ctx, lib.GGML_TYPE_F16, len(dims), pdims) + a = np.arange(np.prod(dims), dtype=np.float16).reshape(tuple(pdims)) + self._test_copy_np_to_ggml(a, t) + + # Mismatching shapes + + def test_copy_mismatching_shapes_1d(self, ctx): + t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10) + a = np.arange(10, dtype=np.float32) + copy(a, t) # OK + + a = a.reshape((5, 2)) + with raises(AssertionError): copy(a, t) + with raises(AssertionError): copy(t, a) + + def test_copy_mismatching_shapes_2d(self, ctx): + t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, 2, 3) + a = np.arange(6, dtype=np.float32) + copy(a.reshape((2, 3)), t) # OK + + a = a.reshape((3, 2)) + with raises(AssertionError): copy(a, t) + with raises(AssertionError): copy(t, a) + + def test_copy_mismatching_shapes_3d(self, ctx): + t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_F32, 2, 3, 4) + a = np.arange(24, dtype=np.float32) + copy(a.reshape((2, 3, 4)), t) # OK + + a = a.reshape((2, 4, 3)) + with raises(AssertionError): copy(a, t) + with raises(AssertionError): copy(t, a) + + def test_copy_mismatching_shapes_4d(self, ctx): + t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_F32, 2, 3, 4, 5) + a = np.arange(24*5, dtype=np.float32) + copy(a.reshape((2, 3, 4, 5)), t) # OK + + a = a.reshape((2, 3, 5, 4)) + with raises(AssertionError): copy(a, t) + with raises(AssertionError): copy(t, a) + + def test_copy_f16_to_f32(self, ctx): + t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 1) + a = np.array([123.45], dtype=np.float16) + copy(a, t) + np.testing.assert_allclose(lib.ggml_get_f32_1d(t, 0), 123.45, rtol=1e-3) + + def test_copy_f32_to_f16(self, ctx): + t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F16, 1) + a = np.array([123.45], dtype=np.float32) + copy(a, t) + np.testing.assert_allclose(lib.ggml_get_f32_1d(t, 0), 123.45, rtol=1e-3) + + def test_copy_f16_to_Q5_K(self, ctx): + n = 256 + t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n) + a = np.arange(n, dtype=np.float16) + copy(a, t) + np.testing.assert_allclose(a, numpy(t, allow_copy=True), rtol=0.05) + + def test_copy_Q5_K_to_f16(self, ctx): + n = 256 + t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n) + copy(np.arange(n, dtype=np.float32), t) + a = np.arange(n, dtype=np.float16) + copy(t, a) + np.testing.assert_allclose(a, numpy(t, allow_copy=True), rtol=0.05) + + def test_copy_i16_f32_mismatching_types(self, ctx): + t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 1) + a = np.arange(1, dtype=np.int16) + with raises(NotImplementedError): copy(a, t) + with raises(NotImplementedError): copy(t, a) + +class TestTensorCopy: + + def test_copy_self(self, ctx): + t = lib.ggml_new_i32(ctx, 42) + copy(t, t) + assert lib.ggml_get_i32_1d(t, 0) == 42 + + def test_copy_1d(self, ctx): + t1 = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10) + t2 = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10) + a = np.arange(10, dtype=np.float32) + copy(a, t1) + copy(t1, t2) + assert np.allclose(a, numpy(t2)) + assert np.allclose(numpy(t1), numpy(t2)) + +class TestGraph: + + def test_add(self, ctx): + n = 256 + ta = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n) + tb = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n) + tsum = lib.ggml_add(ctx, ta, tb) + assert tsum.type == lib.GGML_TYPE_F32 + + gf = ffi.new('struct ggml_cgraph*') + lib.ggml_build_forward_expand(gf, tsum) + + a = np.arange(0, n, dtype=np.float32) + b = np.arange(n, 0, -1, 
dtype=np.float32) + copy(a, ta) + copy(b, tb) + + lib.ggml_graph_compute_with_ctx(ctx, gf, 1) + + assert np.allclose(numpy(tsum, allow_copy=True), a + b) + +class TestQuantization: + + def test_quantized_add(self, ctx): + n = 256 + ta = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n) + tb = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n) + tsum = lib.ggml_add(ctx, ta, tb) + assert tsum.type == lib.GGML_TYPE_Q5_K + + gf = ffi.new('struct ggml_cgraph*') + lib.ggml_build_forward_expand(gf, tsum) + + a = np.arange(0, n, dtype=np.float32) + b = np.arange(n, 0, -1, dtype=np.float32) + copy(a, ta) + copy(b, tb) + + lib.ggml_graph_compute_with_ctx(ctx, gf, 1) + + unquantized_sum = a + b + sum = numpy(tsum, allow_copy=True) + + diff = np.linalg.norm(unquantized_sum - sum, np.inf) + assert diff > 4 + assert diff < 5 diff --git a/seamless_communication/ggml/examples/unity/CMakeLists.txt b/seamless_communication/ggml/examples/unity/CMakeLists.txt new file mode 100644 index 0000000..e5cdfe5 --- /dev/null +++ b/seamless_communication/ggml/examples/unity/CMakeLists.txt @@ -0,0 +1,18 @@ +# unity +add_library(fairseq2_cpp) +target_include_directories(fairseq2_cpp PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../..) +target_link_libraries(fairseq2_cpp PRIVATE ggml kaldi-native-fbank) +target_sources(fairseq2_cpp + PRIVATE + fairseq2.cpp + model_loader.cpp +) +add_executable(unity unity.cpp) +find_package(PkgConfig REQUIRED) +pkg_check_modules(SNDFILE REQUIRED IMPORTED_TARGET sndfile) +target_link_libraries(unity PRIVATE ggml PkgConfig::SNDFILE) +target_sources(unity + PRIVATE + fairseq2.cpp + model_loader.cpp +) diff --git a/seamless_communication/ggml/examples/unity/fairseq2.cpp b/seamless_communication/ggml/examples/unity/fairseq2.cpp new file mode 100644 index 0000000..b40b8e2 --- /dev/null +++ b/seamless_communication/ggml/examples/unity/fairseq2.cpp @@ -0,0 +1,1756 @@ +#include +#include +#include +#include +#include +#include + +#include "kaldi-native-fbank/csrc/feature-fbank.h" +#include "kaldi-native-fbank/csrc/feature-window.h" +#include "fairseq2.h" +#include "ggml.h" +#include "ggml-alloc.h" + +ggml_tensor* ggml_detach(ggml_tensor* a) { + a->op = GGML_OP_NONE; + std::fill(a->src, a->src + GGML_MAX_SRC, nullptr); + return a; +} + +// generate_sequence uses ggml_context and ggml_allocr to reuse memory buffers across steps. +// This can lead to dangling pointers, which don't segfault, but instead read garbage data. +// Enabling this flag allows to explictly reset memory buffers, making it more explicit +// when we read garbage data. 
+// It also prints memory usage information, which is useful to +#define DEBUG_MEM_USAGE DEBUG + +void printf_mem_usage(ggml_context* ctx, std::string name) { +#if DEBUG_MEM_USAGE + double mb = 1024.0 * 1024.0; + printf( + "ctx %s: memory used = %8.2f MB, memory reserved = %8.2f Mb\n", + name.c_str(), + ggml_used_mem(ctx) / mb, + ggml_get_mem_size(ctx) / mb + ); +#endif +} + +#define SWAP(x, y) \ + auto tmp_ ## x = x; x = y; y = tmp_ ## x; + + +#define GGML_ASSERT_SHAPE(x, ne0, ne1, ne2, ne3) \ + GGML_ASSERT((ne0 == -1 || x->ne[0] == ne0) && (ne1 == -1 || x->ne[1] == ne1) && (ne2 == -1 || x->ne[2] == ne2) && (ne3 == -1 || x->ne[3] == ne3)); + +/// allocate the fairseq2 model and hyperparameters +extern "C" fairseq2_model* fairseq2_model_alloc() { + // pre-allocate some memory to write hyperparameters and tensors pointers + auto* model = new fairseq2_model; + model->tensors_ctx = nullptr; + return model; +} + +extern "C" void fairseq2_kv_cache_alloc(fairseq2_model& model, ggml_context* kv_cache_ctx, int beam_size, int max_seq_len) { + // Note: we only allocate the masks, proper kv cache allocation is delayed. + GGML_ASSERT(kv_cache_ctx); + GGML_ASSERT(!ggml_get_no_alloc(kv_cache_ctx)); // We need to be able to alloc the kv_cache buffers + model.kv_cache_ctx = kv_cache_ctx; + auto attn_glob = "text_decoder.*_attn.k_proj.weight"; + FORCE_ALLOC(self_attn_mask, kv_cache_ctx, ggml_new_tensor_2d(kv_cache_ctx, GGML_TYPE_F32, max_seq_len, max_seq_len)); + self_attn_mask = ggml_diag_mask_inf_inplace(kv_cache_ctx, self_attn_mask, 0); + ggml_format_name(self_attn_mask, "self_attn_mask[%d]", max_seq_len); + + for (auto named_tensor : model.tensors) { + const std::string& name = named_tensor.first; + if (::fnmatch(attn_glob, name.c_str(), 0) == FNM_NOMATCH) + continue; + // create a cache entry without the ".k_proj.weight" suffix + const std::string& shortname = name.substr(0, name.size() - 14); + KeyValueTensor& kv = model.kv_cache[shortname]; + kv.step_nr = 0; + + kv.full_k = nullptr; + kv.full_v = nullptr; + kv.self_attn_mask = self_attn_mask; + } +} + +extern "C" void fairseq2_kv_cache_reset(const fairseq2_model& model) { + // TODO: use a dedicated allocator, so that kv_cache.clear actually frees the memory + model.kv_cache.clear(); +} + + +bool has_kv_cache(const fairseq2_model& model) { + return model.kv_cache.size() > 0; +} + + +inline ggml_tensor* ggml_squeeze(ggml_context* ctx, ggml_tensor* x, int dim) { + int n_dims = x->n_dims; + GGML_ASSERT(dim >= 0); + GGML_ASSERT(dim < n_dims); + GGML_ASSERT(x->ne[dim] == 1); + return ggml_flatten_1d(ctx, x, dim); +} + +inline ggml_tensor* ggml_unsqueeze(ggml_context* ctx, ggml_tensor* x, int dim) { + return ggml_unflatten_1d(ctx, x, dim, 1); +} + + +// copy k and v to kv cache +// kv.full_k[step_nr] = k; +// kv.full_v[step_nr] = v; +void append_to_prev_kv(const fairseq2_model& model, const std::string& prefix, ggml_tensor** k, ggml_tensor** v, ggml_tensor** self_attn_mask) { + KeyValueTensor& kv = model.kv_cache[prefix]; + int step_nr = kv.step_nr; + ggml_context* ctx = model.kv_cache_ctx ? 
model.kv_cache_ctx : model.ctx; + int n_steps = (*k)->ne[1]; + int k_proj, batch_size; + + if (kv.full_k != nullptr) { + // (N, S_kv, K_proj) + k_proj = kv.full_k->ne[0]; + batch_size = kv.full_k->ne[2]; + ggml_detach(kv.full_k); + ggml_detach(kv.full_v); + kv.full_k = ggml_squeeze(ctx, ggml_concat(ctx, ggml_unsqueeze(ctx, kv.full_k, 1), ggml_unsqueeze(ctx, *k, 1)), 1); + kv.full_v = ggml_squeeze(ctx, ggml_concat(ctx, ggml_unsqueeze(ctx, kv.full_v, 1), ggml_unsqueeze(ctx, *v, 1)), 1); + } else { + GGML_ASSERT(step_nr == 0); + k_proj = (*k)->ne[0]; + batch_size = (*v)->ne[2]; + kv.full_k = ggml_dup(ctx, *k); + kv.full_v = ggml_dup(ctx, *v); + } + *k = kv.full_k; + *v = kv.full_v; + ggml_format_name(kv.full_k, "%s.k (step=%d)", prefix.c_str(), step_nr); + ggml_format_name(kv.full_v, "%s.v (step=%d)", prefix.c_str(), step_nr); + step_nr += n_steps; + + GGML_ASSERT_SHAPE(kv.full_k, k_proj, step_nr, batch_size, 1); + + // qk is (B * H, Sq, Sk) == (B*H, 1, Sk) in incremental mode + // we return the Sq slice of the (Sq, Sk) attention mask + *self_attn_mask = ggml_slice( + model.ctx, + ggml_slice(model.ctx, kv.self_attn_mask, 0, 0, step_nr), + 1, + step_nr - 1, + step_nr + ); + + kv.step_nr = step_nr; +} + +// variant of ggml_get_rows that allows for a with more than 2 dims. +ggml_tensor* ggml_get_rows2(ggml_context* ctx, ggml_tensor* a, ggml_tensor* b) { + int flattened = 0; + GGML_ASSERT(a->n_dims <= 3); + if (a->n_dims == 3) { + flattened = a->ne[0]; + a = ggml_flatten_1d(ctx, a, 0); + } + a = ggml_get_rows(ctx, a, b); + if (flattened) { + a = ggml_unflatten_1d(ctx, a, 0, flattened); + } + return a; +} + + +void _reorder_kv_cache(ggml_context* ctx, ggml_cgraph* gf, KeyValueTensor& kv, ggml_tensor* new_order) { + // GGML_ASSERT(ctx == kv.full_k->con); + if (kv.full_k != nullptr) { + ggml_detach(kv.full_k); + const char* name = kv.full_k->name; + kv.full_k = ggml_get_rows2(ctx, kv.full_k, new_order); + ggml_build_forward_expand(gf, kv.full_k); + ggml_format_name(kv.full_k, "%s (sorted)", name); + } + + if (kv.full_v != nullptr) { + ggml_detach(kv.full_v); + const char* name = kv.full_v->name; + kv.full_v = ggml_get_rows2(ctx, kv.full_v, new_order); + ggml_build_forward_expand(gf, kv.full_v); + ggml_format_name(kv.full_v, "%s (sorted)", name); + } +} + + +void reorder_kv_cache(const fairseq2_model& model, ggml_context* ctx, ggml_cgraph* gf, ggml_tensor* new_order) { + auto self_attn_glob = "*.self_attn"; + for (auto& named_kv : model.kv_cache) { + if (::fnmatch(self_attn_glob, named_kv.first.c_str(), 0) == FNM_NOMATCH) + continue; + + _reorder_kv_cache(ctx, gf, named_kv.second, new_order); + } +} + + +inline double model_layer_config_d(const fairseq2_model& model, std::string name) { + const std::int64_t* data = &model.layer_config.at(name); + double val = *(const double*)data; + return val; +} + +extern "C" double fairseq2_model_layer_config_double(const fairseq2_model& model, const char* name) { + return model_layer_config_d(model, std::string(name)); +} + +extern "C" std::int64_t fairseq2_model_layer_config_int(const fairseq2_model& model, const char* name) { + return model.layer_config.at(std::string(name)); +} + + +extern "C" void fairseq2_model_free(fairseq2_model* model) { + if (model->tensors_ctx) ggml_free(model->tensors_ctx); + delete model; +} + +extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx) { + model->ctx = ctx; +} + +extern "C" std::string* std_string_alloc(char* c_str) { + return new std::string(c_str); +} + +extern "C" void 
std_string_free(std::string* str) { + delete str; +} + +bool has_layer(fairseq2_model& model, const std::string& name) { + return model.tensors.find(name) != model.tensors.end(); +} + +ggml_tensor* mul_mat(ggml_context* ctx, ggml_tensor* a, ggml_tensor* b) { + if (b->ne[1] == 1 && b->ne[2] > 1 && a->n_dims == 2) { + // `b` has shape (B, 1, D). + // if `a` is (D_out, D), then we do one matmul for the full batch. + b = ggml_flatten_1d(ctx, b, 1); + return ggml_unflatten_1d(ctx, ggml_mul_mat(ctx, a, b), 1, 1); + } + // there is also the k * q matmul -> (D, 1, B) * (D, 1, B) -> (1, 1, B) + // not sure what's the best way to compute this with BLAS + + return ggml_mul_mat(ctx, a, b); // (d_out) +} + + +extern "C" ggml_tensor* Linear_forward( + fairseq2_model& model, + const std::string &prefix, + ggml_tensor* input // (d_in) +) { + // Note: for now we assumed un-batched input + ggml_tensor* weight = model.tensors[prefix + ".weight"]; // (d_in, d_out) + GGML_ASSERT(weight != nullptr); + ggml_tensor* out = mul_mat(model.ctx, weight, input); // (d_out) + ggml_tensor* bias = model.tensors[prefix + ".bias"]; // (d_out) + if (bias == nullptr) return out; + + return ggml_add(model.ctx, out, bias); +} + +extern "C" ggml_tensor* LayerNorm_forward( + fairseq2_model& model, + const std::string &prefix, + ggml_tensor* input +) { + ggml_tensor* weight = model.tensors[prefix + ".weight"]; + GGML_ASSERT(weight != nullptr); + ggml_tensor* bias = model.tensors[prefix + ".bias"]; + GGML_ASSERT(bias != nullptr); + + auto ctx = model.ctx; + double eps = model_layer_config_d(model, prefix + ".eps"); + + input = ggml_norm(ctx, input, /*eps*/eps); + return ggml_add_inplace( + ctx, + ggml_mul_inplace(ctx, ggml_repeat(ctx, weight, input), input), + ggml_repeat(ctx, bias, input) + ); +} + + +extern "C" ggml_tensor* StandardFeedForwardNetwork_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs +) { + seqs = Linear_forward(model, prefix + ".inner_proj", seqs); + // inner_activation = ReLu // TODO: allow other activation + seqs = ggml_relu_inplace(model.ctx, seqs); + + if (has_layer(model, prefix + ".inner_layer_norm")) { + seqs = LayerNorm_forward(model, prefix + ".inner_layer_norm", seqs); + } + + seqs = Linear_forward(model, prefix + ".output_proj", seqs); + return seqs; +} + +extern "C" ggml_tensor* SiluFeedForwardNetwork_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs +) { + seqs = Linear_forward(model, prefix + ".inner_proj", seqs); + seqs = ggml_silu(model.ctx, seqs); + + if (has_layer(model, prefix + ".inner_layer_norm")) { + seqs = LayerNorm_forward(model, prefix + ".inner_layer_norm", seqs); + } + + seqs = Linear_forward(model, prefix + ".output_proj", seqs); + return seqs; +} + +ggml_tensor* ggml_flatten_1d(ggml_context* ctx, ggml_tensor* x, int dim) { + int n_dims = x->n_dims; + GGML_ASSERT(dim >= 0); + GGML_ASSERT(dim < n_dims); + GGML_ASSERT(ggml_is_contiguous(x)); + // Nothing to do + if (dim == n_dims - 1) return x; + + if (n_dims == 2) { + return ggml_reshape_1d(ctx, x, x->ne[0] * x->ne[1]); + } else if (n_dims == 3) { + if (dim == 0) { + return ggml_reshape_2d(ctx, x, x->ne[0] * x->ne[1], x->ne[2]); + } else { // dim == 1 + return ggml_reshape_2d(ctx, x, x->ne[0], x->ne[1] * x->ne[2]); + } + } else { // n_dims == 4 + if (dim == 0) { + return ggml_reshape_3d(ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]); + } else if (dim == 1) { + return ggml_reshape_3d(ctx, x, x->ne[0], x->ne[1] * x->ne[2], x->ne[3]); + } else { // dim == 2 + return 
ggml_reshape_3d(ctx, x, x->ne[0], x->ne[1], x->ne[2] * x->ne[3]); + } + } +} + +ggml_tensor* ggml_unflatten_1d(ggml_context* ctx, ggml_tensor* x, int dim, int num_el) { + int n_dims = x->n_dims; + GGML_ASSERT(dim >= 0); + GGML_ASSERT(dim < n_dims); + GGML_ASSERT(n_dims < 4); + GGML_ASSERT(x->ne[dim] % num_el == 0); + GGML_ASSERT(x->nb[dim + 1] == x->nb[dim] * x->ne[dim]); // `x` isn't contiguous along `dim` + if (n_dims == 1) { + return ggml_view_2d(ctx, x, num_el, x->ne[0] / num_el, x->nb[0] * num_el, 0); + } else if (n_dims == 2) { + if (dim == 0) { + return ggml_view_3d(ctx, x, num_el, x->ne[0] / num_el, x->ne[1], x->nb[0] * num_el, x->nb[1], 0); + } else { // dim == 1 + return ggml_view_3d(ctx, x, x->ne[0], num_el, x->ne[1] / num_el, x->nb[1], num_el * x->nb[1], 0); + } + } else { // (n_dims == 3) + if (dim == 0) { + return ggml_view_4d(ctx, x, num_el, x->ne[0] / num_el, x->ne[1], x->ne[2], x->nb[0] * num_el, x->nb[1], x->nb[2], 0); + } else if (dim == 1) { + return ggml_view_4d(ctx, x, x->ne[0], num_el, x->ne[1] / num_el, x->ne[2], x->nb[1], num_el * x->nb[1], x->nb[2], 0); + } else { // dim == 2 + return ggml_view_4d(ctx, x, x->ne[0], x->ne[1], num_el, x->ne[2] / num_el, x->nb[1], x->nb[2], num_el * x->nb[2], 0); + } + } +} + + +ggml_tensor* _reshape_num_head(ggml_context* ctx, ggml_tensor* x, int head_dim) { + // (B, S, dim) -> (B, S, H, H_dim) + x = ggml_unflatten_1d(ctx, x, 0, head_dim); + x = ggml_permute(ctx, x, 0, 2, 1, 3); // (B, H, S, H_dim) + x = ggml_cont(ctx, x); + x = ggml_flatten_1d(ctx, x, 2); // (B * H, S, H_dim) + return x; +} + +/// (B, Sk, dim) -> // (B?, H, H_dim, Sk) +ggml_tensor* _reshape_num_head_values(ggml_context* ctx, ggml_tensor* v, int head_dim ) { + // (B, Sk, dim) -> (B, Sk, H, H_dim) + v = ggml_unflatten_1d(ctx, v, 0, head_dim); + v = ggml_permute(ctx, v, 1, 2, 0, 3); // (B?, H, H_dim, Sk) + v = ggml_cont(ctx, v); + v = ggml_flatten_1d(ctx, v, 2); // (B * H, S, H_dim) + return v; +} + + +// flash_attn doesn't work for cross attention because it assumes Q <= K +// and it seems to yield slightly different scores than expected, and thus a different beam search +# define UNITY_FLASH_ATTN 0 + +extern "C" ggml_tensor* MultiheadAttention_forward( + fairseq2_model& model, + const std::string &prefix, + ggml_tensor* queries, // (slen, d_in) + ggml_tensor* keys, // (klen, d_in) + ggml_tensor* values, // (klen, d_out) + ggml_tensor* attn_mask // (klen, slen) +) { + int model_dim = queries->ne[0]; + int num_heads = model.layer_config.at(prefix + ".num_heads"); + int head_dim = model_dim / num_heads; + GGML_ASSERT(model_dim % num_heads == 0); + + ggml_context* ctx = model.ctx; + ggml_tensor* q = Linear_forward(model, prefix + ".q_proj", queries); // (B, S, H * H_dim) + q = _reshape_num_head(ctx, q, head_dim); // (B * H, S, H_dim) + ggml_set_name(q, "q"); + + ggml_tensor *k, *v; + if (!has_kv_cache(model)) { + k = Linear_forward(model, prefix + ".k_proj", keys); + ggml_set_name(k, "k"); + v = Linear_forward(model, prefix + ".v_proj", values); + ggml_set_name(v, "v"); + } else { + bool encoder_decoder_attn = keys == values && keys != queries; + if (encoder_decoder_attn) { + // The K and V tensors of an encoder-decoder attention (i.e. the + // projected encoder outputs) remain static during evaluation. + + KeyValueTensor& kv_cache = model.kv_cache[prefix]; + if (kv_cache.step_nr == 0) { + // If possible we use the ctx dedicated to kv_cache here, + // because the enc dec attention is typically long lived. 
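+                // (First step only: compute k/v below and stash detached copies in the
+                // kv cache; later steps take the else branch and reuse kv_cache.full_k /
+                // kv_cache.full_v without re-running the projections.)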
+ if (model.kv_cache_ctx) model.ctx = model.kv_cache_ctx; + k = Linear_forward(model, prefix + ".k_proj", keys); + ggml_set_name(k, "k"); + v = Linear_forward(model, prefix + ".v_proj", values); + ggml_set_name(v, "v"); + // Note we are only storing a pointer to the buffer, not the full graph + kv_cache.full_k = ggml_detach(ggml_dup_inplace(model.ctx, k)); + ggml_format_name(kv_cache.full_k, "%s.k_cache", prefix.c_str()); + kv_cache.full_v = ggml_detach(ggml_dup_inplace(model.ctx, v)); + ggml_format_name(kv_cache.full_v, "%s.v_cache", prefix.c_str()); + kv_cache.step_nr = keys->ne[1]; + model.ctx = ctx; + } else { + k = kv_cache.full_k; + v = kv_cache.full_v; + GGML_ASSERT(keys->ne[1] == k->ne[1]); // cache content doesn't match the input sequence + GGML_ASSERT(values->ne[1] == v->ne[1]); // cache content doesn't match the input sequence + } + } else { // self attention + // (1, K) -> (N, 1, K_proj) + k = Linear_forward(model, prefix + ".k_proj", keys); + ggml_set_name(k, "k"); + // (1, V) -> (N, 1, V_proj) + v = Linear_forward(model, prefix + ".v_proj", values); + ggml_set_name(v, "v"); + + append_to_prev_kv(model, prefix, &k, &v, &attn_mask); + } + } + k = _reshape_num_head(ctx, k, head_dim); // (B * H, Sk, H_dim) + v = _reshape_num_head_values(ctx, v, head_dim); // (B * H, H_dim, Sk) + v = ggml_cont(ctx, v); + +#if UNITY_FLASH_ATTN + // For flash_attn, we assume either no masks, or triangular masks. + ggml_tensor* attn = ggml_flash_attn(ctx, q, k, v, /*masked*/attn_mask != nullptr); // (B * H, S, H_dim) + ggml_set_name(attn, "attn"); + attn = ggml_unflatten_1d(ctx, attn, 2, num_heads); // (B, H, H_dim, S) + attn = ggml_permute(ctx, attn, 0, 2, 1, 3); // (B, S, H, H_dim) +#else + // (B * H, Sk, H_dim) x (B * H, S, H_dim) -> (B * H, S, Sk) + ggml_tensor* qk = mul_mat(ctx, k, q); + ggml_set_name(qk, "qk"); + FORCE_ALLOC(qk_scale, ctx, ggml_new_tensor_1d(ctx, qk->type, 1)); + ggml_set_f32(qk_scale, 1.0f/sqrtf(float(head_dim))); + qk = ggml_scale(ctx, qk, qk_scale); + ggml_set_name(qk, "qk_scaled"); + + if (attn_mask) qk = ggml_add_inplace(ctx, qk, attn_mask); + // TODO: upgrade qk to float32 if needed + ggml_tensor* attn_weights = ggml_soft_max(ctx, qk); // (B * H, S, Sk) + ggml_set_name(attn_weights, "attn_weights"); + + // (B * H, S, Sk) x (B * H, H_dim, Sk) -> (B * H, H_dim, S) + ggml_tensor* attn = mul_mat(ctx, attn_weights, v); + ggml_set_name(attn, "attn"); + attn = ggml_unflatten_1d(ctx, attn, 2, num_heads); // (B, H, H_dim, S) + attn = ggml_permute(ctx, attn, 2, 0, 1, 3); // (B, S, H, H_dim) +#endif // UNITY_FLASH_ATTN + attn = ggml_cont(ctx, attn); + attn = ggml_flatten_1d(ctx, attn, 0); // (B, S, H * H_dim) + // out -> (B, S, d_out) + ggml_tensor* out = Linear_forward(model, prefix + ".output_proj", attn); + ggml_set_name(out, "out"); + + return out; +} + + +extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs, + ggml_tensor* padding_mask +) { + ggml_context* ctx = model.ctx; + auto norm_order = model.layer_config.at(prefix + ".norm_order"); + + // _forward_self_attn(seqs, padding_mask) + auto residual = seqs; + if (norm_order != TRANSFORMER_NORM_ORDER_POST) + seqs = LayerNorm_forward(model, prefix + ".self_attn_layer_norm", seqs); + + // TODO: add padding_mask to MultiheadAttention_forward + GGML_ASSERT(padding_mask == nullptr); + seqs = MultiheadAttention_forward( + model, + prefix + ".self_attn", + seqs, + seqs, + seqs, + /*attn_mask=*/nullptr + ); + + if (has_layer(model, prefix + 
".self_attn_norm")) + seqs = LayerNorm_forward(model, prefix + ".self_attn_norm", seqs); + + seqs = ggml_add_inplace(ctx, seqs, residual); + + if (norm_order == TRANSFORMER_NORM_ORDER_POST) + seqs = LayerNorm_forward(model, prefix + ".self_attn_layer_norm", seqs); + + // _forward_ffn(seqs) + residual = seqs; + + if (norm_order != TRANSFORMER_NORM_ORDER_POST) + seqs = LayerNorm_forward(model, prefix + ".ffn_layer_norm", seqs); + + seqs = StandardFeedForwardNetwork_forward(model, prefix + ".ffn", seqs); + + // TODO: if self.residual_scale is not None: + // residual = self.residual_scale * residual + + seqs = ggml_add_inplace(ctx, seqs, residual); + + if (norm_order == TRANSFORMER_NORM_ORDER_POST) + seqs = LayerNorm_forward(model, prefix + ".ffn_layer_norm", seqs); + + return seqs; +} + +extern "C" ggml_tensor* WaveformToFbank_forward( + fairseq2_model& model, + const std::string &prefix, + ggml_tensor* waveform +) { + // Hardcoding: num_bins 80, sample rate 16k, always standardize + ggml_context* ctx = model.ctx; + knf::MelBanksOptions mel_opts{}; + mel_opts.num_bins = 80; + + knf::FrameExtractionOptions frame_opts{}; + frame_opts.samp_freq = 16000; + + knf::FbankOptions opts{}; + opts.frame_opts = frame_opts; + opts.mel_opts = mel_opts; + + + std::vector signal_frame{}; + std::int32_t num_frames = knf::NumFrames(/*num_samples=*/waveform->ne[0], frame_opts); + FORCE_ALLOC(output, ctx, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 80, num_frames)); + knf::FbankComputer native_(opts); + knf::FeatureWindowFunction window_fn_(native_.GetFrameOptions()); + + for (std::int32_t frame_nr = 0; frame_nr < num_frames; ++frame_nr) { + signal_frame.resize(0); + + // Extract the frame from the waveform tensor. + knf::ExtractWindow( + /*sample_offset=*/0, + (float *)(waveform->data), + waveform->ne[0], + frame_nr, + frame_opts, + window_fn_, + &signal_frame); + + native_.Compute( + /*signal_raw_log_energy=*/0, /*vtln_warp=*/1.0, &signal_frame, ((float *)(output->data) + frame_nr * 80)); + } + output = ggml_dup(ctx, ggml_transpose(ctx, output)); + output = ggml_norm(ctx, output, 1e-5); + output = ggml_dup(ctx, ggml_transpose(ctx, output)); + if (output->ne[1] % 2 == 1) { + ggml_tensor* remove_last = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, output->ne[1]-1); + for (int i = 0; i < output->ne[1]-1; ++i) { + ((int32_t *) remove_last->data)[i] = i; + } + output = ggml_get_rows(ctx, output, remove_last); + } + output = ggml_reshape_2d(ctx, output, output->ne[0] * 2, output->ne[1] / 2); + return output; +} + +// TODO: Check if it's possible to merge with standard MHA +extern "C" ggml_tensor* RelativePositionMHA_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs +) { + ggml_context* ctx = model.ctx; + + ggml_tensor* residual = seqs; + seqs = LayerNorm_forward(model, prefix + "_layer_norm", seqs); + // self_attn: qkv + ggml_tensor* Qcur = Linear_forward(model, prefix + ".q_proj", seqs); + ggml_tensor* Kcur = Linear_forward(model, prefix + ".k_proj", seqs); + ggml_tensor* Vcur = Linear_forward(model, prefix + ".v_proj", seqs); + + // self_attn: rel_pos SDPA + int32_t S = seqs->ne[1]; + int32_t H = 16; // TODO: Make this configurable + int32_t n_ctx = 4096; + int32_t K_h = seqs->ne[0] / H; + + int32_t start_index = n_ctx - S; + int32_t end_index = n_ctx + S - 1; + + int num_indices = end_index - start_index; + + FORCE_ALLOC(rows, ctx, ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices)); + for (int i = 0; i < num_indices; i++) { + ((int32_t *)rows->data)[i] = start_index + i; + } + + // 
self_attn: load pos_enc weights & compute_r + // In fairseq2 pos_enc weights are calculated on the fly, since some more custom operators might be needed to enable this, + // we store the results (fixed) in checkpoint as model.audio_enc_pos_enc_w and load directly. + ggml_tensor* r = ggml_get_rows(ctx, model.tensors["speech_encoder.pos_enc"], rows); + r = mul_mat(ctx, model.tensors[prefix + ".sdpa.r_proj.weight"], r); + r = ggml_dup(ctx, ggml_permute(ctx, + ggml_cpy(ctx, + r, + ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S*2-1)), + 0, 2, 1, 3)); + + ggml_tensor* u_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.u_bias"], K_h, 1, H); + ggml_tensor* v_bias = ggml_reshape_3d(ctx, model.tensors[prefix + ".sdpa.v_bias"], K_h, 1, H); + + // self_attn: Permute QKV + + ggml_tensor* Q = ggml_cont(ctx, ggml_permute(ctx, + ggml_cpy(ctx, + Qcur, + ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)), + 0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H) + ggml_tensor* K = ggml_cont(ctx, ggml_permute(ctx, + ggml_cpy(ctx, + Kcur, + ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)), + 0, 2, 1, 3)); // (H * K_h, S) -> (K_h, H, S) -> (K_h, S, H) + ggml_tensor* V = ggml_cont(ctx, ggml_permute(ctx, + ggml_cpy(ctx, + Vcur, + ggml_new_tensor_3d(ctx, GGML_TYPE_F32, K_h, H, S)), + 1, 2, 0, 3)); // (H * K_h, S) -> (K_h, H, S) -> (H, S, K_h) + + + ggml_tensor* q_with_u_bias = ggml_add_inplace(ctx, ggml_dup(ctx, Q), u_bias); // (K_h, S, H) + ggml_tensor* q_with_v_bias = ggml_add_inplace(ctx, Q, v_bias); // (K_h, S, H) + + ggml_tensor* ac = mul_mat(ctx, K, q_with_u_bias); + ggml_tensor* bd = mul_mat(ctx, r, q_with_v_bias); + + + // self_attn: shift_bd. Logic follows https://github.com/facebookresearch/fairseq2/blob/main/src/fairseq2/nn/transformer/relative_attention.py#L161 + bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // H, S, 2S-1 + + FORCE_ALLOC(pad, ctx, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, H, S, 1)); + pad = ggml_set_f32(pad, 0.0); + + bd = ggml_concat(ctx, pad, bd); // bd[i][j][0] == 0, (H, S, 2S) + bd = ggml_dup(ctx, ggml_permute(ctx, bd, 2, 1, 0, 3)); // (2S, S, H) + bd = ggml_reshape_3d(ctx, bd, S, 2 * S, H); // (S, 2S, H) + // discard the first set of positive positions + bd = ggml_dup(ctx, ggml_slice(ctx, bd, 1, 1, 2 * S)); + // shifts each row by an extra step + bd = ggml_reshape_3d(ctx, bd, 2 * S - 1, S, H); + // Discard positions used for shift. 
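+ // bd is (2S-1, S, H) at this point; keeping only the first S entries along dim 0 leaves an (S, S, H) tensor with one relative-position score per key position.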
+ bd = ggml_slice(ctx, bd, 0, 0, S); + + // self_attn: compute attn / weights + ggml_tensor* attn_weights = ggml_add_inplace(ctx, ac, bd); + FORCE_ALLOC(attn_scale, ctx, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1)); + ggml_set_f32(attn_scale, 1.0 / pow(K_h, 0.5)); + attn_weights = ggml_mul_inplace(ctx, attn_weights, ggml_repeat(ctx, attn_scale, attn_weights)); + attn_weights = ggml_soft_max(ctx, attn_weights); + + ggml_tensor* attn = mul_mat(ctx, V, attn_weights); // K_h, S, H + attn = ggml_dup(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); + ggml_tensor* attn_2d = ggml_reshape_2d(ctx, attn, K_h * H, S); + + ggml_tensor* attn_out = mul_mat(ctx, model.tensors[prefix + ".output_proj.weight"], attn_2d); + attn_out = ggml_add_inplace( + ctx, + attn_out, + ggml_repeat(ctx, model.tensors[prefix + ".output_proj.bias"], attn_out) + ); + attn_out = ggml_add_inplace(ctx, attn_out, residual); + return attn_out; +} + +extern "C" ggml_tensor* ConvModule_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs +) { + ggml_context* ctx = model.ctx; + ggml_tensor* residual = seqs; + seqs = LayerNorm_forward(model, prefix + "_layer_norm", seqs); + // conv: Use matmul for pointwise conv 1 - kernel_size=1, no padding case + seqs = mul_mat(ctx, model.tensors[prefix + ".pointwise_conv1.weight"], seqs); + + // conv: GLU + seqs = ggml_glu(ctx, seqs); + seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3)); + + // S x C -> (S+K-1) x C -> K x S x C -> S x C + seqs = ggml_conv_1d(ctx, model.tensors[prefix + ".depthwise_conv.weight"], seqs, 1, 15, 1); + + // conv: Custom implementation of batch norm + seqs = ggml_batch_norm(ctx, seqs, model.tensors[prefix + ".batch_norm.weight"], model.tensors[prefix + ".batch_norm.bias"], model.tensors[prefix + ".batch_norm.running_mean"], model.tensors[prefix + ".batch_norm.running_var"], 1e-5); + + // conv: SiLU actvation + seqs = ggml_silu_inplace(ctx, seqs); + seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3)); + + // conv: Use matmul for pointwise conv 2 - kernel_size=1, no padding case + seqs = mul_mat(ctx, model.tensors[prefix + ".pointwise_conv2.weight"], seqs); + + // conv: + residual + seqs = ggml_add_inplace(ctx, seqs, residual); + return seqs; +} + +extern "C" ggml_tensor* StandardConformerEncoderLayer_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs, + ggml_tensor* padding_mask +) { + ggml_context* ctx = model.ctx; + FORCE_ALLOC(ffn_scale, ctx, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1)); + ggml_set_f32(ffn_scale, 0.5f); + ggml_tensor* residual = seqs; + seqs = LayerNorm_forward(model, prefix + ".ffn1_layer_norm", seqs); + seqs = SiluFeedForwardNetwork_forward(model, prefix + ".ffn1", seqs); + seqs = ggml_mul_inplace(ctx, seqs, ggml_repeat(ctx, ffn_scale, seqs)); + seqs = ggml_add_inplace(ctx, seqs, residual); + seqs = RelativePositionMHA_forward(model, prefix + ".self_attn", seqs); + seqs = ConvModule_forward(model, prefix + ".conv", seqs); + residual = seqs; + seqs = LayerNorm_forward(model, prefix + ".ffn2_layer_norm", seqs); + seqs = SiluFeedForwardNetwork_forward(model, prefix + ".ffn2", seqs); + seqs = ggml_mul_inplace(ctx, seqs, ggml_repeat(ctx, ffn_scale, seqs)); + seqs = ggml_add_inplace(ctx, seqs, residual); + seqs = LayerNorm_forward(model, prefix + ".layer_norm", seqs); + return seqs; +} + +extern "C" ggml_tensor* StandardConformerEncoder_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs, + ggml_tensor* padding_mask +) { + ggml_context* ctx = model.ctx; + 
seqs = WaveformToFbank_forward(model, prefix, seqs); + seqs = LayerNorm_forward(model, prefix + "_frontend.post_extract_layer_norm", seqs); + seqs = Linear_forward(model, prefix + "_frontend.model_dim_proj", seqs); + int layer_idx = 0; + + std::string layer_name = prefix + ".inner.layers." + std::to_string(layer_idx); + + while (has_layer(model, layer_name)) { + seqs = StandardConformerEncoderLayer_forward( + model, layer_name, seqs, padding_mask + ); + ggml_set_name(seqs, ("x_enc_" + std::to_string(layer_idx)).c_str()); + layer_idx += 1; + layer_name = prefix + ".inner.layers." + std::to_string(layer_idx); + } + + seqs = LayerNorm_forward(model, prefix + ".inner_layer_norm", seqs); + ggml_tensor* residual = seqs; + seqs = Linear_forward(model, prefix + ".proj1", seqs); + seqs = ggml_relu_inplace(ctx, seqs); + seqs = Linear_forward(model, prefix + ".proj2", seqs); + FORCE_ALLOC(ffn_scale, ctx, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, 1)); + ggml_set_f32(ffn_scale, 0.5f); + seqs = ggml_mul(ctx, ggml_repeat(ctx, ffn_scale, seqs), seqs); + seqs = ggml_add_inplace(ctx, seqs, residual); + layer_idx = 0; + layer_name = prefix + ".adaptor_layers." + std::to_string(layer_idx); + while (has_layer(model, layer_name)) { + seqs = StandardConformerEncoderAdaptorLayer_forward( + model, layer_name, seqs, padding_mask + ); + ggml_set_name(seqs, ("x_ada_" + std::to_string(layer_idx)).c_str()); + layer_idx += 1; + layer_name = prefix + ".adaptor_layers." + std::to_string(layer_idx); + } + seqs = LayerNorm_forward(model, prefix + ".layer_norm", seqs); + + return seqs; +} + +extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs, + ggml_tensor* padding_mask +) { + ggml_context* ctx = model.ctx; + ggml_tensor* residual = seqs; + residual = LayerNorm_forward(model, prefix + ".residual_layer_norm", residual); + residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3)); + residual = ggml_conv_1d_generic(ctx, model.tensors[prefix + ".residual_conv.weight"], residual, 8, 4, 1); + residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3)); + residual = ggml_add_inplace(ctx, ggml_repeat(ctx, model.tensors[prefix + ".residual_conv.bias"], residual), residual); + residual = ggml_glu(ctx, residual); + + seqs = LayerNorm_forward(model, prefix + ".self_attn_layer_norm", seqs); + seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3)); + seqs = ggml_conv_1d_generic(ctx, model.tensors[prefix + ".self_attn_conv.weight"], seqs, 8, 4, 1); + seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3)); + seqs = ggml_add_inplace(ctx, seqs, ggml_repeat(ctx, model.tensors[prefix + ".self_attn_conv.bias"], seqs)); + seqs = ggml_glu(ctx, seqs); + + seqs = MultiheadAttention_forward( + model, + prefix + ".self_attn", + seqs, + seqs, + seqs, + /*attention masks=*/nullptr + ); + seqs = ggml_add_inplace(ctx, seqs, residual); + residual = seqs; + seqs = LayerNorm_forward(model, prefix + ".ffn_layer_norm", seqs); + seqs = StandardFeedForwardNetwork_forward(model, prefix + ".ffn", seqs); + seqs = ggml_add_inplace(ctx, seqs, residual); + return seqs; +} + + +/// ggml_slice(X, -1, start, end) is equivalent to X[start:end] +/// ggml_slice(X, 0, start, end) is equivalent to X[..., start:end] +ggml_tensor* ggml_slice( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis, + int64_t start, + int64_t end +) { + int64_t ne[4]; + std::copy(a->ne, a->ne + 4, ne); + if (axis < 0) axis = a->n_dims + axis; + if (start < 0) start = 
ne[axis] + start; + if (end <= 0) end = ne[axis] + end; + GGML_ASSERT(0 <= start); + GGML_ASSERT(start < end); + GGML_ASSERT(end <= ne[axis]); + + + ne[axis] = end - start; + size_t offset = a->nb[axis] * start; + + size_t* nb = a->nb; + ggml_tensor* result = ggml_view_4d(ctx, a, ne[0], ne[1], ne[2], ne[3], nb[1], nb[2], nb[3], offset); + ggml_format_name(result, "%s [(%d)%ld:%ld]", a->name, axis, start, end); + result->n_dims = a->n_dims; + return result; +} + +ggml_tensor* ggml_select( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis, + int64_t index +) { + int64_t ne[GGML_MAX_DIMS]; + std::copy(a->ne, a->ne + GGML_MAX_DIMS, ne); + + if (axis < 0) axis = a->n_dims + axis; + if (index < 0) index = ne[axis] + index; + GGML_ASSERT(0 <= index); + GGML_ASSERT(index < ne[axis]); + + std::copy(a->ne + axis + 1, a->ne + GGML_MAX_DIMS, ne + axis); + + size_t offset = a->nb[axis] * index; + size_t* nb = a->nb; + GGML_ASSERT(GGML_MAX_DIMS == 4); + ggml_tensor* result = ggml_view_3d(ctx, a, ne[0], ne[1], ne[2], nb[1], nb[2], offset); + ggml_format_name(result, "%s [(%d)%ld]", a->name, axis, index); + result->n_dims = a->n_dims - 1; + return result; +} + + +// Inplace computation of PositionalEmbedding +extern "C" ggml_tensor* PositionalEmbedding_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* embeds +) { + // This only work with the simple pos encoders + int seq_len = embeds->ne[1]; + ggml_tensor* full_pos_embeds = model.tensors[prefix]; + + int start_step = 0; + if (has_kv_cache(model)) { + start_step = model.kv_cache[prefix].step_nr++; + } + ggml_tensor* pos_embeds = ggml_slice(model.ctx, full_pos_embeds, /*axis*/1, start_step, seq_len + start_step); + return ggml_add(model.ctx, embeds, pos_embeds); +} + +extern "C" ggml_tensor* TransformerEmbeddingFrontend_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs +) { + GGML_ASSERT(seqs->n_dims < GGML_MAX_DIMS); + ggml_context* ctx = model.ctx; + ggml_tensor* embed_weights = model.tensors[prefix + ".embed.weight"]; + GGML_ASSERT(embed_weights != nullptr); + ggml_tensor* embeds; + if (seqs->n_dims == 1) { + embeds = ggml_get_rows(ctx, embed_weights, seqs); + } else { + // ggml_get_rows isn't very flexible, we have to handle the reshape ourselves. + ggml_tensor* flat_seqs = seqs; + if (!ggml_is_contiguous(seqs)) { + flat_seqs = ggml_cont(ctx, flat_seqs); + } + flat_seqs = ggml_reshape_1d(ctx, flat_seqs, ggml_nelements(seqs)); + embeds = ggml_get_rows(ctx, embed_weights, flat_seqs); + embeds = ggml_reshape_4d(ctx, embeds, embed_weights->ne[0], seqs->ne[0], seqs->ne[1], seqs->ne[2]); + embeds->n_dims = seqs->n_dims + 1; + } + + // padding mask ? + // padding_mask = to_padding_mask(embeds, seq_lens) + + if (has_layer(model, prefix + ".pos_encoder")) { + embeds = PositionalEmbedding_forward(model, prefix + ".pos_encoder", embeds); + } + + if (has_layer(model, prefix + ".layer_norm")) { + embeds = LayerNorm_forward(model, prefix + ".layer_norm", embeds); + } + + return embeds; +} + +extern "C" ggml_tensor* StandardTransformerEncoder_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs, + ggml_tensor* padding_mask +) { + int layer_idx = 0; + std::string layer_name = prefix + ".layers." 
+ std::to_string(layer_idx); + while (has_layer(model, layer_name)) { + seqs = StandardTransformerEncoderLayer_forward( + model, layer_name, seqs, padding_mask + ); + + ggml_set_name(seqs, ("x_enc_" + std::to_string(layer_idx)).c_str()); + layer_idx += 1; + layer_name = prefix + ".layers." + std::to_string(layer_idx); + } + + if (has_layer(model, prefix + ".layer_norm")) + seqs = LayerNorm_forward(model, prefix + ".layer_norm", seqs); + + return seqs; +} + +extern "C" ggml_tensor* StandardTransformerDecoderLayer_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs, + ggml_tensor* self_attn_mask, + ggml_tensor* encoder_output, + ggml_tensor* encoder_padding_mask +) { + ggml_context* ctx = model.ctx; + auto norm_order = model.layer_config.at(prefix + ".norm_order"); + + // _forward_self_attn(seqs, padding_mask) + auto residual = seqs; + if (norm_order != TRANSFORMER_NORM_ORDER_POST) + seqs = LayerNorm_forward(model, prefix + ".self_attn_layer_norm", seqs); + + seqs = MultiheadAttention_forward( + model, + prefix + ".self_attn", + seqs, + seqs, + seqs, + /*attn_mask=*/self_attn_mask + ); + + if (has_layer(model, prefix + ".self_attn_norm")) + seqs = LayerNorm_forward(model, prefix + ".self_attn_norm", seqs); + + seqs = ggml_add_inplace(ctx, seqs, residual); + + if (norm_order == TRANSFORMER_NORM_ORDER_POST) + seqs = LayerNorm_forward(model, prefix + ".self_attn_layer_norm", seqs); + + // _forward_encoder_decoder_attn + if (! has_layer(model, prefix + ".encoder_decoder_attn")) { + // `encoder_output` must be `None` for decoder-only attention. + GGML_ASSERT(encoder_output == nullptr); + return seqs; + } + + // `encoder_output` must not be `None` for encoder-decoder attention. + GGML_ASSERT(encoder_output != nullptr); + + residual = seqs; + + if (norm_order != TRANSFORMER_NORM_ORDER_POST) + seqs = LayerNorm_forward(model, prefix + ".encoder_decoder_attn_layer_norm", seqs); + + + seqs = MultiheadAttention_forward( + model, + prefix + ".encoder_decoder_attn", + seqs, + encoder_output, + encoder_output, + /*attention masks=*/encoder_padding_mask + ); + + seqs = ggml_add_inplace(ctx, seqs, residual); + + if (norm_order == TRANSFORMER_NORM_ORDER_POST) + seqs = LayerNorm_forward(model, prefix + ".encoder_decoder_attn_layer_norm", seqs); + + // _forward_ffn(seqs) + residual = seqs; + + if (norm_order != TRANSFORMER_NORM_ORDER_POST) + seqs = LayerNorm_forward(model, prefix + ".ffn_layer_norm", seqs); + + seqs = StandardFeedForwardNetwork_forward(model, prefix + ".ffn", seqs); + + // TODO: + // if self.residual_scale is not None: + // residual = self.residual_scale * residual + + seqs = ggml_add_inplace(ctx, seqs, residual); + + if (norm_order == TRANSFORMER_NORM_ORDER_POST) + seqs = LayerNorm_forward(model, prefix + ".ffn_layer_norm", seqs); + + return seqs; +} + +extern "C" ggml_tensor* causal_attention_mask(ggml_context* ctx, ggml_tensor* seqs) { + auto seq_len = seqs->ne[1]; + // TODO: allow other ggml_type + ggml_tensor* mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, seq_len, seq_len); + return ggml_diag_mask_inf(ctx, mask, 0); +} + +extern "C" ggml_tensor* StandardTransformerDecoder_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs, + ggml_tensor* padding_mask, + ggml_tensor* encoder_output, + ggml_tensor* encoder_padding_mask +) { + int layer_idx = 0; + std::string layer_name = prefix + ".layers." 
+ std::to_string(layer_idx); + ggml_tensor* self_attn_mask = causal_attention_mask(model.ctx, seqs); + while (has_layer(model, layer_name)) { + seqs = StandardTransformerDecoderLayer_forward( + model, layer_name, seqs, self_attn_mask, encoder_output, encoder_padding_mask + ); + + ggml_set_name(seqs, ("x_dec_" + std::to_string(layer_idx)).c_str()); + layer_idx += 1; + layer_name = prefix + ".layers." + std::to_string(layer_idx); + } + + if (has_layer(model, prefix + ".layer_norm")) + seqs = LayerNorm_forward(model, prefix + ".layer_norm", seqs); + + return seqs; +} + + +int _determine_max_seq_len(const SequenceGeneratorJob& job, int source_seq_len) { + auto opts = job.opts; + int max_seq_len = -1; + if (source_seq_len <= 0 || opts.soft_max_seq_len_a <= 0) { + max_seq_len = opts.hard_max_seq_len; + } else { + max_seq_len = std::min(opts.hard_max_seq_len, int(opts.soft_max_seq_len_a * source_seq_len) + opts.soft_max_seq_len_b); + } + + if (opts.min_seq_len > max_seq_len) { + printf( + "The effective maximum sequence length must be greater than or equal to `min_seq_len` (%d), but is %d instead. Adjust your soft and hard maximum sequence length limits.\n", + opts.min_seq_len, + max_seq_len + ); + GGML_ASSERT(opts.min_seq_len <= max_seq_len); + } + + int prefix_seq_len = job.prefix_seq->ne[0]; + if (prefix_seq_len >= max_seq_len) { + printf( + "The effective maximum sequence length must be greater than `prefix_seq_len` (%d), but is %d instead.\n", + prefix_seq_len, + max_seq_len + ); + GGML_ASSERT(prefix_seq_len < max_seq_len); + } + + return max_seq_len; +} + +void _fan_out_encoder_output( + ggml_context* ctx, + ggml_tensor** encoder_output_out, + ggml_tensor** encoder_padding_mask_out, + int beam_size +) { + // (S_enc, M) + ggml_tensor* encoder_output = *encoder_output_out; + ggml_tensor* encoder_padding_mask = *encoder_padding_mask_out; + + // (B, S_enc, M) + ggml_tensor* shape = ggml_new_tensor_3d(ctx, GGML_TYPE_I8, encoder_output->ne[0], encoder_output->ne[1], beam_size); + // (S_enc, M) -> (B, S_enc, M) + *encoder_output_out = ggml_repeat(ctx, encoder_output, shape); + // (S_enc) -> (B, S_enc) + if (encoder_padding_mask != nullptr) { + ggml_tensor* shape_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_I8, encoder_padding_mask->ne[0], 1, beam_size); + *encoder_padding_mask_out = ggml_repeat(ctx, encoder_padding_mask, shape_mask); + } +} + +ggml_tensor* ggml_log_softmax(ggml_context* ctx, ggml_tensor* logits) { + // TODO: this isn't the most precise way of doing this + return ggml_log_inplace(ctx, ggml_soft_max_inplace(ctx, logits)); +} + +ggml_tensor* ggml_expand_2d(ggml_context* ctx, ggml_tensor* x, int64_t ne0, int64_t ne1) { + ggml_tensor* shape = ggml_new_tensor_2d(ctx, GGML_TYPE_I8, ne0, ne1); + ggml_type true_type = x->type; + ggml_tensor* y = ggml_repeat(ctx, x, shape); + y->type = true_type; + return y; +} + +extern "C" void _bootstrap_seqs_and_scores( + fairseq2_model& model, + const SequenceGeneratorJob& job, + ggml_tensor* full_seqs, + ggml_tensor* scores, + ggml_tensor* encoder_output, + ggml_tensor* encoder_padding_mask, + int n_threads +) { + int prefix_seq_len = job.prefix_seq->ne[0]; + int max_seq_len = scores->ne[0]; + int beam_size = scores->ne[1]; + GGML_ASSERT(prefix_seq_len > 0); + if (prefix_seq_len == 1) + return; + + ggml_context* ctx = model.ctx; + + // full_seqs[:, : prefix_seq_len] = job.prefix_seq; + ggml_tensor* seqs = ggml_slice(ctx, full_seqs, 0, 0, prefix_seq_len); + seqs = ggml_cpy(ctx, ggml_repeat(ctx, job.prefix_seq, seqs), seqs); + + // We have to bootstrap the 
model with the already fanned-out encoder + // output to correctly initialize its incremental state. + // Note: we don't start decoding the last prefix token just yet. + seqs = ggml_slice(ctx, seqs, 0, 0, prefix_seq_len - 1); + + // Bootstrap the model state with prefix sequence. + seqs = TransformerEmbeddingFrontend_forward(model, "text_decoder_frontend", seqs); + ggml_tensor* decoder_output = StandardTransformerDecoder_forward( + model, + "text_decoder", + seqs, + /*padding_mask*/ nullptr, + encoder_output, + encoder_padding_mask + ); + + // logits, lprobs: (N, S_pfx - 1, V) + ggml_tensor* logits = Linear_forward(model, "final_proj", decoder_output); + int vocab_size = logits->ne[0]; + ggml_tensor* lprobs = ggml_log_softmax(ctx, ggml_slice(ctx, logits, 1, 0, 1)); + + ggml_cgraph gf = ggml_build_forward(lprobs); + ggml_graph_compute_with_ctx(ctx, &gf, n_threads); + + // Fetch scores of next steps from "lprobs" + float p_score = 0; + for (int i = 1; i < prefix_seq_len; ++i) { + int p = ggml_get_i32_1d(job.prefix_seq, i); + p_score += ggml_get_f32_1d(lprobs, i * vocab_size + p); + for (int b = 0; b < beam_size; ++b) { + // scores: (N, S) + // Note: First step (e.g. BOS)'s score is always 0. + ggml_set_f32_1d(scores, b * max_seq_len + i, p_score); + } + } +} + +/// Finds the topk indices, and write the winning indices in "candidate_indices" array. +int topk( + ggml_tensor* lprobs, // (B, V) + std::int64_t k, + ggml_tensor* candidate_indices +) { + // Take the best 2 x `beam_size` predictions. We'll choose the first + // `beam_size` of these which don't predict EOS to continue with. + // (N, 2 x B) + // `vocab_size` - 1 to never select PAD. + std::int64_t K = std::min(k, ggml_nelements(lprobs)); + auto comp = [lprobs](std::int32_t a, std::int32_t b) { + return ggml_get_f32_1d(lprobs, a) > ggml_get_f32_1d(lprobs, b); + }; + GGML_ASSERT(ggml_nelements(candidate_indices) >= k); + auto cand = (std::int32_t*)candidate_indices->data; + std::partial_sort(cand, cand + K, cand + ggml_nelements(lprobs), comp); + + return K; +} + +void _tweak_lprobs(const SequenceGeneratorJob& job, ggml_tensor* lprobs, int step_nr, int max_seq_len, std::size_t vocab_size) { + std::size_t beam_size = job.opts.beam_size; + std::size_t eos_idx = job.eos_idx; + + // Do not allow EOS before reaching the minimum sequence length. + if (step_nr < job.opts.min_seq_len) { + // lprobs[:, :, self.eos_idx] = -INFINITY; + for (size_t i = 0; i < beam_size; ++i) + ggml_set_f32_1d(lprobs, vocab_size * i + eos_idx, -INFINITY); + } + + // If we have reached the maximum length, force the last step to be EOS. + if (step_nr == max_seq_len - 2) { + // lprobs[:, :, : self.eos_idx] = -torch.inf + // lprobs[:, :, self.eos_idx + 1 :] = -torch.inf + for (size_t b = 0; b < beam_size; ++b) { + size_t t = 0; + for (t = 0; t < eos_idx; ++t) + ggml_set_f32_1d(lprobs, vocab_size * b + t, -INFINITY); + for (t = eos_idx + 1; t < vocab_size; ++t) + ggml_set_f32_1d(lprobs, vocab_size * b + t, -INFINITY); + } + } + + // Never allow PAD. + std::size_t pad_idx = job.pad_idx; + for (size_t i = 0; i < beam_size; ++i) + ggml_set_f32_1d(lprobs, vocab_size * i + pad_idx, -INFINITY); + + // Apply UNK penalty. + if (job.unk_idx >= 0 && job.opts.unk_penalty != 0) { + // lprobs[:, :, self.unk_idx] -= self.opts.unk_penalty + auto lprobs_raw = ggml_get_data_f32(lprobs); + for (size_t i = 0; i < beam_size; ++i) + lprobs_raw[vocab_size * i + job.unk_idx] -= job.opts.unk_penalty; + } +} + + + +/// Copies the sequence and scores of a given candidate beam. 
+void _finalize_hypothesis( + const SequenceGeneratorJob& job, + ggml_context* ctx, + int step_nr, + std::int32_t beam, + std::int32_t token, + float eos_score, + ggml_tensor* seqs, // (beam_size, seq_len) + ggml_tensor* scores, // (beam_size, seq_len) + Hypothesis* hypothesis +) { + ggml_tensor* seq = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, step_nr + 2); + hypothesis->seq = seq; + ggml_tensor* step_scores = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, step_nr + 2); + hypothesis->step_scores = step_scores; + + auto tok = (std::int32_t*)seq->data; + for (int i = 0; i < step_nr + 1; ++i) { + tok[i] = ggml_get_i32_1d(seqs, seqs->ne[0] * beam + i); + } + tok[step_nr + 1] = token; + + // Convert from cumulative to per-step scores. + auto sc = (float*)step_scores->data; + float last_score = eos_score; + for (int i = step_nr; i >= 0; --i) { + float sc0 = ggml_get_f32_1d(scores, scores->ne[0] * beam + i); + sc[i + 1] = last_score - sc0; + last_score = sc0; + } + sc[0] = 0; + + if (job.opts.normalize_scores) + // Skip first EOS since it is always 0 and skews normalization. + eos_score /= (float)std::pow((step_nr + 1), job.opts.len_penalty); + hypothesis->score = eos_score; +} + +// Uses ggml_context to store any object. +#define GGML_CTX_ALLOC(ctx, Type, n) \ + (Type*)(ggml_new_tensor_1d(ctx, GGML_TYPE_I8, sizeof(Type) * n)->data); + + +ggml_context* ctx_from_buffer(std::vector& buffer) { + return ggml_init({ + /*.mem_size =*/ static_cast(buffer.capacity()), + /*.mem_buffer =*/ buffer.data(), + /*.no_alloc =*/ false, + }); +} + +ggml_allocr* new_arena_allocr(std::vector& buffer) { + return ggml_allocr_new(buffer.data(), buffer.capacity(), 8); +} + + + +/// Generates a translation for a single sequence +/// The results Hypothesis are written inside `result_ctx`. +extern "C" Hypothesis* generate_sequence( + fairseq2_model& model, + const SequenceGeneratorJob& job, + ggml_tensor* encoder_output, + ggml_tensor* encoder_padding_mask, + ggml_context* result_ctx, + int n_threads +) { + // Pre allocate memory buffers. + // * step_ctx: contains metadata for the model graph, as well as some explicit + // buffers for the lprobs tweaking. + // * prev_step_ctx: is an additional buffer because we need some results from previous steps, + // to compute next step. Notably self attention kv cache. + // * search_ctx contains tensors that should live for the full search, + // like encoder kv cache. + // * step_alloc contains buffer for the forward pass of the model. + // TODO: the size allocated should depend on the input length and vocab size + std::vector local_bufs[5] = { + std::vector(128 * 1024 * 1024), // step_ctx + std::vector(128 * 1024 * 1024), // prev_step_ctx + std::vector(256 * 1024 * 1024), // search_ctx + std::vector(256 * 1024 * 1024), // step_alloc + }; + ggml_allocr* step_alloc = new_arena_allocr(local_bufs[3]); + + ggml_tensor* embed = model.tensors["text_decoder_frontend.embed.weight"]; + size_t vocab_size = embed->ne[1]; + std::size_t beam_size = job.opts.beam_size; + ggml_detach(encoder_output); + int source_seq_len = encoder_output->ne[1]; + int max_seq_len = _determine_max_seq_len(job, source_seq_len); + + ggml_context* search_ctx = ctx_from_buffer(local_bufs[2]); + ggml_context* original_ctx = model.ctx; + fairseq2_kv_cache_alloc(model, search_ctx, beam_size, max_seq_len); + + // (S_enc, M) -> (B, S_enc, M) + model.ctx = search_ctx; + _fan_out_encoder_output(search_ctx, &encoder_output, &encoder_padding_mask, beam_size); + + // Allocate results in the context provided by the caller. 
+ ggml_set_no_alloc(result_ctx, false); + Hypothesis* finished_searches_begin = GGML_CTX_ALLOC(result_ctx, Hypothesis, beam_size); + Hypothesis* finished_searches = finished_searches_begin; + for (std::size_t i = 0; i < beam_size; ++i) finished_searches[i] = {nullptr, -INFINITY, nullptr}; + Hypothesis* finished_searches_end = finished_searches + beam_size; + + // Initialize buffers. (B, S) + ggml_tensor* seqs = ggml_new_tensor_2d(search_ctx, GGML_TYPE_I32, max_seq_len, beam_size); + ggml_set_i32(seqs, 0); + ggml_set_name(seqs, "seqs_0"); + ggml_tensor* scores = ggml_new_tensor_2d(search_ctx, GGML_TYPE_F32, max_seq_len, beam_size); + ggml_set_name(scores, "scores_0"); + ggml_set_f32(scores, 0.0); + + int prefix_seq_len = job.prefix_seq->ne[0]; + int start_step = prefix_seq_len - 1; + + ggml_context* prev_step_ctx = ctx_from_buffer(local_bufs[(start_step - 1) % 2]); + ggml_context* step_ctx = ctx_from_buffer(local_bufs[start_step % 2]); + GGML_ASSERT(step_ctx != search_ctx); + GGML_ASSERT(prev_step_ctx != step_ctx); + model.ctx = prev_step_ctx; + // search_ctx because we need encoder_decoder_attn.k_cache to survive for the full search + model.kv_cache_ctx = search_ctx; + _bootstrap_seqs_and_scores( + model, job, seqs, scores, encoder_output, encoder_padding_mask, n_threads + ); + + // Holds the indices of beams (a beam can occur more than once) that we + // should continue with in the next step. + ggml_tensor* beam_indices = ggml_new_tensor_1d(search_ctx, GGML_TYPE_I32, beam_size); + ggml_tensor* next_tokens = ggml_new_tensor_1d(search_ctx, GGML_TYPE_I32, beam_size); + ggml_tensor* next_scores = ggml_new_tensor_1d(search_ctx, GGML_TYPE_F32, beam_size); + + // Array with integers up to 'vocab_size * beam_size' to represent next beams to explore + ggml_tensor* candidate_indices = ggml_new_tensor_1d(search_ctx, GGML_TYPE_I32, vocab_size * beam_size); + for (std::size_t i = 0; i < vocab_size * beam_size; ++i) + ((int32_t *)(candidate_indices->data))[i] = i; + + printf_mem_usage(search_ctx, "search_ctx"); + + for (int step_nr = start_step; step_nr < max_seq_len - 1; ++step_nr) { + model.ctx = step_ctx; + ggml_set_no_alloc(step_ctx, true); // Use allocr for the model forward pass + ggml_tensor* prev_token = ggml_slice(step_ctx, seqs, 0, step_nr, step_nr + 1); + + ggml_tensor* decoder_input = TransformerEmbeddingFrontend_forward(model, "text_decoder_frontend", prev_token); + ggml_tensor* decoder_output = StandardTransformerDecoder_forward( + model, + "text_decoder", + decoder_input, + nullptr, // We never generate PAD. + encoder_output, + encoder_padding_mask + ); // (B, 1, D) + + decoder_output = ggml_flatten_1d(step_ctx, decoder_output, 0); // (B, model_dim) + // Force logits to be allocated in step_ctx, not in step_alloc. + ggml_set_no_alloc(step_ctx, false); + ggml_tensor* logits = Linear_forward(model, "final_proj", decoder_output); // (B, vocab_size) + ggml_tensor* lprobs = ggml_log_softmax(step_ctx, logits); + + // Compute lprobs here so we can modify it in place in the lprob tweaking phase + // TODO: use ggml properly compute the tweaks + ggml_cgraph gf = ggml_build_forward(lprobs); + size_t fwd_mem = ggml_allocr_alloc_graph(step_alloc, &gf); + GGML_UNUSED(fwd_mem); + ggml_graph_compute_with_ctx(step_ctx, &gf, n_threads); + ggml_detach(lprobs); + ggml_allocr_reset(step_alloc); +#if DEBUG_MEM_USAGE + printf("beam search step %d. 
Graph.n_nodes: %d.\n", step_nr, gf.n_nodes); + printf(" Fwd mem: %.1fMB\n", fwd_mem/1024.0/1024.0); + std::fill(local_bufs[3].begin(), local_bufs[3].end(), 0xAA); +#endif + _tweak_lprobs(job, lprobs, step_nr, max_seq_len, vocab_size); + + ggml_tensor* last_scores = ggml_slice(step_ctx, scores, 0, step_nr, step_nr+1); + if (step_nr == start_step) { + // At the initial step, all hypotheses are equally likely, so we use + // only the first beam. + lprobs = ggml_slice(step_ctx, lprobs, 1, 0, 1); + lprobs = ggml_cont(step_ctx, lprobs); + // The first step always indicates the beginning of the sequence and has no score. + if (step_nr > 0) { + last_scores = ggml_slice(step_ctx, last_scores, 1, 0, 1); + lprobs = ggml_add_inplace(step_ctx, lprobs, ggml_repeat(step_ctx, last_scores, lprobs)); + } + } else { + // Make probabilities contain cumulative scores for each hypothesis. + lprobs = ggml_add_inplace(step_ctx, lprobs, ggml_repeat(step_ctx, last_scores, lprobs)); + } + + gf = ggml_build_forward(lprobs); + ggml_graph_compute_with_ctx(step_ctx, &gf, n_threads); + + // Determine (beam, token) candidates for the next step. + // (N, 2 x B) + std::int64_t K = topk( + lprobs, std::min(2 * beam_size, vocab_size - 1), candidate_indices + ); + + std::size_t ongoing_beams = 0; + for (std::int32_t i = 0; i < K; ++i) { + int c = ggml_get_f32_1d(candidate_indices, i); + std::int32_t beam = c / vocab_size; + std::int32_t token = c % vocab_size; + float tok_score = ggml_get_f32_1d(lprobs, c); + + // Detect beams that reached the minimum length and that end with an EOS. + bool eos = token == job.eos_idx; + eos &= tok_score != -INFINITY; + if (eos) { + _finalize_hypothesis(job, result_ctx, step_nr, beam, token, tok_score, seqs, scores, finished_searches++); + if (finished_searches == finished_searches_end) + goto end_of_beam_search; + continue; + } + + ggml_set_f32_1d(beam_indices, ongoing_beams, beam); + ggml_set_f32_1d(next_tokens, ongoing_beams, token); + ggml_set_f32_1d(next_scores, ongoing_beams, tok_score); + ongoing_beams += 1; + if (ongoing_beams >= beam_size) break; + } + + // Reorder beams in the `seq` and `score` buffers. The same beam can + // be selected more than once. + // (B, S), (B) -> (B, S) + ggml_tensor* new_seqs = ggml_get_rows(step_ctx, seqs, beam_indices); + ggml_tensor* new_scores = ggml_get_rows(step_ctx, scores, beam_indices); + ggml_cgraph gf_reorder = ggml_build_forward(new_seqs); + ggml_build_forward_expand(&gf_reorder, new_scores); + reorder_kv_cache(model, step_ctx, &gf_reorder, beam_indices); + ggml_graph_compute_with_ctx(step_ctx, &gf_reorder, n_threads); + seqs = ggml_detach(new_seqs); + scores = ggml_detach(new_scores); + + // seqs[:, step_nr + 1] = next_tokens + // scores[:, step_nr + 1] = next_scores + for (std::size_t i = 0; i < beam_size; ++i) { + ((std::int32_t*)seqs->data)[step_nr + 1 + i * max_seq_len] = ggml_get_i32_1d(next_tokens, i); + ((float*)scores->data)[step_nr + 1 + i * max_seq_len] = ggml_get_f32_1d(next_scores, i); + } + + printf_mem_usage(step_ctx, "step_ctx"); + ggml_free(prev_step_ctx); + prev_step_ctx = step_ctx; +#if DEBUG_MEM_USAGE + std::fill(local_bufs[(step_nr + 1) % 2].begin(), local_bufs[(step_nr + 1) % 2].end(), 0xAA); +#endif + step_ctx = ctx_from_buffer(local_bufs[(step_nr + 1) % 2]); + } + +end_of_beam_search: + // Ensure that hypotheses are sorted by decreasing scores before returning. 
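+ // Slots never filled by _finalize_hypothesis keep their initial -INFINITY score and therefore sort to the end.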
+ std::sort( + finished_searches_begin, + finished_searches_end, + [](Hypothesis a, Hypothesis b) { return a.score > b.score; } + ); + + fairseq2_kv_cache_reset(model); + model.ctx = original_ctx; + return finished_searches_begin; +} + +extern "C" Hypothesis* _testing_return_hypothesis_ptr(ggml_context* ctx) { + Hypothesis* result = GGML_CTX_ALLOC(ctx, struct Hypothesis, 2); + + result[0] = {ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1), 3.14f, (ggml_tensor*)result}; + ggml_set_i32_1d(result[0].seq, 0, 314); + + result[1] = {ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1), 4.21f, nullptr}; + ggml_set_i32_1d(result[1].seq, 0, 421); + + return result; +} + +// SPM tokenizer +// original implementation: +// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 + + + +struct llm_symbol { + using index = int; + index prev; + index next; + const char * text; + size_t n; + llama_vocab::id id; +}; + +static_assert(std::is_trivially_copyable::value, "llm_symbol is not trivially copyable"); + +static size_t utf8_len(char src) { + const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; +} + +struct llm_bigram_spm { + struct comparator { + bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) { + return (l.score < r.score) || (l.score == r.score && l.left > r.left); + } + }; + using queue_storage = std::vector; + using queue = std::priority_queue; + llm_symbol::index left; + llm_symbol::index right; + float score; + size_t size; + llama_vocab::id id; +}; + +struct llm_tokenizer_spm { + llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {} + + void tokenize(const std::string& input_text, ggml_tensor& output) { + llama_vocab::id unk_idx = vocab.token_to_id.at(""); + + // split string into utf8 chars + int index = 0; + size_t offs = 0; + // This is kind of annoying, but needed because with SPM, + // characters following a space have a special meaning. + // And the algorithm rely on substrings to do the lookups. + std::string text = input_text; + bool need_extra_space = text.size() > 0 && text[0] != ' '; + if (need_extra_space) text = " " + text; + + while (offs < text.size()) { + size_t len = utf8_len(text[offs]); + size_t n = std::min(len, text.size() - offs); + + auto token = vocab.token_to_id.find(std::string(text, offs, n)); + llama_vocab::id id = token == vocab.token_to_id.end() ? unk_idx : token->second; + llm_symbol sym = { + /*prev*/ index - 1, + /*next*/ offs + n == text.size() ? -1 : index + 1, + /*text*/ text.c_str() + offs, + /*n*/ n, + /*id*/ id + }; + offs += n; + index++; + symbols.emplace_back(sym); + } + + // seed the work queue with all possible 2-character tokens. + for (size_t i = 1; i < symbols.size(); ++i) { + try_add_bigram(i - 1, i); + } + + // keep substituting the highest frequency pairs for as long as we can. + while (!work_queue.empty()) { + auto bigram = work_queue.top(); + work_queue.pop(); + + auto & left_sym = symbols[bigram.left]; + auto & right_sym = symbols[bigram.right]; + const std::string text = std::string(left_sym.text, left_sym.n + right_sym.n); + + // if one of the symbols already got merged, skip it. 
+ if ( + left_sym.n == 0 + || right_sym.n == 0 + || left_sym.n + right_sym.n != bigram.size + ) continue; + + // merge the right sym into the left one + left_sym.n += right_sym.n; + left_sym.id = bigram.id; + right_sym.n = 0; + + // remove the right sym from the chain + left_sym.next = right_sym.next; + if (right_sym.next >= 0) { + symbols[right_sym.next].prev = bigram.left; + } + + // find more substitutions + try_add_bigram(left_sym.prev, bigram.left); + try_add_bigram(bigram.left, left_sym.next); + } + + llama_vocab::id* out = (llama_vocab::id*)output.data; + int out_step = sizeof(llama_vocab::id) / output.nb[0]; + int num_tokens = 0; + for (int i = 0; i > -1; i = symbols[i].next) { + llm_symbol& symbol = symbols[i]; + *(out + num_tokens * out_step) = symbol.id; + num_tokens += 1; + } + *(out + num_tokens * out_step) = vocab.token_to_id.at(""); + num_tokens += 1; + output.ne[0] = num_tokens; + } + +private: + + void try_add_bigram(int left, int right) { + if (left == -1 || right == -1) { + return; + } + + const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n); + auto token = vocab.token_to_id.find(text); + + if (token == vocab.token_to_id.end()) { + return; + } + + llama_vocab::id id = token->second; + if (static_cast(id) >= vocab.id_to_token.size()) { + return; + } + + const auto& tok_data = vocab.id_to_token[id]; + llm_bigram_spm bigram = { + /*left */ left, + /*right*/ right, + /*score*/ tok_data.score, + /*size */ text.size(), + /*id */ id + }; + work_queue.push(bigram); + } + + const llama_vocab& vocab; + std::vector symbols; + llm_bigram_spm::queue work_queue; +}; + + +extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor& out) { + llm_tokenizer_spm spm = {model->vocab}; + spm.tokenize(std::string(text), out); +} + +extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, char* out) { + int eos_idx = model->vocab.token_to_id[""]; + int sent_len = tokens->ne[0]; + std::size_t written = 0; + for (int i = 0; i < sent_len; ++i) { + int id = ggml_get_i32_1d(tokens, i); + // Don't print the EOS token but only if it appear at the end. + if (i == sent_len - 1 && eos_idx == id) break; + + std::string token = model->vocab.id_to_token.at(id).text; + // Skip the first space outputted. 
+ auto begin = token.begin(); + if (i == 0 && token.size() > 0 && token[0] == ' ') begin += 1; + std::copy(begin, token.end(), out); + std::size_t n = token.end() - begin; + written += n; + out += n; + } + *out = '0'; + return written; +} diff --git a/seamless_communication/ggml/examples/unity/fairseq2.h b/seamless_communication/ggml/examples/unity/fairseq2.h new file mode 100644 index 0000000..1ffc8e5 --- /dev/null +++ b/seamless_communication/ggml/examples/unity/fairseq2.h @@ -0,0 +1,315 @@ +#pragma once + +#include +#include +#include +#include "ggml.h" +#include "kaldi-native-fbank/csrc/feature-fbank.h" + +#include "ggml-alloc.h" + +#define FORCE_ALLOC(name, ctx, ggml_new_tensor)\ + bool name ## _save_no_alloc_ = ggml_get_no_alloc(ctx); \ + ggml_set_no_alloc(ctx, false); \ + ggml_tensor* name = ggml_new_tensor; \ + ggml_set_no_alloc(ctx, name ## _save_no_alloc_); + +typedef int32_t llama_token; + +extern "C" enum llama_token_type { + LLAMA_TOKEN_TYPE_UNDEFINED = 0, + LLAMA_TOKEN_TYPE_NORMAL = 1, + LLAMA_TOKEN_TYPE_UNKNOWN = 2, + LLAMA_TOKEN_TYPE_CONTROL = 3, + LLAMA_TOKEN_TYPE_USER_DEFINED = 4, + LLAMA_TOKEN_TYPE_UNUSED = 5, + LLAMA_TOKEN_TYPE_BYTE = 6, +}; + + +struct llama_vocab { + using id = int32_t; + using token = std::string; + using ttype = llama_token_type; + + struct token_data { + token text; + float score; + ttype type; + }; + + std::unordered_map token_to_id; + std::vector id_to_token; + + std::unordered_map special_tokens_cache; + std::map, int> bpe_ranks; + + // default LLaMA special tokens + id special_bos_id = 1; + id special_eos_id = 2; + id special_unk_id = 0; + id special_sep_id = -1; + id special_pad_id = -1; + + int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. + int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. + + id linefeed_id = 13; + id special_prefix_id = 32007; + id special_middle_id = 32009; + id special_suffix_id = 32008; + id special_eot_id = 32010; + + int find_bpe_rank(std::string token_left, std::string token_right) const { + GGML_ASSERT(token_left.find(" ") == std::string::npos); + GGML_ASSERT(token_left.find("\n") == std::string::npos); + GGML_ASSERT(token_right.find(" ") == std::string::npos); + GGML_ASSERT(token_right.find("\n") == std::string::npos); + + auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); + if (it == bpe_ranks.end()) { + return -1; + } + + return it->second; + } +}; + + +struct KeyValueTensor { + ggml_tensor* full_k; + ggml_tensor* full_v; + ggml_tensor* self_attn_mask; + int step_nr; +}; + +struct fairseq2_model { + // Context containing all tensors memory + ggml_context* tensors_ctx = nullptr; + + // Named tensors, all tensors should belong to tensors_ctx + std::unordered_map tensors = {}; + + // Hashmap containing model hyper-parameters. + std::unordered_map hparams = {}; + + // Hashmap containing layers hyper-parameters. + // Normally those can be inferred from hparams, but it avoids doing this logic in GGML + std::unordered_map layer_config = {}; + + llama_vocab vocab; + + // KV cache for attention layers + mutable std::unordered_map kv_cache = {}; + + // an inference context, not managed by this object + // TODO: is this the best place to store this or should we also pass this to all forward methods ? 
+ ggml_context* ctx = nullptr; + + ggml_context* kv_cache_ctx = nullptr; +}; + +double fairseq2_model_layer_config_double(const fairseq2_model& model, std::string name); + +/// allocate the fairseq2 model and hyperparameters +extern "C" fairseq2_model* fairseq2_model_alloc(); +// free the models and all its owned tensors +extern "C" void fairseq2_model_free(fairseq2_model* model); +extern "C" void fairseq2_model_set_inference_ctx(fairseq2_model* model, ggml_context* ctx); +extern "C" void fairseq2_kv_cache_reset(const fairseq2_model& model); +ggml_context* ctx_from_buffer(std::vector& buffer); + +extern "C" std::string* std_string_alloc(char* c_str); +extern "C" void std_string_free(std::string* str); + +extern "C" ggml_tensor* WaveformToFbank_forward( + fairseq2_model& model, + const std::string &prefix, + ggml_tensor* waveform +); +extern "C" ggml_tensor* ggml_slice( + struct ggml_context* ctx, + struct ggml_tensor* a, + int axis, + int64_t start, + int64_t end +); + +/// Merge the given dimension and the previous one in the tensor. +/// (..., num_heads, N, ...) -> (..., num_heads * N, ...) +/// dim is the position of the resulting merged dimension +/// ggml_flatten_1d(x, d) <==> torch.flatten(x, -1-d-1, -1-d0 +extern "C" ggml_tensor* ggml_flatten_1d(ggml_context* ctx, ggml_tensor* x, int dim); + +/// Split the given dimension. +/// (..., K * N, ...) -> (..., K, N, ...) +/// dim is the position of the output dimension with the given number of element (N). +extern "C" ggml_tensor* ggml_unflatten_1d(ggml_context* ctx, ggml_tensor* x, int dim, int num_el); + +extern "C" ggml_tensor* Linear_forward( + fairseq2_model& model, + const std::string &prefix, + ggml_tensor* input +); + +extern "C" ggml_tensor* LayerNorm_forward( + fairseq2_model& model, + const std::string &prefix, + ggml_tensor* input +); + +extern "C" ggml_tensor* StandardFeedForwardNetwork_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs +); + +extern "C" ggml_tensor* SiluFeedForwardNetwork_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs +); + +extern "C" ggml_tensor* MultiheadAttention_forward( + fairseq2_model& model, + const std::string &prefix, + ggml_tensor* queries, // (slen, d_in) + ggml_tensor* keys, // (klen, d_in) + ggml_tensor* values, // (klen, d_out) + ggml_tensor* attn_mask // (klen, slen) +); + + +extern "C" ggml_tensor* PositionalEmbedding_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* embeds +); + +extern "C" ggml_tensor* TransformerEmbeddingFrontend_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs +); + +extern "C" ggml_tensor* StandardTransformerEncoderLayer_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs, + ggml_tensor* padding_mask +); + +extern "C" ggml_tensor* RelativePositionMHA_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs +); + +extern "C" ggml_tensor* ConvModule_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs +); + +extern "C" ggml_tensor* StandardConformerEncoderLayer_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs, + ggml_tensor* padding_mask +); + +extern "C" ggml_tensor* StandardConformerEncoder_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs, + ggml_tensor* padding_mask +); + +extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward( + fairseq2_model& model, + const std::string& 
prefix, + ggml_tensor* seqs, + ggml_tensor* padding_mask +); + +extern "C" ggml_tensor* StandardConformerEncoderAdaptor_forward( + fairseq2_model& model, + const std::string& prefix, + ggml_tensor* seqs, + ggml_tensor* padding_mask +); +// Specifies the Layer Normalization order. +// see fairseq2/nn/transformer/norm_order.py +enum TransformerNormOrder { + TRANSFORMER_NORM_ORDER_POST = 0, + TRANSFORMER_NORM_ORDER_PRE = 1, + TRANSFORMER_NORM_ORDER_PRE_WITH_NORMFORMER = 2 +}; + + + +/// Holds the options to pass to a sequence generator. +struct SequenceGeneratorOptions { + /// The beam size. + int beam_size = 5; + + /// The minimum length of generated sequences (including prefix sequence). + int min_seq_len = 1; + + /// The terms ``a`` and ``b`` of ``ax + b`` where ``x`` is the source + /// sequence length. The generated sequences (including prefix sequence) will + /// have the maximum length of ``min(hard_max_seq_len, ax + b)``. See also + /// ``hard_max_seq_len``. + float soft_max_seq_len_a = 1; + int soft_max_seq_len_b = 200; + + /// The hard limit on maximum length of generated sequences. + int hard_max_seq_len = 1024; + + /// The length penalty, where values less than 1.0 favor shorter, values + /// greater than 1.0 favor longer sequences. + float len_penalty = 1.0; + + /// The unknown symbol penalty, where values less than 0 produce more UNKs, + /// values greater than 0 produce fewer UNKs. + float unk_penalty = 0.0; + + /// If ``True``, normalizes scores by the length of generated sequences. + bool normalize_scores = true; +}; + + +struct SequenceGeneratorJob { + SequenceGeneratorOptions opts; + ggml_tensor* prefix_seq; + std::int32_t pad_idx; + std::int32_t unk_idx; + std::int32_t bos_idx; + std::int32_t eos_idx; + std::int32_t num_threads; +}; + +/// Represents a hypothesis produced by a sequence generator. +struct Hypothesis { + /// The generated sequence. + ggml_tensor* seq; + + /// The score of the hypothesis. + float score; + + /// The score of each individual sequence step. + ggml_tensor* step_scores; +}; + + +extern "C" Hypothesis* generate_sequence( + fairseq2_model& model, + const SequenceGeneratorJob& opts, + ggml_tensor* encoder_output, + ggml_tensor* encoder_padding_mask, + ggml_context* result_ctx, + int n_threads +); + +extern "C" void fairseq2_spm_tokenize(fairseq2_model* model, const char* text, ggml_tensor& out); +extern "C" std::size_t fairseq2_spm_detokenize(fairseq2_model* model, ggml_tensor* tokens, char* out); diff --git a/seamless_communication/ggml/examples/unity/model_loader.cpp b/seamless_communication/ggml/examples/unity/model_loader.cpp new file mode 100644 index 0000000..d7fbf51 --- /dev/null +++ b/seamless_communication/ggml/examples/unity/model_loader.cpp @@ -0,0 +1,223 @@ +#include +#include "model_loader.h" + +#define DEBUG_MODEL_LOAD 0 + +std::ifstream open_ggml_file(const char* fname) { + printf("%s: loading model from '%s'\n", __func__, fname); + + auto fin = std::ifstream(std::string(fname), std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname); + throw std::invalid_argument("failed to open file."); // TODO Merge error message. + } + + std::uint32_t magic; + fin.read((char*)&magic, 4); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad header %d)\n", __func__, fname, magic); + throw std::invalid_argument("failed to open file."); // TODO Merge error message. 
+ } + return fin; +} + +void register_prefix(fairseq2_model &model, const std::string& name) { + std::size_t i = name.find_last_of('.'); + while(i != std::string::npos && i > 0) { + std::string prefix = name.substr(0, i); + auto prev_tensor = model.tensors.find(prefix); + if (prev_tensor != model.tensors.end()) { + GGML_ASSERT(prev_tensor->second == nullptr); + } + model.tensors[prefix] = nullptr; + i = name.find_last_of('.', i - 1); + } +} + + +std::int64_t +model_loader::load_model_weights(fairseq2_model &model, std::ifstream &fin) +{ + std::int64_t num_tensor = 0; + std::int64_t f32_tensor_size = 0; + fin.read((char*) &num_tensor, sizeof(num_tensor)); + fin.read((char*) &f32_tensor_size, sizeof(f32_tensor_size)); + + // TODO: it might be interesting to allow the caller to not upcast the weights to float32. + // Note this require changing the on disk format + bool as_float32 = true; + struct ggml_init_params params = { + /*.mem_size =*/ f32_tensor_size + num_tensor * (int64_t)ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + model.tensors_ctx = ggml_init(params); + + size_t model_size = 0; + for (int i = 0; i < num_tensor; ++i) { + std::string name = get_name(fin); + if (name.length() == 0) + break; + auto tensor = load_tensor_value(fin, model.tensors_ctx, as_float32); + if (tensor == nullptr) { + // Abort in case of error, the input stream is corrupted at this point. + printf("Error while reading tensor %s\n", name.c_str() ); + throw std::invalid_argument("Error while reading tensor from file."); + } + register_prefix(model, name); + ggml_set_name(tensor, name.c_str()); + model.tensors[name] = tensor; + if (DEBUG_MODEL_LOAD) { + printf("%s [%5ld, %5ld], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), tensor->ne[0], tensor->ne[1], ggml_type_name(tensor->type), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + model_size += ggml_nbytes(tensor); + } + + double mb = 1024.0 * 1024.0; + printf("%s: model size: %8.2f MB, memory used: %8.2f MB, memory reserved: %8.2f MB\n", + __func__, + model_size / mb, + ggml_used_mem(model.tensors_ctx) / mb, + ggml_get_mem_size(model.tensors_ctx) / mb + ); + + return ggml_get_mem_size(model.tensors_ctx); +} + +void assert_endianness() { + union { + unsigned int i; + char c[4]; + } un; + un.i = 0x12345678; + + if (un.c[0] == 0x78 && un.c[3] == 0x12) { + printf("little-endian\n"); + } + else if (un.c[0] == 0x12 && un.c[3] == 0x78) { + printf("big-endian\n"); + GGML_ASSERT(false); // model_loader.cpp assumes the system is little-endian + } + else { + printf("unknown-endian\n"); + GGML_ASSERT(false); // model_loader.cpp assumes the system is little-endian + } +} + + +void model_loader::load_hparams(std::unordered_map& hparams, std::ifstream &fin) +{ + std::int64_t num_params = 0; + fin.read(reinterpret_cast(&num_params), sizeof num_params); + GGML_ASSERT(fin.gcount() == 8); + + hparams.reserve(num_params); + + std::int64_t value; + for (int i = 0; i < num_params; ++i) { + std::string name = get_name(fin); + if (name.length() == 0) + break; + fin.read((char*) &value, sizeof(value)); + hparams[name] = value; + } +} + +void model_loader::load_vocab(llama_vocab& vocab, std::ifstream &fin) +{ + // vocab.special_bos_id = 1; + // vocab.special_eos_id = 2; + // vocab.special_unk_id = 0; + // vocab.special_sep_id = -1; + // vocab.special_pad_id = -1; + + std::int64_t vocab_size = 0; + fin.read(reinterpret_cast(&vocab_size), sizeof(vocab_size)); + GGML_ASSERT(fin.gcount() == 8); + + vocab.token_to_id.reserve(vocab_size); + 
vocab.id_to_token.reserve(vocab_size); + + std::string packed_vocab = get_name(fin); + std::int64_t ctx_size = vocab_size * sizeof(float) + vocab_size + 2 * ggml_tensor_overhead(); + ctx_size *= 2; + ggml_context* ctx = ggml_init(ggml_init_params{ctx_size, nullptr, false}); + ggml_tensor* lengths_tensor = load_tensor_value(fin, ctx, true); + std::int8_t* lengths = (std::int8_t*)lengths_tensor->data; + ggml_tensor* scores_tensor = load_tensor_value(fin, ctx, true); + float* scores = ggml_get_data_f32(scores_tensor); + + int64_t offset = 0; + for (int i = 0; i < vocab_size; ++i) { + // TODO: we should use string view instead of copying each word in a new string + std::string word = packed_vocab.substr(offset, lengths[i]); + vocab.token_to_id[word] = i; + vocab.id_to_token.push_back({word, scores[i], LLAMA_TOKEN_TYPE_NORMAL}); + offset += lengths[i] + 1; + } + // Since we copied lengths and scores, we don't need the context anymore. + ggml_free(ctx); + + // vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + // TODO: special tokens stuff ? +} + +ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx, bool as_float32) +{ + int32_t n_dims = 0; + int32_t raw_type = 0; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&raw_type), sizeof(raw_type)); + ggml_type type = ggml_type(raw_type); + + if (n_dims <= 0 || n_dims > GGML_MAX_DIMS || raw_type < 0 || raw_type > GGML_TYPE_COUNT) { + return nullptr; + } + int64_t ne[4] = {1, 1, 1, 1}; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + } + + ggml_tensor* tensor; + if (as_float32 && type == GGML_TYPE_F16) { + // read quantized weights from disk, and convert them to f32. + tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, n_dims, ne); + ggml_fp16_t buf[128]; + int num_el = ggml_nelements(tensor); + for (int i = 0; i < num_el; i += 128) { + int block_size = std::min(128, num_el - i); + fin.read(reinterpret_cast(&buf), ggml_type_size(type) * block_size); + ggml_fp16_to_fp32_row((const ggml_fp16_t*)&buf, (float*)tensor->data + i, block_size); + } + } else { + tensor = ggml_new_tensor(ctx, type, n_dims, ne); + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + } + return tensor; +} + +std::string +model_loader::get_name(std::ifstream& fin) +{ + std::uint32_t length = 0; + fin.read(reinterpret_cast(&length), sizeof(length)); + if (length == 0) + return ""; + + std::string name(length, 0); + fin.read(&name[0], length); + + return name; +} + +extern "C" int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) { + model_loader loader; + assert_endianness(); + auto fin = open_ggml_file(fname); + loader.load_hparams(model.hparams, fin); + loader.load_hparams(model.layer_config, fin); + loader.load_vocab(model.vocab, fin); + loader.load_model_weights(model, fin); + return 0; +} diff --git a/seamless_communication/ggml/examples/unity/model_loader.h b/seamless_communication/ggml/examples/unity/model_loader.h new file mode 100644 index 0000000..f855f7e --- /dev/null +++ b/seamless_communication/ggml/examples/unity/model_loader.h @@ -0,0 +1,37 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// MIT_LICENSE file in the root directory of this source tree. 
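The vocabulary read by load_vocab() above is stored as one packed string plus a per-token length array; each token is followed by a single separator character, hence the `+ 1` when advancing the offset. A toy Python sketch of that unpacking step (the separator shown here is a space, purely for illustration):

def unpack_vocab(packed, lengths):
    # Walk the packed string: lengths[i] characters per token, then one separator character.
    tokens, offset = [], 0
    for n in lengths:
        tokens.append(packed[offset:offset + n])
        offset += n + 1
    return tokens

print(unpack_vocab("<s> </s> hello ", [3, 4, 5]))  # ['<s>', '</s>', 'hello']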
+ +#pragma once + +#include +#include +#include + +#include "ggml/ggml.h" +#include "ggml/ggml-alloc.h" + +#include "fairseq2.h" + + +class model_loader { +public: + std::int64_t load_model_weights(fairseq2_model &model, std::ifstream &fin); + + void load_hparams(std::unordered_map& hparams, std::ifstream &fin); + + void load_vocab(llama_vocab& vocab, std::ifstream &fin); + +private: + ggml_tensor * next_tensor(std::ifstream &fin, fairseq2_model &model); + + std::string get_name(std::ifstream &fin); +}; + +ggml_tensor* load_tensor_value(std::ifstream &fin, ggml_context* ctx, bool as_float32); + +std::ifstream open_ggml_file(const char* fname); + +extern "C" int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname); diff --git a/seamless_communication/ggml/examples/unity/unity.cpp b/seamless_communication/ggml/examples/unity/unity.cpp new file mode 100644 index 0000000..f09dd39 --- /dev/null +++ b/seamless_communication/ggml/examples/unity/unity.cpp @@ -0,0 +1,213 @@ +#include "ggml/ggml.h" +#include "ggml/ggml-alloc.h" + +#include "math.h" +#include "model_loader.h" +#include "fairseq2.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-alloc.h" + +struct unity_params { + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + std::string model = "seamlessM4T_medium.ggml"; // model path + std::string tgt_lang = "eng"; + std::vector files = {}; + bool text = false; + SequenceGeneratorOptions opts = { + /*beam_size*/ 5, + /*min_seq_len*/ 1, + /*soft_max_seq_len_a*/ 1, + /*soft_max_seq_len_b*/ 200, + /*hard_max_seq_len*/ 1000, + /*len_penalty*/ 1.0, + /*unk_penalty*/ 0.0, + /*normalize_scores*/ true, + }; +}; + + +void unity_print_usage(int /*argc*/, char ** argv, const unity_params & params) { + fprintf(stderr, "usage: %s [options] file1 file2 ...\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " --text text output\n"); + fprintf(stderr, " --beam-size beam size (default: %d)\n", params.opts.beam_size); + fprintf(stderr, "\n"); +} + +std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, unity_params& params) { + if (i + 1 < argc && argv[i + 1][0] != '-') { + return argv[++i]; + } else { + fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); + unity_print_usage(argc, argv, params); + exit(0); + } +} + + +bool unity_params_parse(int argc, char ** argv, unity_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg == "-h" || arg == "--help") { + unity_print_usage(argc, argv, params); + } else if (arg == "-t" || arg == "--threads") { + params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-m" || arg == "--model") { + params.model = get_next_arg(i, argc, argv, arg, params); + } else if (arg == "-l" || arg == "--tgt-lang") { + params.tgt_lang = get_next_arg(i, argc, argv, arg, params); + } else if (arg == "--text") { + params.text = true; + } else if (arg == "-b" || arg == "--beam-size") { + params.opts.beam_size = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else { + 
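load_tensor_value() above widens fp16 weights to fp32 in 128-element blocks while streaming them from disk. In Python the same widening is a one-liner with numpy, shown here on an in-memory stand-in for the file bytes:

import numpy as np

raw = np.arange(6, dtype=np.float16).tobytes()                 # stand-in for raw f16 bytes read from the file
f32 = np.frombuffer(raw, dtype=np.float16).astype(np.float32)  # widen to f32 in one step
print(f32.dtype, f32)                                          # float32 [0. 1. 2. 3. 4. 5.]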
params.files.push_back(std::string(arg)); + } + } + return true; +} + +struct ggml_cgraph * unity_speech_encoder( + fairseq2_model& model, + struct ggml_tensor * speech_input) { + ggml_context* ctx0 = model.ctx; + ggml_cgraph* gf = ggml_new_graph(ctx0); + ggml_tensor* seqs = StandardConformerEncoder_forward(model, "speech_encoder", speech_input, nullptr); + seqs = ggml_dup(model.ctx, seqs); + ggml_build_forward_expand(gf, seqs); + return gf; +} + + +Hypothesis* unity_decode( + fairseq2_model& model, + const SequenceGeneratorOptions& opts, + int tgt_lang_idx, + ggml_tensor* encoder_output, + int n_threads +) { + SequenceGeneratorJob job = { + opts, + /*prefix_seq*/ nullptr, + /*pad_idx*/model.vocab.token_to_id[""], + /*unk_idx*/model.vocab.token_to_id[""], + /*bos_idx*/model.vocab.token_to_id[""], + /*eos_idx*/model.vocab.token_to_id[""], + /*num_threads*/n_threads, + }; + FORCE_ALLOC(prefix_seq, model.ctx, ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, 2)); + ((int *)prefix_seq->data)[0] = job.eos_idx; + ((int *)prefix_seq->data)[1] = tgt_lang_idx; + job.prefix_seq = prefix_seq; + return generate_sequence(model, job, encoder_output, nullptr, model.ctx, n_threads); +} + +int main(int argc, char ** argv) { + + unity_params params; + + if (unity_params_parse(argc, argv, params) == false) { + return 1; + } + + fairseq2_model model; + + // load the model + if (load_fairseq2_ggml_file(model, params.model.c_str())) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + // The ctx_size_mb mostly depends of input length and model dim. + int ctx_size_mb = 128; + auto encoder_buf = std::vector(128 * 1024 * 1024); + auto encoder_fwd_buf = std::vector(ctx_size_mb * 1024 * 1024); + ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.capacity(), 8); + char result_str[4096]; + + std::string input; + bool interactive = params.files.size() == 0; + auto next_file = params.files.begin(); + while (true) { + if (interactive) { + std::cout << "\nEnter audio_path and tgt_lang, separated by space (or 'exit' to quit):\n"; + std::getline(std::cin, input); + if (input == "exit") { + break; + } + } else { + if (next_file == params.files.end()) break; + input = *(next_file++); + } + std::istringstream iss(input); + std::string audio_path; + std::string tgt_lang = params.tgt_lang; + iss >> audio_path >> tgt_lang; + if (audio_path == "-") { + audio_path = "/proc/self/fd/0"; + } + std::cerr << "Translating (Transcribing) " << audio_path << " to " << tgt_lang << "\n"; + SF_INFO info; + SNDFILE* sndfile = sf_open(audio_path.c_str(), SFM_READ, &info); + if (!sndfile) { + std::cerr << "Could not open file\n"; + if (interactive) continue; + else return 1; + } + auto tgt_lang_ptr = model.vocab.token_to_id.find("__" + tgt_lang + "__"); + if (tgt_lang_ptr == model.vocab.token_to_id.end()) { + std::cerr << "Unknown language " << tgt_lang << "\n"; + if (interactive) continue; + else return 2; + } + int tgt_lang_idx = tgt_lang_ptr->second; + + + // Reset the ggml_context + model.ctx = ctx_from_buffer(encoder_buf); + ggml_set_no_alloc(model.ctx, false); + ggml_tensor* seqs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, info.frames, info.channels); + ggml_set_no_alloc(model.ctx, true); + + // Load audio input + sf_readf_float(sndfile, (float*)seqs->data, info.frames); + + // Audio encoder + ggml_cgraph* gf = unity_speech_encoder(model, seqs); + ggml_allocr_alloc_graph(fwd_alloc, gf); + ggml_graph_compute_with_ctx(model.ctx, gf, params.n_threads); + // 
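unity.cpp reads the waveform with libsndfile into a float32 buffer of shape (frames, channels) before building the speech encoder graph. A rough Python analog using the PySoundFile package (an external dependency assumed here, not something this repo ships; the file name is illustrative):

import soundfile as sf  # assumption: `pip install soundfile`

audio, sample_rate = sf.read("input.wav", dtype="float32", always_2d=True)
print(audio.shape, sample_rate)  # (frames, channels), e.g. (160000, 1) for 10 s of 16 kHz mono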
encoder_output is valid until we call `ggml_allocr_reset(fwd_alloc)` + ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1]; + + // Beam search decoding + const Hypothesis* result = unity_decode(model, params.opts, tgt_lang_idx, encoder_output, params.n_threads); + + // Drop language and bos token. + ggml_tensor* tokens = ggml_slice(model.ctx, result[0].seq, 0, 2, 0); + + // Collect result string + int n = fairseq2_spm_detokenize(&model, tokens, (char*)&result_str); + std::cout << std::string((char*)&result_str, n) << std::endl; + ggml_free(model.ctx); + ggml_allocr_reset(fwd_alloc); + } + + return 0; +} diff --git a/seamless_communication/ggml/ggml.pc.in b/seamless_communication/ggml/ggml.pc.in new file mode 100644 index 0000000..5f53cb8 --- /dev/null +++ b/seamless_communication/ggml/ggml.pc.in @@ -0,0 +1,10 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +includedir=${prefix}/include +libdir=${prefix}/lib + +Name: ggml +Description: The GGML Tensor Library for Machine Learning +Version: 0.0.0 +Cflags: -I${includedir}/ggml +Libs: -L${libdir} -lggml diff --git a/seamless_communication/ggml/ggml.py b/seamless_communication/ggml/ggml.py new file mode 100644 index 0000000..3010aa3 --- /dev/null +++ b/seamless_communication/ggml/ggml.py @@ -0,0 +1,553 @@ +""" +We are vendoring https://github.com/abetlen/ggml-python (MIT License) +adding a few utilities to convert between ggml and numpy tensors for testing. +""" + +import contextlib +import ctypes +import dataclasses +import functools +import logging +from pathlib import Path +from typing import Any, Callable, Dict, Iterator, NamedTuple, Tuple, Type, Union + +import numpy as np +import torch +import subprocess +import sys + +from ctypes_utils import NULLPTR, Ptr, c_fn, c_struct +from third_party_ggml import * + +### Helpers + + +@functools.lru_cache(4) +def numpy_dtype(ggml_type: ctypes.c_int) -> np.dtype: + if ggml_type == 0: + # GGML_TYPE_F32 = 0, + return np.dtype(np.float32) + + if ggml_type == 1: + # GGML_TYPE_F16 = 1, + return np.dtype(np.float16) + + if ggml_type == 18: + return np.dtype(np.int32) + + raise NotImplementedError(f"Can't convert GGML_TYPE({ggml_type}) to a numpy.dtype") + + +@functools.lru_cache() +def from_numpy_dtype(dtype: np.dtype) -> ctypes.c_int: + def _ggml_type(name: bytes, value: int) -> ctypes.c_int: + t = ctypes.c_int(value) + type_name = ggml_type_name(t) + if name != type_name: + raise RuntimeError( + f"Type {name!r} doesn't have value {value}. 
ggml.h was probably updated but not ggml.py" + ) + return t + + if dtype == np.float32: + return _ggml_type(b"f32", 0) + elif dtype == np.float16: + return _ggml_type(b"f16", 1) + elif dtype == np.dtype("bool"): + return _ggml_type(b"i8", 16) + elif dtype == np.int32: + return _ggml_type(b"i32", 18) + + raise NotImplementedError(f"Can't convert {dtype} to a GGML_TYPE") + + +def shape(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]: + if isinstance(tensor, ctypes._Pointer): + tensor = tensor.contents + ndims = tensor.n_dims + return tuple([tensor.ne[i] for i in range(ndims)[::-1]]) + + +def nb(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]: + if isinstance(tensor, ctypes._Pointer): + tensor = tensor.contents + return tuple([tensor.nb[i] for i in range(4)]) + + +def ne(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]: + if isinstance(tensor, ctypes._Pointer): + tensor = tensor.contents + return tuple([tensor.ne[i] for i in range(4)]) + + +def strides(tensor: Union[ggml_tensor, ggml_tensor_p]) -> Tuple[int, ...]: + if isinstance(tensor, ctypes._Pointer): + tensor = tensor.contents + ndims = tensor.n_dims + num_bytes = tuple([tensor.nb[i] for i in range(ndims)]) + strides = num_bytes[::-1] + return strides + + +def to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray: + if not ggml_is_contiguous(tensor_p): + if not _almost_contiguous(tensor_p): + return _strided_to_numpy(tensor_p) + tensor = tensor_p.contents + + res = _void_p_to_np_array(tensor.data, shape(tensor), numpy_dtype(tensor.type)) + + if ggml_is_transposed(tensor_p): + # Patch up strides to work with transposed ggml_tensor + res.strides = strides(tensor) # type: ignore[assignment] + + return res + + +def _almost_contiguous(tensor_p: ggml_tensor_p) -> bool: + """Distinguishes between fully strided and just transposed.""" + tensor = tensor_p.contents + num_bytes = nb(tensor) + num_elem = ne(tensor) + + # Sort the axis according to 'num_bytes' + nbe = sorted(zip(num_bytes, num_elem)) + itemsize = ggml_type_size(tensor.type) + stride_exp = itemsize + for stride, e in nbe: + if stride != stride_exp: + return False + stride_exp *= e + + return True + + +def _strided_to_numpy(tensor_p: ggml_tensor_p) -> np.ndarray: + if ggml_is_transposed(tensor_p): + raise NotImplementedError( + "to_numpy doesn't support tensors both transposed and strided." + ) + + tensor = tensor_p.contents + + n_dim = tensor.n_dims + t_shape = shape(tensor) + t_strides = strides(tensor) + + type_size = ggml_type_size(tensor.type) + + full_shape = [] + num_bytes = nb(tensor) + + # Determine the full backing slice of bytes to read. + # TODO make this work for transposed array + n = 1 + total_elements = 1 + try: + for d in range(n_dim - 1): + n = num_bytes[d + 1] // type_size // n + full_shape.append(n) + total_elements *= n + except ZeroDivisionError: + logging.warning("Can't convert permuted GGML tensor back to numpy") + return None + # We don't need to guess for the first dimension, since this doesn't impact striding. 
+ full_shape.append(t_shape[0]) + total_elements *= t_shape[0] + full_shape = full_shape[::-1] + + res = _void_p_to_np_array(tensor.data, tuple(full_shape), numpy_dtype(tensor.type)) + + # Extract the correct slice + res = res.__getitem__(tuple(slice(0, n) for n in t_shape)) + # TODO: we could handle transposition here + + return res + + +def _void_p_to_np_array( + data: ctypes.c_void_p, shape: Tuple[int, ...], dtype: np.dtype +) -> np.ndarray: + # Convert the ggml data pointer to a pointer of bytes + # This is needed because Python ctypes doesn't have "float16", and `as_array` only works with ctypes + int_width: type = getattr(ctypes, f"c_uint{8 * dtype.itemsize}") + ptr = ctypes.cast(data, ctypes.POINTER(int_width)) + # Create a numpy array with the wrong dtype + int_arr = np.ctypeslib.as_array(ptr, shape=shape) + # Reinterpret it to the right dtype + return np.frombuffer(int_arr, dtype=dtype).reshape(shape) + + +GgmlNElem = ctypes.c_int64 * GGML_MAX_DIMS +GgmlNBytes = ctypes.c_uint64 * GGML_MAX_DIMS + + +def from_file( + ctx: ggml_context_p, file: Path, shape: Tuple[int, ...], dtype: type = np.float32 +) -> ggml_tensor_p: + data = np.fromfile(str(file), dtype=dtype).reshape(shape) # type: ignore + return from_numpy(ctx, data) + + +def _shape_to_ne(shape: Tuple[int, ...]) -> Tuple[int, int, int, int]: + # in GGML ne[0] indicates the contiguous dimension, ie the last one in numpy and torch + ne = shape[::-1] + if len(ne) >= GGML_MAX_DIMS: + return ne # type: ignore + + # ne is always of the same length + padding = (1,) * (GGML_MAX_DIMS - len(ne)) + return ne + padding # type: ignore + + +def _compute_nbytes( + ne: Tuple[int, int, int, int], type: ctypes.c_int +) -> Tuple[int, int, int, int]: + nb0 = ggml_type_size(type) + nb1 = nb0 * (ne[0] // ggml_blck_size(type)) + nb2 = nb1 * ne[1] + nb3 = nb2 * ne[2] + return (nb0, nb1, nb2, nb3) + + +def from_numpy( + ctx: ggml_context_p, array: Union[np.ndarray, "torch.Tensor"], name: bytes = b"" +) -> Ptr[ggml_tensor]: + if type(array).__name__ == "Tensor": + array = array.numpy() + # Create an empty tensor so we don't allocate memory for the data pointer + gtype = from_numpy_dtype(array.dtype) + tensor_p = ggml_new_tensor_1d(ctx, gtype, 0) + # Fill out the correct dimensions and shape. + tensor_p.contents.n_dims = array.ndim + ne = _shape_to_ne(array.shape) + tensor_p.contents.ne = GgmlNElem(*ne) + tensor_p.contents.nb = GgmlNBytes(*_compute_nbytes(ne, gtype)) + # point the tensor data to the content of the numpy array. 
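from_numpy() and to_numpy() deliberately avoid copying: they point the ggml tensor at the numpy buffer, or wrap the ggml data pointer back into an ndarray. The underlying ctypes round trip, stripped of the ggml specifics, looks like this:

import ctypes
import numpy as np

a = np.arange(6, dtype=np.float32).reshape(2, 3)
ptr = ctypes.cast(a.ctypes.data_as(ctypes.c_void_p), ctypes.POINTER(ctypes.c_float))
view = np.ctypeslib.as_array(ptr, shape=a.shape)   # reinterpret the same memory, no copy
view[0, 0] = 42.0
assert a[0, 0] == 42.0                             # writes through to the original array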
+ tensor_p.contents.data = array.ctypes.data_as(ctypes.c_void_p) + # print(f"array: {array.shape} @0x{array.ctypes.data_as(ctypes.c_void_p)}") + # print(f"tensor_p: {shape(tensor_p)} @0x{tensor_p.contents.data:x}") + + # prevent the underlying numpy array to be freed + setattr(tensor_p, "__data", array) + if name: + ggml_set_name(tensor_p, name) + return tensor_p # type: ignore + + +def ggml_can_mul_mat(t0: ggml_tensor_p, t1: ggml_tensor_p) -> bool: + assert GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function" + + return ( + (t0.contents.ne[0] == t1.contents.ne[0]) + and (t1.contents.ne[2] % t0.contents.ne[2] == 0) + and (t1.contents.ne[3] % t0.contents.ne[3] == 0) + ) + + +def nodes(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]: + res = {} + for i in range(gf.n_nodes): + name = gf.nodes[i].contents.name + res[name] = gf.nodes[i] + return res + + +def leafs(gf: ggml_cgraph) -> Dict[bytes, ggml_tensor_p]: + res = {} + for i in range(gf.n_leafs): + name = gf.leafs[i].contents.name + res[name] = gf.leafs[i] + return res + + +class NativeObj: + AllocFn = Callable[[], ctypes.c_void_p] + FreeFn = Callable[[ctypes.c_void_p], None] + _cache: Dict[str, Tuple[AllocFn, FreeFn]] = {} + + @classmethod + def _init_c_func(cls, kind: str) -> Tuple[AllocFn, FreeFn]: + if kind in cls._cache: + return cls._cache[kind] + + alloc_fn = getattr(lib, f"{kind}_alloc") + alloc_fn.argtypes = [] + alloc_fn.restype = ctypes.c_void_p + + free_fn = getattr(lib, f"{kind}_free") + free_fn.argtypes = [ctypes.c_void_p] + free_fn.restype = None + + cls._cache[kind] = (alloc_fn, free_fn) + return (alloc_fn, free_fn) + + def __init__(self, kind: str, ptr: ctypes.c_void_p = NULL): + self.kind = kind + alloc_fn, self._free_fn = self._init_c_func(kind) + self.ptr = alloc_fn() if ptr is None else ptr + # print(self) + + def free(self) -> None: + if self.ptr is not None: + self._free_fn(self.ptr) + # print(f"freeing {self}") + self.ptr = NULL + + def __enter__(self) -> ctypes.c_void_p: + return self.ptr + + def __exit__(self, *args: Any) -> None: + self.free() + + def __del__(self) -> None: + self.free() + + def __repr__(self) -> str: + return f"<{self.kind} native object at 0x{self.ptr:x}>" + + +def MeasureArena() -> NativeObj: + return NativeObj("ggml_allocr", ggml_allocr_new_measure(GGML_MEM_ALIGN)) + + +def FixedSizeArena(mem_size: int) -> NativeObj: + memory = torch.zeros(mem_size, dtype=torch.uint8) + allocr = ggml_allocr_new( + ctypes.c_void_p(memory.data_ptr()), mem_size, GGML_MEM_ALIGN + ) + arena = NativeObj("ggml_allocr", allocr) + # Add a reference from the arena object to the underlying tensor, otherwise it will be freed to early. 
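That keep-alive comment is the whole point of the setattr on the next line: the torch buffer must stay referenced from the Python side for as long as the allocator handle lives, otherwise ggml would keep writing into freed memory. The same pattern, reduced to plain numpy/ctypes:

import ctypes
import numpy as np

class Arena:
    """Owns a raw buffer and hands out its base address."""
    def __init__(self, size):
        self._memory = np.zeros(size, dtype=np.uint8)            # keep-alive reference
        self.ptr = self._memory.ctypes.data_as(ctypes.c_void_p)  # address handed to C code

arena = Arena(1024)
# `arena._memory` pins the buffer for the lifetime of `arena`, so `arena.ptr` never dangles.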
+ setattr(arena, "__memory", memory) + return arena + + +lib.fairseq2_model_set_inference_ctx.argtypes = [ctypes.c_void_p, ggml_context_p] + + +def Fairseq2Model() -> NativeObj: + return NativeObj("fairseq2_model") + + +lib.std_string_alloc.argtypes = [ctypes.c_char_p] +lib.std_string_alloc.restype = ctypes.c_void_p +lib.std_string_free.argtypes = [ctypes.c_void_p] +lib.std_string_free.restype = None +NativeObj._cache["std_string"] = (lib.std_string_alloc, lib.std_string_free) + + +def CppStr(content: str) -> NativeObj: + c_str = ctypes.create_string_buffer(content.encode("utf-8")) + cpp_str = lib.std_string_alloc(c_str) + return NativeObj("std_string", cpp_str) + + +lib.load_fairseq2_ggml_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p] +lib.load_fairseq2_ggml_file.restype = ctypes.c_int + + +def load_fairseq2_ggml_file(model_file: Path) -> NativeObj: + model = Fairseq2Model() + bytes_file = ctypes.create_string_buffer(str(model_file).encode("utf-8")) + err = lib.load_fairseq2_ggml_file(model.ptr, bytes_file) + if err: + raise Exception("Failed to load model") + return model + + +# lib.unity_audio_encoder_graph.argtypes = [ctypes.c_void_p, ctypes.c_void_p] +# lib.unity_audio_encoder_graph.restype = ctypes.POINTER(ggml_cgraph) + + +# def unity_audio_encoder_graph(model: NativeObj, tensor: ggml_tensor_p) -> ggml_cgraph_p: +# return lib.unity_audio_encoder_graph(model.ptr, tensor) # type: ignore + + +# lib.unity_eval.argtypes = [ +# ctypes.c_void_p, +# ctypes.c_void_p, +# ctypes.POINTER(ggml_tensor), +# ctypes.c_int, +# ] +# lib.unity_eval.restype = ctypes.POINTER(ggml_cgraph) + + +# def unity_eval( +# allocr: ctypes.c_void_p, model: NativeObj, tensor: ggml_tensor_p, n_threads: int +# ) -> ggml_cgraph_p: +# return lib.unity_eval(allocr, model.ptr, tensor, n_threads) + + +_FORWARD_CACHE: Dict[str, Callable[..., ggml_tensor_p]] = {} + + +def forward( + layer_name: str, model: ctypes.c_void_p, prefix: str, *inputs: ggml_tensor_p +) -> ggml_tensor_p: + fwd: Any = _FORWARD_CACHE.get(layer_name) + if fwd is None: + fwd = getattr(lib, layer_name + "_forward") + num_inputs = len(inputs) + fwd.argtypes = [ctypes.c_void_p, ctypes.c_void_p] + [ + ctypes.POINTER(ggml_tensor) + ] * num_inputs + fwd.restype = ctypes.POINTER(ggml_tensor) + _FORWARD_CACHE[layer_name] = fwd + + with CppStr(prefix) as std_prefix: + return fwd(model, std_prefix, *inputs) # ignore: type[no-any-return] + + +def build_and_compute( + ctx: ggml_context_p, tensor: ggml_tensor_p, num_threads: int = 1, dump: Union[bool, str] = False +) -> ggml_cgraph: + gf = ggml_build_forward(tensor) + need_alloc = tensor.contents.data == NULLPTR + if need_alloc: + alloc = FixedSizeArena(1024 * 1024 * 1024 * 2) + ggml_allocr_alloc_graph(alloc.ptr, ctypes.pointer(gf)) + setattr(tensor, "__data", alloc) + if dump: + if dump == True: + dump = f"dot/{sys._getframe(1).f_code.co_name}" + ggml_graph_dump_dot(ctypes.pointer(gf), NULLPTR, dump.encode("ascii")) + # subprocess.run(["dot", "-Tsvg", "-O", dump]) + ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), num_threads) + return gf + + +@c_fn(lib) +def causal_attention_mask( + ctx: ggml_context_p, seqs: Ptr[ggml_tensor] +) -> Ptr[ggml_tensor]: + ... + + +@c_fn(lib) +def ggml_slice( + ctx: ggml_context_p, + a: Ptr[ggml_tensor], + axis: int, + start: ctypes.c_int64, + end: ctypes.c_int64, +) -> Ptr[ggml_tensor]: + ... 
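forward() above resolves `<layer>_forward` symbols lazily and caches the configured ctypes handle so argtypes/restype are set only once. The same look-up-and-cache pattern against an ordinary shared library (libm here, purely for illustration, assuming a Unix-like system):

import ctypes
import ctypes.util
import functools

libm = ctypes.CDLL(ctypes.util.find_library("m"))  # assumption: a loadable libm exists

@functools.lru_cache(maxsize=None)
def unary(name):
    # Resolve the symbol once, configure its signature once, then reuse the handle.
    fn = getattr(libm, name)
    fn.argtypes = [ctypes.c_double]
    fn.restype = ctypes.c_double
    return fn

print(unary("sqrt")(2.0), unary("tgamma")(5.0))  # 1.4142..., 24.0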
+ + +@c_fn(lib) +def ggml_flatten_1d( + ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int +) -> Ptr[ggml_tensor]: + return a + + +@c_fn(lib) +def ggml_unflatten_1d( + ctx: ggml_context_p, a: Ptr[ggml_tensor], dim: int, num_el: int +) -> Ptr[ggml_tensor]: + return a + + +@c_struct +@dataclasses.dataclass +class SequenceGeneratorOptions: + beam_size: int + min_seq_len: int = 5 + soft_max_seq_len_a: float = 1.0 + soft_max_seq_len_b: int = 200 + hard_max_seq_len: int = 1024 + len_penalty: float = 1.0 + unk_penalty: float = 0.0 + normalize_scores: bool = True + + +@c_struct +@dataclasses.dataclass +class SequenceGeneratorJob: + opts: SequenceGeneratorOptions + prefix_seq: Ptr[ggml_tensor] + pad_idx: int + unk_idx: int + bos_idx: int + eos_idx: int + num_threads: int = 1 + + +@c_struct +class Hypothesis: + seq: Ptr[ggml_tensor] + """The generated sequence.""" + + score: float + """The score of the hypothesis.""" + + step_scores: Ptr[ggml_tensor] + """The score of each individual sequence step.""" + + +@c_fn(lib) +def generate_sequence( + model: ctypes.c_void_p, + job: Ptr[SequenceGeneratorJob], + encoder_output: Ptr[ggml_tensor], + encoder_padding_mask: Ptr[ggml_tensor], + result_ctx: ggml_context_p, +) -> Ptr[Hypothesis]: + ... + + +@c_fn(lib) +def _testing_return_hypothesis_ptr(ctx: ggml_context_p) -> Ptr[Hypothesis]: + return Ptr() + + +@c_fn(lib) +def fairseq2_model_layer_config_int(model: ctypes.c_void_p, name: bytes) -> int: + return -1 + + +@c_fn(lib.fairseq2_kv_cache_alloc) +def _fairseq2_kv_cache_alloc( + model: ctypes.c_void_p, ctx: ctypes.c_void_p, beam_size: int, max_seq_len: int +) -> None: + pass + + +@c_fn(lib.fairseq2_kv_cache_reset) +def _fairseq2_kv_cache_reset(model: ctypes.c_void_p) -> None: + pass + + +@contextlib.contextmanager +def fairseq2_kv_cache_alloc( + model: ctypes.c_void_p, kv_cache_size: int, beam_size: int, max_seq_len: int +) -> Iterator[None]: + + memory = torch.zeros(kv_cache_size, dtype=torch.uint8) + ctx = ggml_init( + params=ggml_init_params( + mem_size=kv_cache_size, + mem_buffer=ctypes.c_void_p(memory.data_ptr()), + no_alloc=False, + ) + ) + _fairseq2_kv_cache_alloc(model, ctx, beam_size, max_seq_len) + try: + yield + finally: + _fairseq2_kv_cache_reset(model) + ggml_free(ctx) + + +@c_fn(lib) +def fairseq2_spm_tokenize( + model: ctypes.c_void_p, text: bytes, out: Ptr[ggml_tensor] +) -> None: + pass + + +@c_fn(lib) +def fairseq2_spm_detokenize( + model: ctypes.c_void_p, tensor: Ptr[ggml_tensor], out: ctypes.Array[ctypes.c_char] +) -> ctypes.c_size_t: + return 0 diff --git a/seamless_communication/ggml/ggml_convert.py b/seamless_communication/ggml/ggml_convert.py new file mode 100644 index 0000000..58b574e --- /dev/null +++ b/seamless_communication/ggml/ggml_convert.py @@ -0,0 +1,483 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
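ggml_convert.py below turns a fairseq2 checkpoint into the single .ggml file consumed by the loader above. It is normally driven through func_argparse (see the bottom of the file), but it can also be called directly; a hedged usage sketch, where the asset name is an assumption that mirrors unity.cpp's default model path:

from pathlib import Path
from ggml_convert import convert_model  # this module

convert_model(
    "seamlessM4T_medium",                 # assumed fairseq2 asset name
    out=Path("seamlessM4T_medium.ggml"),
    fp16=True,                            # store float32 weights as float16 on disk
)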
+ +import dataclasses +import logging +import math +import struct +from enum import Enum +from io import BufferedWriter +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +from fairseq2.assets import AssetCard +from fairseq2.models.transformer.frontend import TransformerEmbeddingFrontend +from fairseq2.nn import SinusoidalPositionEncoder +from fairseq2.nn.transformer import RelativePositionalEncoding +from seamless_communication.models import unity + +import ggml +import re + +Preprocessor = Callable[[Any], Any] +log = logging.getLogger("ggml_convert") + + +def convert_model( + model_name: Union[str, torch.nn.Module], + out: Optional[Path] = None, + layers: str = "", + hparams: Optional[Dict[str, Any]] = None, + vocab: Optional[List[Tuple[str, float]]] = None, + fp16: bool = False, +) -> None: + if isinstance(model_name, str): + # Load the corresponding fairseq2 model + if out is None: + out = Path(model_name).with_suffix(".ggml") + + # The type of model depends on the name + if "unity" in model_name or "seamlessM4T" in model_name: + if hparams is None: + model_config = unity.load_unity_config(model_name) + hparams = flatten_config( + dataclasses.asdict(model_config), separator="__" + ) + log.info(hparams) + model = unity.load_unity_model(model_name) + if vocab is None: + tokenizer = unity.load_unity_text_tokenizer(model_name) + vocab = read_vocab(tokenizer) + else: + raise ValueError(f"Unsupported model type: {model_name}") + else: + # Use the model passed explicitly + assert ( + out is not None + ), "output path is required when explicitly passing a module" + hparams = hparams or {} + model = model_name + + state_dict = model.state_dict() + if layers: + state_dict = {k: v for k, v in state_dict.items() if re.match(layers, k)} + fixup_model(model, state_dict, layer_filter=layers) + layer_config = read_layer_config(model, layer_filter=layers) + vocab = vocab or [] + write_ggml_file(out, hparams, layer_config, vocab, state_dict, fp16) + + +def _nested_getattr(model: Any, name: str) -> Any: + parts = name.split(".") + node = model + for part in parts: + node = getattr(node, part) + if node is None: + return None + return node + + +def find_children(model: torch.nn.Module, t: type, layer_filter: str = "") -> List[Tuple[str, torch.nn.Module]]: + queue = list(model._modules.items()) + modules = [] + while queue: + name, node = queue.pop() + if node is None: + continue + if layer_filter and not re.match(layer_filter, name): + continue + if isinstance(node, t): + modules.append((name, node)) + for child_name, child_node in node._modules.items(): + queue.append((".".join((name, child_name)), child_node)) + + return modules + + +def fixup_model(model: torch.nn.Module, state_dict: Dict[str, torch.Tensor], layer_filter: str) -> None: + # Bake the embedding scaling into the weights + frontends = find_children(model, TransformerEmbeddingFrontend, layer_filter) + if frontends: + log.info( + "Upgrading the following TransformerEmbeddingFrontend: {}", + [x[0] for x in frontends], + ) + for name, frontend in frontends: + embed_weights = state_dict[name + ".embed.weight"] + state_dict[name + ".embed.weight"] = embed_weights * frontend.scale + + # Sinusoidal embeddings are typically not saved since they are easily recomputed, + # but this allows to avoid porting the sinusoidal logic to GGML + pos_encoders = find_children(model, SinusoidalPositionEncoder, layer_filter) + if pos_encoders: + log.info( + "Upgrading the following 
SinusoidalPositionEncoder: {}", + [x[0] for x in pos_encoders], + ) + for name, pos_encoder in pos_encoders: + assert isinstance(pos_encoder.freqs, torch.Tensor) + assert name not in state_dict + state_dict[name] = pos_encoder.freqs + + relative_pos_encs = find_children(model, RelativePositionalEncoding, layer_filter) + # speech_encoder has several copies of the relative_pos_enc module. + # For efficiency reasons we only make one copy of it to GGML. + if relative_pos_encs: + log.info("Merging all speech_encoder RelativePositionalEncoding into one.") + _, rel_pos_enc = relative_pos_encs[0] + assert isinstance(rel_pos_enc.freqs, torch.Tensor) + state_dict["speech_encoder.pos_enc"] = rel_pos_enc.freqs + + +def convert_to_fp16(state_dict: Dict[str, torch.Tensor]) -> None: + for k in state_dict: + v = state_dict[k] + if v.dtype != torch.float32: + # ignore int tensors + continue + state_dict[k] = v.to(torch.float16) + + +def read_vocab(tokenizer: Any) -> List[Tuple[str, float]]: + vocab_info = tokenizer.vocab_info + vocab = [ + (tokenizer.model.index_to_token(i).replace("▁", " "), -i) + for i in range(vocab_info.size) + ] + return vocab # type: ignore[return-value] + + +def write_ggml_file( + out: Path, + hparams: Dict[str, Any], + layer_config: Dict[str, Any], + vocab: List[Tuple[str, float]], + state_dict: Dict[str, torch.Tensor], + fp16: bool, +) -> None: + with out.open("wb") as o: + write_ggml_header(o) + write_hparams(o, hparams) + write_hparams(o, layer_config) + write_vocab(o, vocab) + write_state_dict(o, state_dict, fp16) + + +def write_ggml_header(out: BufferedWriter) -> None: + """Write GGML header (in reverse cause big-endian)""" + out.write(b"ggml"[::-1]) + + +def write_hparams(out: BufferedWriter, hparams: Dict[str, Any]) -> None: + """Write hyper parameters. + + :params hparams: + flattened dict containing model's hyper parameters. + + """ + simple_vals = {} + for key, value in hparams.items(): + try: + simple_vals[key] = to_ctype(value) + except ValueError: + logging.warning(f"Skipping config for key {key}={value!r}") + continue + + out.write(struct.pack(" None: + out.write(struct.pack(" None: + """Write pytorch state dict. + + :params state_dict: + state dict returned by pytorch model + :params fp16: + convert float32 tensors to float16 on disk + """ + out.write(struct.pack(" int: + full_byte_size = x.numel() * x.element_size() + if fp16 and x.dtype == torch.float32: + full_byte_size //= 2 + return full_byte_size + + # Compressed size + compressed_byte_size = sum(_fp16_byte_size(x) for x in state_dict.values()) + log.warning( + f"Saving a ggml file with {len(state_dict)} tensors, totalling {true_byte_size / GB:.3f}Gb compressed to {compressed_byte_size / GB:.3f}" + ) + + for key, value in state_dict.items(): + write_string(out, key) + if key.endswith(".bias") and value.ndim == 1 and "adaptor" not in key: + # GGML broadcasting isn't as strong as numpy + value = value.reshape(1, -1) + if "pointwise_conv" in key: # pointwise_conv / depthwise_conv + value = value.squeeze(-1) + if "depthwise_conv" in key: + value = value.squeeze(1) + if fp16 and value.dtype == torch.float32: + value = value.to(torch.float16) + write_tensor(out, value.contiguous()) + + +def write_string(out: BufferedWriter, value: str) -> None: + """Write string in utf-8 format. + + :params value: + string value to dump. + """ + str_ = value.encode("utf-8") + packed_len = struct.pack(" None: + """Write torch tensor in ggml format. + + First we save the number of dimensions and the dtype. 
+ Then we save the data as numpy array. + + :params value: + Tensor to dump. + """ + if value.dtype is torch.int64: + # GGML doesn't have int64, downcast it + value = value.to(dtype=torch.int32) + + if value.ndim == 0: + # GGML doesn't support scalar as tensors. + value = value.reshape(1) + + data = value.numpy() + n_dims = data.ndim + assert n_dims < 5, "ggml doesn't support 5 dims tensors" + assert n_dims >= 1, "ggml doesn't support 0 dim tensors" + + ftype = torch_to_ggml_type(value.dtype) + out.write(struct.pack(" int: + if dtype is torch.float32: + return ggml.GGML_TYPE_F32 + elif dtype is torch.float16: + return ggml.GGML_TYPE_F16 + elif dtype is torch.int32: + return ggml.GGML_TYPE_I32 + elif dtype is torch.int8: + return ggml.GGML_TYPE_I8 + else: + raise NotImplementedError(f"{dtype} is not mapped to a GGML_TYPE") + + +def flatten_config( + config: Dict[str, Any], + separator: str, + config_preprocessor: Optional[Preprocessor] = None, +) -> Dict[str, Any]: + """Flatten nested dictionnary + + :param config: + nested dictionnary containing model config. + :param separator: + string separator used when flattening nested hparams + :param config_preprocessor: + Preprocessor used for config/hparams values + + :returns: + flat dictionnary + """ + + if config_preprocessor is None: + config_preprocessor = lambda x: x + + def __flatten(config: Dict[str, Any], prefix: str = "") -> Dict[str, Any]: + result = {} + for key in config: + new_key = f"{prefix}{key}" + if isinstance(config[key], dict): + nested_result = __flatten(config[key], f"{new_key}{separator}") + result.update(nested_result) + else: + new_config = config_preprocessor(config[key]) + if new_config is not None: + result[new_key] = config[key] + + return result + + return __flatten(config) + + +def read_layer_config(model: torch.nn.Module, layer_filter: str) -> Dict[str, Any]: + layer_config = {} + + def _append_node_config(node: Any, prefix: str) -> None: + for k, v in node.__dict__.items(): + # Skip special members. In particular all children module and tensors + # will be hidden in special dicts `_parameters` and `_modules` + if k.startswith("_"): + continue + # All modules have a "training" flag + if k in ("training", "init_fn"): + continue + if v is None: + continue + + try: + to_ctype(v) + except ValueError: + log.warning(f"Skipping layer config {k}={v!r}") + continue + layer_config[prefix + k] = v + + _append_node_config(model, "") + for name, node in find_children(model, torch.nn.Module, layer_filter): + _append_node_config(node, name + ".") + return layer_config + + +def to_ctype(value: Any) -> Tuple[str, Any]: + """Transform python type to ctype. + + Note: we always use little-endian and 8-byte types. + This make the format independent of the current platform. + + :params value: + value to cast into ctype + + :returns: + A tuple of ctype and cvalue. + """ + if isinstance(value, int): + return (" str: + """Return equivalent cpp type in string format + + :params value: + value to cast into ctype + + :returns: + str containing cpp type + """ + # used to have compatibility between types + try: + ctype, _ = to_ctype(value) + except ValueError as e: + return f"// Error: {e}" + + if ctype == "i": + return "std::int32_t" + if ctype == "l": + return "std::int64_t" + if ctype == "f": + return "float" + if ctype == "d": + return "double" + if ctype == "?": + return "bool" + + raise RuntimeError( + f"Should not have reached this part." 
f"Missing cpp translation for {ctype}" + ) + + +def generate_hparams_struct( + hparams: Dict[str, Any], + struct_name: str, +) -> str: + """Generate a c++ struct to hold the model hyper-parameters. + + :param hparams: + Flattened config of the model. + :param struct_name: + Name of the generated struct. + """ + struct = f"struct {struct_name} {{" + fields = [f" {get_cpp_type(value)} {key};" for key, value in hparams.items()] + struct = "\n".join([struct] + fields + ["};\n"]) + + valid_fields = [ + key for key, value in hparams.items() if "Error" not in get_cpp_type(value) + ] + read_struct = f"void read_{struct_name}({struct_name}& out, std::ifstream &fin) {{" + read_fields = [ + f" fin.read((char*) &out.{field}, sizeof(out.{field}));" + for field in valid_fields + ] + read_struct = "\n".join([read_struct] + read_fields + ["};\n"]) + + return "\n".join([struct, read_struct]) + + +if __name__ == "__main__": + import func_argparse + + func_argparse.single_main(convert_model) diff --git a/seamless_communication/ggml/include/ggml/ggml-alloc.h b/seamless_communication/ggml/include/ggml/ggml-alloc.h new file mode 100644 index 0000000..9559da7 --- /dev/null +++ b/seamless_communication/ggml/include/ggml/ggml-alloc.h @@ -0,0 +1,26 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); +GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); + +// tell the allocator to parse nodes following the order described in the list +// you should call this if your graph are optimized to execute out-of-order +GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n); + +GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); +GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); +GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); +GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); +GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); + + +#ifdef __cplusplus +} +#endif diff --git a/seamless_communication/ggml/include/ggml/ggml.h b/seamless_communication/ggml/include/ggml/ggml.h new file mode 100644 index 0000000..c31ee30 --- /dev/null +++ b/seamless_communication/ggml/include/ggml/ggml.h @@ -0,0 +1,2045 @@ +#pragma once + +// +// GGML Tensor Library +// +// This documentation is still a work in progress. +// If you wish some specific topics to be covered, feel free to drop a comment: +// +// https://github.com/ggerganov/whisper.cpp/issues/40 +// +// ## Overview +// +// This library implements: +// +// - a set of tensor operations +// - automatic differentiation +// - basic optimization algorithms +// +// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, +// but is not limited to, the following: +// +// - linear regression +// - support vector machines +// - neural networks +// +// The library allows the user to define a certain function using the available tensor operations. This function +// definition is represented internally via a computation graph. Each tensor operation in the function definition +// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the +// function's value and/or its gradient with respect to the input variables. 
Optionally, the function can be optimized +// using one of the available optimization algorithms. +// +// For example, here we define the function: f(x) = a*x^2 + b +// +// { +// struct ggml_init_params params = { +// .mem_size = 16*1024*1024, +// .mem_buffer = NULL, +// }; +// +// // memory allocation happens here +// struct ggml_context * ctx = ggml_init(params); +// +// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// +// ggml_set_param(ctx, x); // x is an input variable +// +// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// struct ggml_tensor * x2 = ggml_mul(ctx, x, x); +// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); +// +// ... +// } +// +// Notice that the function definition above does not involve any actual computation. The computation is performed only +// when the user explicitly requests it. For example, to compute the function's value at x = 2.0: +// +// { +// ... +// +// struct ggml_cgraph gf = ggml_build_forward(f); +// +// // set the input variable and parameter values +// ggml_set_f32(x, 2.0f); +// ggml_set_f32(a, 3.0f); +// ggml_set_f32(b, 4.0f); +// +// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); +// +// printf("f = %f\n", ggml_get_f32_1d(f, 0)); +// +// ... +// } +// +// The actual computation is performed in the ggml_graph_compute() function. +// +// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the +// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know +// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory +// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was +// actually needed. +// +// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic +// differentiation and optimization algorithms. +// +// The described approach allows to define the function graph once and then compute its forward or backward graphs +// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way +// the user can avoid the memory allocation overhead at runtime. +// +// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class +// citizens, but in theory the library can be extended to support FP8 and integer data types. +// +// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary +// and binary operations. Most of the available operations fall into one of these two categories. With time, it became +// clear that the library needs to support more complex operations. The way to support these operations is not clear +// yet, but a few examples are demonstrated in the following operations: +// +// - ggml_permute() +// - ggml_conv_1d_1s() +// - ggml_conv_1d_2s() +// +// For each tensor operator, the library implements a forward and backward computation function. The forward function +// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the +// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a +// calculus class, or watch the following video: +// +// What is Automatic Differentiation? 
+// https://www.youtube.com/watch?v=wG_nF1awSSY +// +// +// ## Tensor data (struct ggml_tensor) +// +// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of +// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains +// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: +// +// { +// struct ggml_tensor * c = ggml_add(ctx, a, b); +// +// assert(c->src[0] == a); +// assert(c->src[1] == b); +// } +// +// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the +// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows +// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and +// permutation. All tensor operations have to take the stride into account and not assume that the tensor is +// contiguous in memory. +// +// The data of the tensor is accessed via the "data" pointer. For example: +// +// { +// const int nx = 2; +// const int ny = 3; +// +// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); +// +// for (int y = 0; y < ny; y++) { +// for (int x = 0; x < nx; x++) { +// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y; +// } +// } +// +// ... +// } +// +// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. +// +// ## The matrix multiplication operator (ggml_mul_mat) +// +// TODO +// +// +// ## Multi-threading +// +// TODO +// +// +// ## Overview of ggml.c +// +// TODO +// +// +// ## SIMD optimizations +// +// TODO +// +// +// ## Debugging ggml +// +// TODO +// +// + +#ifdef GGML_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_BUILD +# define GGML_API __declspec(dllexport) +# else +# define GGML_API __declspec(dllimport) +# endif +# else +# define GGML_API __attribute__ ((visibility ("default"))) +# endif +#else +# define GGML_API +#endif + +// TODO: support for clang +#ifdef __GNUC__ +# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define GGML_DEPRECATED(func, hint) func +#endif + +#ifndef __GNUC__ +# define GGML_ATTRIBUTE_FORMAT(...) +#elif defined(__MINGW32__) +# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +# define GGML_ATTRIBUTE_FORMAT(...) 
__attribute__((format(printf, __VA_ARGS__))) +#endif + +#include +#include +#include + +#define GGML_FILE_MAGIC 0x67676d6c // "ggml" +#define GGML_FILE_VERSION 1 + +#define GGML_QNT_VERSION 2 // bump this on quantization format changes +#define GGML_QNT_VERSION_FACTOR 1000 // do not change this + +#define GGML_MAX_DIMS 4 +#define GGML_MAX_NODES 4096 +#define GGML_MAX_PARAMS 256 +#define GGML_MAX_CONTEXTS 64 +#define GGML_MAX_SRC 6 +#define GGML_MAX_NAME 64 +#define GGML_MAX_OP_PARAMS 32 +#define GGML_DEFAULT_N_THREADS 4 + +#if UINTPTR_MAX == 0xFFFFFFFF + #define GGML_MEM_ALIGN 4 +#else + #define GGML_MEM_ALIGN 16 +#endif + +#define GGML_EXIT_SUCCESS 0 +#define GGML_EXIT_ABORTED 1 + +#define GGUF_MAGIC 0x46554747 // "GGUF" +#define GGUF_VERSION 2 + +#define GGUF_DEFAULT_ALIGNMENT 32 + +#define GGML_UNUSED(x) (void)(x) + +#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) + +#define GGML_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + +// used to copy the number of elements and stride in bytes of tensors into local variables. +// main purpose is to reduce code duplication and improve readability. +// +// example: +// +// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); +// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); +// +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer)->array[0]; \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer)->array[1]; \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer)->array[2]; \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer)->array[3]; \ + GGML_UNUSED(prefix##3); + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__ARM_NEON) && defined(__CUDACC__) + typedef half ggml_fp16_t; +#elif defined(__ARM_NEON) + typedef __fp16 ggml_fp16_t; +#else + typedef uint16_t ggml_fp16_t; +#endif + + // convert FP16 <-> FP32 + GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); + GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); + + GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n); + GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n); + + struct ggml_object; + struct ggml_context; + + enum ggml_type { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + // GGML_TYPE_Q4_2 = 4, support has been removed + // GGML_TYPE_Q4_3 (5) support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + // k-quantizations + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, + GGML_TYPE_I8, + GGML_TYPE_I16, + GGML_TYPE_I32, + GGML_TYPE_COUNT, + }; + + enum ggml_backend { + GGML_BACKEND_CPU = 0, + GGML_BACKEND_GPU = 10, + GGML_BACKEND_GPU_SPLIT = 20, + }; + + // model file types + enum ggml_ftype { + GGML_FTYPE_UNKNOWN = -1, + GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // 
tok_embeddings.weight and output.weight are F16 + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors + GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors + GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors + }; + + // available tensor operations: + enum ggml_op { + GGML_OP_NONE = 0, + + GGML_OP_DUP, + GGML_OP_ADD, + GGML_OP_ADD1, + GGML_OP_ACC, + GGML_OP_SUB, + GGML_OP_MUL, + GGML_OP_DIV, + GGML_OP_SQR, + GGML_OP_SQRT, + GGML_OP_LOG, + GGML_OP_SUM, + GGML_OP_SUM_ROWS, + GGML_OP_MEAN, + GGML_OP_ARGMAX, + GGML_OP_REPEAT, + GGML_OP_REPEAT_BACK, + GGML_OP_CONCAT, + GGML_OP_SILU_BACK, + GGML_OP_NORM, // normalize + GGML_OP_BATCH_NORM, + GGML_OP_RMS_NORM, + GGML_OP_RMS_NORM_BACK, + GGML_OP_GROUP_NORM, + + GGML_OP_MUL_MAT, + GGML_OP_OUT_PROD, + + GGML_OP_SCALE, + GGML_OP_SET, + GGML_OP_CPY, + GGML_OP_CONT, + GGML_OP_RESHAPE, + GGML_OP_VIEW, + GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, + GGML_OP_GET_ROWS_BACK, + GGML_OP_DIAG, + GGML_OP_DIAG_MASK_INF, + GGML_OP_DIAG_MASK_ZERO, + GGML_OP_SOFT_MAX, + GGML_OP_SOFT_MAX_BACK, + GGML_OP_ROPE, + GGML_OP_ROPE_BACK, + GGML_OP_ALIBI, + GGML_OP_CLAMP, + GGML_OP_CONV_1D, + GGML_OP_CONV_1D_GENERIC, + GGML_OP_CONV_2D, + GGML_OP_CONV_TRANSPOSE_2D, + GGML_OP_POOL_1D, + GGML_OP_POOL_2D, + + GGML_OP_CONV_1D_STAGE_0, // internal + GGML_OP_CONV_1D_STAGE_1, // internal + GGML_OP_CONV_1D_STAGE_2, // internal + + GGML_OP_CONV_1D_GENERIC_STAGE_0, + GGML_OP_CONV_1D_GENERIC_STAGE_1, + + GGML_OP_UPSCALE, // nearest interpolate + + GGML_OP_FLASH_ATTN, + GGML_OP_FLASH_FF, + GGML_OP_FLASH_ATTN_BACK, + GGML_OP_WIN_PART, + GGML_OP_WIN_UNPART, + GGML_OP_GET_REL_POS, + GGML_OP_ADD_REL_POS, + + GGML_OP_UNARY, + + GGML_OP_MAP_UNARY, + GGML_OP_MAP_BINARY, + + GGML_OP_MAP_CUSTOM1_F32, + GGML_OP_MAP_CUSTOM2_F32, + GGML_OP_MAP_CUSTOM3_F32, + + GGML_OP_MAP_CUSTOM1, + GGML_OP_MAP_CUSTOM2, + GGML_OP_MAP_CUSTOM3, + + GGML_OP_CROSS_ENTROPY_LOSS, + GGML_OP_CROSS_ENTROPY_LOSS_BACK, + + GGML_OP_COUNT, + }; + + enum ggml_unary_op { + GGML_UNARY_OP_ABS, + GGML_UNARY_OP_SGN, + GGML_UNARY_OP_NEG, + GGML_UNARY_OP_STEP, + GGML_UNARY_OP_TANH, + GGML_UNARY_OP_ELU, + GGML_UNARY_OP_RELU, + GGML_UNARY_OP_GELU, + GGML_UNARY_OP_GELU_QUICK, + GGML_UNARY_OP_SILU, + GGML_UNARY_OP_GLU, + }; + + enum ggml_object_type { + GGML_OBJECT_TENSOR, + GGML_OBJECT_GRAPH, + GGML_OBJECT_WORK_BUFFER + }; + + // ggml object + struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + enum ggml_object_type type; + + char padding[4]; + }; + + static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + + // n-dimensional tensor + struct ggml_tensor { + enum ggml_type type; + enum ggml_backend backend; + + int n_dims; + int64_t ne[GGML_MAX_DIMS]; // number of elements + size_t nb[GGML_MAX_DIMS]; // stride in bytes: + // nb[0] = sizeof(type) + // nb[1] = nb[0] * ne[0] + padding + // nb[i] = nb[i-1] * ne[i-1] + + // compute data + enum ggml_op op; + + // op params - allocated as int32_t for alignment + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; + + bool is_param; + + struct ggml_tensor * grad; + struct ggml_tensor * src[GGML_MAX_SRC]; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + + struct ggml_tensor * view_src; + size_t view_offs; + + void * data; + + char name[GGML_MAX_NAME]; + 
+ void * extra; // extra things e.g. for ggml-cuda.cu + + char padding[4]; + }; + + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + + // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes + int n_tasks[GGML_MAX_NODES]; + + // abort ggml_graph_compute when true + bool (*abort_callback)(void * data); + void * abort_callback_data; + }; + + // next prime after GGML_MAX_NODES + // #define GGML_GRAPH_HASHTABLE_SIZE 4099 + // next prime after GGML_MAX_NODES * 2 (nodes + leafs) + #define GGML_GRAPH_HASHTABLE_SIZE 8273 + + // computation graph + struct ggml_cgraph { + int n_nodes; + int n_leafs; + + struct ggml_tensor * nodes[GGML_MAX_NODES]; + struct ggml_tensor * grads[GGML_MAX_NODES]; + struct ggml_tensor * leafs[GGML_MAX_NODES]; + + void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE]; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + }; + + static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph); + + // scratch buffer + struct ggml_scratch { + size_t offs; + size_t size; + void * data; + }; + + struct ggml_init_params { + // memory pool + int64_t mem_size; // bytes + void * mem_buffer; // if NULL, memory will be allocated internally + bool no_alloc; // don't allocate memory for the tensor data + }; + + + // compute types + + // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. + // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. 
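The ne/nb convention documented in struct ggml_tensor above (nb[0] = sizeof(type), nb[i] = nb[i-1] * ne[i-1]) is what lets ggml represent transposed and permuted views without copying. A tiny sketch of that stride computation, ignoring quantized block sizes and padding:

def compute_nb(ne, itemsize):
    # nb[0] is the element size; every further stride multiplies by the previous extent.
    nb = [itemsize]
    for i in range(1, len(ne)):
        nb.append(nb[i - 1] * ne[i - 1])
    return tuple(nb)

print(compute_nb((8, 4, 2, 1), 4))  # f32 tensor with ne = (8, 4, 2, 1) -> (4, 32, 128, 256)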
+ enum ggml_task_type { + GGML_TASK_INIT = 0, + GGML_TASK_COMPUTE, + GGML_TASK_FINALIZE, + }; + + struct ggml_compute_params { + enum ggml_task_type type; + + // ith = thread index, nth = number of threads + int ith, nth; + + // work buffer for all threads + size_t wsize; + void * wdata; + }; + + // misc + + GGML_API void ggml_time_init(void); // call this once at the beginning of the program + GGML_API int64_t ggml_time_ms(void); + GGML_API int64_t ggml_time_us(void); + GGML_API int64_t ggml_cycles(void); + GGML_API int64_t ggml_cycles_per_ms(void); + + GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems + GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node + + GGML_API void ggml_print_object (const struct ggml_object * obj); + GGML_API void ggml_print_objects(const struct ggml_context * ctx); + + GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); + GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN + GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); + + GGML_API int ggml_blck_size (enum ggml_type type); + GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block + GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float + + GGML_API const char * ggml_type_name(enum ggml_type type); + GGML_API const char * ggml_op_name (enum ggml_op op); + GGML_API const char * ggml_op_symbol(enum ggml_op op); + + GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); + + GGML_API bool ggml_is_quantized(enum ggml_type type); + + // TODO: temporary until model loading of ggml examples is refactored + GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + + GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + + GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + + // use this to compute the memory overhead of a tensor + GGML_API size_t ggml_tensor_overhead(void); + + // main + + GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); + GGML_API void ggml_free(struct ggml_context * ctx); + + GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); + + GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx); + GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); + + GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx); + GGML_API int64_t ggml_get_mem_size (const struct ggml_context * ctx); + GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx); + + GGML_API struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t *ne); + + GGML_API struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1); + + GGML_API struct ggml_tensor * ggml_new_tensor_3d( 
+ struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); + GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); + + GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); + + GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); + GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); + + GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); + + GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); + + GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); + GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); + + GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + + GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); + GGML_ATTRIBUTE_FORMAT(2, 3) + GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); + + // + // operations on tensors with backpropagation + // + + GGML_API struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_dup_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + 
struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // return scalar + GGML_API struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] + GGML_API struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // mean along rows + GGML_API struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // argmax along rows + GGML_API struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // if a is the same shape as b, and a is not parameter, return a + // otherwise, return a new tensor: repeat(a) to fit in b + GGML_API struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // concat a and b on dim 2 + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_concat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // TODO: double-check this 
computation is correct + GGML_API struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_silu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_silu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // a - x + // b - dy + GGML_API struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_glu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // normalize along rows + GGML_API struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_batch_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * gamma, + struct ggml_tensor * beta, + struct ggml_tensor * running_mean, + struct ggml_tensor * running_var, + float eps); + + GGML_API struct ggml_tensor * ggml_rms_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_rms_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + // group normalize along ne0*ne1*n_groups + // used in stable-diffusion + // TODO: eps is hardcoded to 1e-6 for now + GGML_API struct ggml_tensor * ggml_group_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); + + GGML_API struct ggml_tensor * ggml_group_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); + + // a - x + // b - dy + GGML_API struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + float eps); + + // A: n columns, m rows + // B: n columns, p rows (i.e. 
we transpose it internally) + // result is m columns, p rows + GGML_API struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // A: m columns, n rows, + // B: p columns, n rows, + // result is m columns, p rows + GGML_API struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // + // operations on tensors without backpropagation + // + + GGML_API struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_API struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_API struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + + GGML_API struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_API struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_API struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + + + // a -> b, return view(b) + GGML_API struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // a -> b, in-place, return view(b) + GGML_API struct ggml_tensor * ggml_cpy_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // make contiguous + GGML_API struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // make contiguous, in-place + GGML_API struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // return view(a), b specifies the new shape + // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // return view(a) + // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1); + + // return view(a) + // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, 
+ int64_t ne2, + int64_t ne3); + + // offset in bytes + GGML_API struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + size_t offset); + + GGML_API struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, // row stride in bytes + size_t offset); + + GGML_API struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + + GGML_API struct ggml_tensor * ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3); + + // alias for ggml_permute(ctx, a, 1, 0, 2, 3) + GGML_API struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_get_rows_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + + GGML_API struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // set elements above the diagonal to -INF + GGML_API struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // set elements above the diagonal to 0 + GGML_API struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + GGML_API struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // rotary position embedding + // if mode & 1 == 1, skip n_past elements + // if mode & 2 == 1, GPT-NeoX style + // if mode & 4 == 1, ChatGLM style + // TODO: avoid creating a new tensor every time + GGML_API struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx); + + // custom RoPE + GGML_API struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float 
freq_base, + float freq_scale); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale); + + // xPos RoPE, in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + float base, + bool down); + + // rotary position embedding backward, i.e compute dx from dy + // a - dy + GGML_API struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down); + + // alibi position embedding + // in-place, returns view(a) + struct ggml_tensor * ggml_alibi( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_head, + float bias_max); + + // clamp + // in-place, returns view(a) + struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, + float min, + float max); + + GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, // stride + int p0, // padding + int d0); // dilation + + GGML_API struct ggml_tensor * ggml_conv_1d_generic( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, // stride + int p0, // padding + int d0); // dilation + + // conv_1d with padding = half + // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + GGML_API struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d); + + GGML_API struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); + + + // kernel size is a->ne[0] x a->ne[1] + // stride is equal to kernel size + // padding is zero + // example: + // a: 16 16 3 768 + // b: 1024 1024 3 1 + // res: 64 64 768 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // kernel size is a->ne[0] x a->ne[1] + // stride is 1 + // padding is half + // example: + // a: 3 3 256 256 + // b: 64 64 256 1 + // res: 64 64 256 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride); + + enum ggml_op_pool { + GGML_OP_POOL_MAX, + GGML_OP_POOL_AVG, + GGML_OP_POOL_COUNT, + }; + + GGML_API struct ggml_tensor * ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, // kernel size + int s0, // stride + int p0); // padding + + GGML_API struct ggml_tensor * ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1); + + // nearest interpolate + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_upscale( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor); + + GGML_API struct ggml_tensor * ggml_flash_attn( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + 
struct ggml_tensor * v, + bool masked); + + GGML_API struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked); + + GGML_API struct ggml_tensor * ggml_flash_ff( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b0, + struct ggml_tensor * b1, + struct ggml_tensor * c0, + struct ggml_tensor * c1); + + // partition into non-overlapping windows with padding if needed + // example: + // a: 768 64 64 1 + // w: 14 + // res: 768 14 14 25 + // used in sam + GGML_API struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w); + + // reverse of ggml_win_part + // used in sam + GGML_API struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w0, + int h0, + int w); + + GGML_API struct ggml_tensor * ggml_unary( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + + GGML_API struct ggml_tensor * ggml_unary_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + + // used in sam + GGML_API struct ggml_tensor * ggml_get_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + int qh, + int kh); + + // used in sam + + GGML_API struct ggml_tensor * ggml_add_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + + GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + + // custom operators + + typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); + typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); + + typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use ggml_map_custom2 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32( + struct ggml_context * 
ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3_inplace instead"); + + // custom operators v2 + + typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); + typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); + typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); + + #define GGML_N_TASKS_MAX -1 + + GGML_API struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + // loss function + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + + // + // automatic differentiation + // + + GGML_API void ggml_set_param( + struct ggml_context * ctx, + struct ggml_tensor * tensor); + + + GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); + + GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); + GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); + + // graph allocation in a context + GGML_API struct ggml_cgraph * 
ggml_new_graph (struct ggml_context * ctx); + GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API size_t ggml_graph_overhead(void); + + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + + GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); + + GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); + GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); + + // print info and performance information for the graph + GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); + + // dump the graph into a file using the dot format + GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); + + // + // optimization + // + + // optimization methods + enum ggml_opt_type { + GGML_OPT_ADAM, + GGML_OPT_LBFGS, + }; + + // linesearch methods + enum ggml_linesearch { + GGML_LINESEARCH_DEFAULT = 1, + + GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, + GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, + GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, + }; + + // optimization return values + enum ggml_opt_result { + GGML_OPT_OK = 0, + GGML_OPT_DID_NOT_CONVERGE, + GGML_OPT_NO_CONTEXT, + GGML_OPT_INVALID_WOLFE, + GGML_OPT_FAIL, + + GGML_LINESEARCH_FAIL = -128, + GGML_LINESEARCH_MINIMUM_STEP, + GGML_LINESEARCH_MAXIMUM_STEP, + GGML_LINESEARCH_MAXIMUM_ITERATIONS, + GGML_LINESEARCH_INVALID_PARAMETERS, + }; + + typedef void (*ggml_opt_callback)(void * data, float * sched); + + // optimization parameters + // + // see ggml.c (ggml_opt_default_params) for default values + // + struct ggml_opt_params { + enum ggml_opt_type type; + + int n_threads; + + // delta-based convergence test + // + // if past == 0 - disabled + // if past > 0: + // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) + // + int past; + float delta; + + // maximum number of iterations without improvement + // + // if 0 - disabled + // if > 0: + // assume convergence if no cost improvement in this number of iterations + // + int max_no_improvement; + + bool print_forward_graph; + bool print_backward_graph; + + // ADAM parameters + struct { + int n_iter; + + float sched; // schedule multiplier (fixed, decay or warmup) + float decay; // weight decay for AdamW, use 0.0f to disable + int decay_min_ndim; // minimum number of tensor dimension to apply weight decay + float alpha; // learning rate + float beta1; + float beta2; + float eps; // epsilon for numerical stability + float eps_f; // epsilon for convergence test + float eps_g; // epsilon for convergence test + float gclip; // gradient clipping + } adam; + + // LBFGS parameters + struct { + int m; // number of corrections to approximate 
the inv. Hessian + int n_iter; + int max_linesearch; + + float eps; // convergence tolerance + float ftol; // line search tolerance + float wolfe; + float min_step; + float max_step; + + enum ggml_linesearch linesearch; + } lbfgs; + }; + + struct ggml_opt_context { + struct ggml_context * ctx; + struct ggml_opt_params params; + + int iter; + int64_t nx; // number of parameter elements + + bool just_initialized; + + float loss_before; + float loss_after; + + struct { + struct ggml_tensor * m; // first moment + struct ggml_tensor * v; // second moment + struct ggml_tensor * pf; // past function values + float fx_best; + float fx_prev; + int n_no_improvement; + } adam; + + struct { + struct ggml_tensor * x; // current parameters + struct ggml_tensor * xp; // previous parameters + struct ggml_tensor * g; // current gradient + struct ggml_tensor * gp; // previous gradient + struct ggml_tensor * d; // search direction + struct ggml_tensor * pf; // past function values + struct ggml_tensor * lmal; // the L-BFGS memory alpha + struct ggml_tensor * lmys; // the L-BFGS memory ys + struct ggml_tensor * lms; // the L-BFGS memory s + struct ggml_tensor * lmy; // the L-BFGS memory y + float fx_best; + float step; + int j; + int k; + int end; + int n_no_improvement; + } lbfgs; + }; + + GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); + + // optimize the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f); + + // initialize optimizer context + GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx); + + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f); + + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data); + + // + // quantization + // + + GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); + + GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + + // + // gguf + // + + enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_UINT64 = 10, + GGUF_TYPE_INT64 = 11, + GGUF_TYPE_FLOAT64 = 12, + GGUF_TYPE_COUNT, // marks the end of the enum + }; + + struct gguf_context; + + struct gguf_init_params { + bool no_alloc; + + // if not NULL, create a ggml_context and allocate the tensor data in it + struct ggml_context ** ctx; + }; + + GGML_API struct gguf_context * gguf_init_empty(void); + 
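The quantization and gguf declarations above, together with the setters and accessors that follow, are enough to author a model file from scratch. Below is a minimal illustrative sketch (not part of the upstream header) that uses only functions declared in this file; the key names, tensor name, and output path are made-up placeholders, and ctx0 is assumed to be a ggml_context obtained earlier from ggml_init():

    // sketch: build a tiny gguf file in memory and write it out in a single pass
    static void write_example_gguf(struct ggml_context * ctx0) {
        struct gguf_context * gctx = gguf_init_empty();

        // typed key/value metadata; setting an existing key overrides its value
        gguf_set_val_str(gctx, "general.architecture", "example");
        gguf_set_val_u32(gctx, "example.n_layers", 12);

        // the name attached with ggml_set_name() is the name stored in the file
        struct ggml_tensor * w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 64, 64);
        ggml_set_name(w, "layer0.weight");
        ggml_set_f32(w, 0.0f);          // give the tensor defined contents
        gguf_add_tensor(gctx, w);

        // writes header, kv pairs, tensor info and tensor data in one pass
        gguf_write_to_file(gctx, "example.gguf", /*only_meta =*/ false);
        gguf_free(gctx);
    }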
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + //GGML_API struct gguf_context * gguf_init_from_buffer(..); + + GGML_API void gguf_free(struct gguf_context * ctx); + + GGML_API const char * gguf_type_name(enum gguf_type type); + + GGML_API int gguf_get_version (const struct gguf_context * ctx); + GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); + GGML_API void * gguf_get_data (const struct gguf_context * ctx); + + GGML_API int gguf_get_n_kv(const struct gguf_context * ctx); + GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key); + GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i); + + GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i); + GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i); + + // results are undefined if the wrong type is used for the key + GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i); + GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i); + GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i); + GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i); + GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i); + GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i); + GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i); + GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i); + GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i); + GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i); + GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i); + GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i); + GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); + + GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); + GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); + GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); + GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + + // overrides existing values or adds a new one + GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); + GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); + GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); + GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); + GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); + GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); + GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); + GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); + GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); + GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); + GGML_API void gguf_set_val_bool(struct 
gguf_context * ctx, const char * key, bool val); + GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); + GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n); + GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n); + + // set or add KV pairs from another context + GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src); + + // manage tensor info + GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); + GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); + GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size); + + // writing gguf files can be done in 2 ways: + // + // - write the entire gguf_context to a binary file in a single pass: + // + // gguf_write_to_file(ctx, fname); + // + // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: + // + // FILE * f = fopen(fname, "wb"); + // fseek(f, gguf_get_meta_size(ctx), SEEK_SET); + // fwrite(f, ...); + // void * data = gguf_meta_get_meta_data(ctx); + // fseek(f, 0, SEEK_SET); + // fwrite(f, data, gguf_get_meta_size(ctx)); + // free(data); + // fclose(f); + // + + // write the entire context to a binary file + GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); + + // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); + GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); + + // + // system info + // + + GGML_API int ggml_cpu_has_avx (void); + GGML_API int ggml_cpu_has_avx2 (void); + GGML_API int ggml_cpu_has_avx512 (void); + GGML_API int ggml_cpu_has_avx512_vbmi(void); + GGML_API int ggml_cpu_has_avx512_vnni(void); + GGML_API int ggml_cpu_has_fma (void); + GGML_API int ggml_cpu_has_neon (void); + GGML_API int ggml_cpu_has_arm_fma (void); + GGML_API int ggml_cpu_has_f16c (void); + GGML_API int ggml_cpu_has_fp16_va (void); + GGML_API int ggml_cpu_has_wasm_simd (void); + GGML_API int ggml_cpu_has_blas (void); + GGML_API int ggml_cpu_has_cublas (void); + GGML_API int ggml_cpu_has_clblast (void); + GGML_API int ggml_cpu_has_gpublas (void); + GGML_API int ggml_cpu_has_sse3 (void); + GGML_API int ggml_cpu_has_ssse3 (void); + GGML_API int ggml_cpu_has_vsx (void); + + // + // Internal types and functions exposed for tests and benchmarks + // + +#ifdef __cplusplus +// restrict not standard in C++ +#define GGML_RESTRICT +#else +#define GGML_RESTRICT restrict +#endif + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + + typedef struct { + const char * type_name; + int blck_size; + size_t type_size; + bool is_quantized; + ggml_to_float_t to_float; + ggml_from_float_t from_float; + ggml_from_float_t from_float_reference; + ggml_vec_dot_t vec_dot; + enum ggml_type vec_dot_type; + } ggml_type_traits_t; + + ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + +#ifdef __cplusplus +} +#endif diff 
--git a/seamless_communication/ggml/requirements.txt b/seamless_communication/ggml/requirements.txt new file mode 100644 index 0000000..62d3cdc --- /dev/null +++ b/seamless_communication/ggml/requirements.txt @@ -0,0 +1,6 @@ +accelerate==0.19.0 +numpy==1.24.3 +sentencepiece==0.1.98 +torch==2.0.1 +torchaudio==2.0.2 +torchvision==0.15.2 diff --git a/seamless_communication/ggml/scripts/sync-llama.sh b/seamless_communication/ggml/scripts/sync-llama.sh new file mode 100755 index 0000000..db7ee49 --- /dev/null +++ b/seamless_communication/ggml/scripts/sync-llama.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +cp -rpv ../llama.cpp/ggml.c src/ggml.c +cp -rpv ../llama.cpp/ggml-alloc.c src/ggml-alloc.c +cp -rpv ../llama.cpp/ggml-cuda.h src/ggml-cuda.h +cp -rpv ../llama.cpp/ggml-cuda.cu src/ggml-cuda.cu +cp -rpv ../llama.cpp/ggml-opencl.h src/ggml-opencl.h +cp -rpv ../llama.cpp/ggml-opencl.cpp src/ggml-opencl.cpp +cp -rpv ../llama.cpp/ggml-metal.h src/ggml-metal.h +cp -rpv ../llama.cpp/ggml-metal.m src/ggml-metal.m +cp -rpv ../llama.cpp/ggml-metal.metal src/ggml-metal.metal +cp -rpv ../llama.cpp/ggml.h include/ggml/ggml.h +cp -rpv ../llama.cpp/ggml-alloc.h include/ggml/ggml-alloc.h + +cp -rpv ../llama.cpp/tests/test-opt.cpp tests/test-opt.cpp +cp -rpv ../llama.cpp/tests/test-grad0.cpp tests/test-grad0.cpp +cp -rpv ../llama.cpp/tests/test-quantize-fns.cpp tests/test-quantize-fns.cpp +cp -rpv ../llama.cpp/tests/test-quantize-perf.cpp tests/test-quantize-perf.cpp diff --git a/seamless_communication/ggml/scripts/sync-whisper.sh b/seamless_communication/ggml/scripts/sync-whisper.sh new file mode 100755 index 0000000..1c74859 --- /dev/null +++ b/seamless_communication/ggml/scripts/sync-whisper.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +cp -rpv ../whisper.cpp/ggml.c src/ggml.c +cp -rpv ../whisper.cpp/ggml-cuda.h src/ggml-cuda.h +cp -rpv ../whisper.cpp/ggml-cuda.cu src/ggml-cuda.cu +cp -rpv ../whisper.cpp/ggml-opencl.h src/ggml-opencl.h +cp -rpv ../whisper.cpp/ggml-opencl.cpp src/ggml-opencl.cpp +cp -rpv ../whisper.cpp/ggml-metal.h src/ggml-metal.h +cp -rpv ../whisper.cpp/ggml-metal.m src/ggml-metal.m +cp -rpv ../whisper.cpp/ggml-metal.metal src/ggml-metal.metal +cp -rpv ../whisper.cpp/ggml.h include/ggml/ggml.h +cp -rpv ../whisper.cpp/examples/common.h examples/common.h +cp -rpv ../whisper.cpp/examples/common.cpp examples/common.cpp +cp -rpv ../whisper.cpp/examples/common-ggml.h examples/common-ggml.h +cp -rpv ../whisper.cpp/examples/common-ggml.cpp examples/common-ggml.cpp +cp -rpv ../whisper.cpp/whisper.h examples/whisper/whisper.h +cp -rpv ../whisper.cpp/whisper.cpp examples/whisper/whisper.cpp +cp -rpv ../whisper.cpp/examples/main/main.cpp examples/whisper/main.cpp +cp -rpv ../whisper.cpp/examples/quantize/quantize.cpp examples/whisper/quantize.cpp diff --git a/seamless_communication/ggml/src/CMakeLists.txt b/seamless_communication/ggml/src/CMakeLists.txt new file mode 100644 index 0000000..fcbe806 --- /dev/null +++ b/seamless_communication/ggml/src/CMakeLists.txt @@ -0,0 +1,322 @@ +if (GGML_ALL_WARNINGS) + if (NOT MSVC) + add_compile_options(-Wunused -Wextra -Wcast-qual -Wdouble-promotion) + add_compile_options("$<$:-Wshadow;-Wno-unused-function;-Wmissing-prototypes>") + else() + # todo : windows + endif() +endif() + +# compiler flags + +if (NOT MSVC) + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations") +endif() + +message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") + +if (NOT UNAME_S) + execute_process(COMMAND uname -s OUTPUT_VARIABLE UNAME_S) +endif() 
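Because the UNAME_S, UNAME_P, and UNAME_M probes here are each guarded by if (NOT ...), the detected values can be pre-seeded on the cmake command line when cross-compiling instead of relying on uname, for example:

    cmake -DUNAME_S=Linux -DUNAME_P=arm -DUNAME_M=aarch64 ..

The variable names are the ones defined in this CMakeLists.txt; the values shown are illustrative only.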
+if (NOT UNAME_P) + execute_process(COMMAND uname -p OUTPUT_VARIABLE UNAME_P) +endif() +if (NOT UNAME_M) + execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M) +endif() +#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}") + +# Mac OS + Arm can report x86_64 +# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 +if (UNAME_S MATCHES "Darwin") + if (NOT UNAME_P MATCHES "arm") + execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M) + if (SYSCTL_M MATCHES "1") + #set(UNAME_P "arm") + #set(UNAME_M "arm64") + message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-#1282546789") + endif() + endif() +endif() + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Emscripten") + message(STATUS "Emscripten detected") +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + message(STATUS "ARM detected") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=apple-m1") +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PPC64 detected") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpower9-vector") +else() + message(STATUS "x86 detected") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c") + if (UNAME_S MATCHES "Darwin") + execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "AVX1.0") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "AVX2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + if (AVX1_M MATCHES "FMA") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + elseif (UNAME_S MATCHES "Linux") + message(STATUS "Linux detected") + execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "avx") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "avx2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M) + if (FMA_M MATCHES "fma") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M) + if (F16C_M MATCHES "f16c") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + endif() + execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M) + if (SSE3_M MATCHES "sse3") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") + endif() + elseif (UNAME_S MATCHES "Haiku") + message(STATUS "Haiku detected") + execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "avx") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "avx2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M) + if (FMA_M MATCHES "fma") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M) + if (F16C_M MATCHES "f16c") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + endif() + elseif 
(MSVC) + if (GGML_AVX512) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX512") + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + if (GGML_AVX512_VBMI) + add_compile_definitions(__AVX512VBMI__) + endif() + if (GGML_AVX512_VNNI) + add_compile_definitions(__AVX512VNNI__) + endif() + elseif (GGML_AVX2) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2") + elseif (GGML_AVX) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX") + endif() + else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2") + endif() +endif() + +# ggml + +set(TARGET ggml) + +# on APPLE - include Accelerate framework +if (APPLE AND NOT GGML_NO_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) + else() + message(WARNING "Accelerate framework not found") + endif() +endif() + +if (GGML_OPENBLAS) + set(OPENBLAS_INCLUDE_SEARCH_PATHS + /usr/include + /usr/include/openblas + /usr/include/openblas-base + /usr/local/include + /usr/local/include/openblas + /usr/local/include/openblas-base + /opt/OpenBLAS/include + $ENV{OpenBLAS_HOME} + $ENV{OpenBLAS_HOME}/include + ) + find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) + find_library(OPENBLAS_LIB NAMES openblas libopenblas) + if (OPENBLAS_LIB) + message(STATUS "OpenBLAS found") + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${OPENBLAS_LIB}) + set(GGML_EXTRA_INCS ${GGML_EXTRA_INCS} ${OPENBLAS_INC}) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_OPENBLAS) + else() + message(WARNING "OpenBLAS not found") + endif() +endif() + +if (GGML_CLBLAST) + set(CLBLAST_INCLUDE_SEARCH_PATHS + /usr/include + /usr/local/include + $ENV{CLBLAST_HOME} + $ENV{CLBLAST_HOME}/include + ) + find_path(CLBLAST_INC NAMES clblast.h PATHS ${CLBLAST_INCLUDE_SEARCH_PATHS}) + find_library(CLBLAST_LIB NAMES clblast) + if (CLBLAST_LIB AND CLBLAST_INC) + message(STATUS "clBLAST found") + + + set(GGML_EXTRA_INCS ${GGML_EXTRA_INCS} ${CLBLAST_INC}) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CLBLAST_LIB}) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_CLBLAST) + + set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h) + + link_libraries("-Wl,--copy-dt-needed-entries") + else() + message(WARNING "clBLAST not found") + endif() +endif() + +if (GGML_CUBLAS) + cmake_minimum_required(VERSION 3.17) + + find_package(CUDAToolkit) + if (CUDAToolkit_FOUND) + message(STATUS "cuBLAS found") + + enable_language(CUDA) + + set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h) + + add_compile_definitions(GGML_USE_CUBLAS) + + if (GGML_STATIC) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + endif() + + else() + message(WARNING "cuBLAS not found") + endif() +endif() + +if (GGML_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED) + + set(GGML_METAL_SOURCES ggml-metal.m ggml-metal.h) + + add_compile_definitions(GGML_USE_METAL) + add_compile_definitions(GGML_METAL_NDEBUG) + + # get full path to the file + 
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") + + # copy ggml-metal.metal to bin directory + configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} + ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} + ${METALKIT_FRAMEWORK} + ${METALPERFORMANCE_FRAMEWORK} + ) +endif() + +if (GGML_PERF) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF) +endif() + +add_library(${TARGET} + ggml.c + ggml-alloc.c + ../include/ggml/ggml.h + ../include/ggml/ggml-alloc.h + ${GGML_CUDA_SOURCES} + ${GGML_OPENCL_SOURCES} + ${GGML_METAL_SOURCES} + ) + +target_include_directories(${TARGET} PUBLIC + . + ../include + ../include/ggml + ../examples/ + ${GGML_EXTRA_INCS} + ) + +if (MSVC) + target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT} kaldi-native-fbank) +else() + target_link_libraries(${TARGET} PUBLIC m ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT} kaldi-native-fbank) +endif() + +if (BUILD_SHARED_LIBS) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + + target_link_libraries(${TARGET} PUBLIC + ${CMAKE_DL_LIBS} + ) + + target_compile_definitions(${TARGET} PUBLIC + GGML_SHARED + ) + + target_compile_definitions(${TARGET} PRIVATE + GGML_BUILD + ) + + if (GGML_METAL) + set_target_properties(${TARGET} PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") + endif() +endif() + +target_compile_definitions(${TARGET} PUBLIC + ${GGML_EXTRA_FLAGS} + ) + +if (MINGW) + target_link_libraries(${TARGET} PUBLIC + stdc++ + ) +endif() + +if (GGML_CUDA_SOURCES) + message(STATUS "GGML CUDA sources found, configuring CUDA architecture") + set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "52;61") + set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") + if (NOT MSVC) + target_link_libraries(ggml PUBLIC stdc++) + endif() +endif() + +set (GGML_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml/ggml.h + ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml/ggml-alloc.h) +set_target_properties(${TARGET} PROPERTIES + PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") + +install(TARGETS ${TARGET} + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib/static + PUBLIC_HEADER DESTINATION include/ggml + ) diff --git a/seamless_communication/ggml/src/ggml-alloc.c b/seamless_communication/ggml/src/ggml-alloc.c new file mode 100644 index 0000000..a1f6e7b --- /dev/null +++ b/seamless_communication/ggml/src/ggml-alloc.c @@ -0,0 +1,633 @@ +#include "ggml-alloc.h" +#include "ggml.h" +#include <assert.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef __has_include + #if __has_include(<unistd.h>) + #include <unistd.h> + #if defined(_POSIX_MAPPED_FILES) + #include <sys/types.h> + #include <sys/mman.h> + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include <windows.h> + #include <memoryapi.h> +#endif + + +#define UNUSED(x) (void)(x) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define GGML_MAX_CONCUR (2*GGML_MAX_NODES) + +//#define GGML_ALLOCATOR_DEBUG + +//#define AT_PRINTF printf +#define AT_PRINTF(...)
((void)0) + +struct hash_node { + struct ggml_tensor * t; + int n_children; + int n_views; +}; + +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) { + size_t h = hash(t); + + // linear probing + size_t i = h; + while (hash_table[i].t != NULL) { + if (hash_table[i].t == t) { + return &hash_table[i]; + } + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // hash table is full + GGML_ASSERT(false); + } + } + + hash_table[i].t = t; + return &hash_table[i]; +} + +// TODO: GGML_PAD ? +static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { + assert(alignment && !(alignment & (alignment - 1))); // power of 2 + size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; + return offset + align; +} + +struct free_block { + void * addr; + size_t size; +}; + +#define MAX_FREE_BLOCKS 128 + +struct ggml_allocr { + void * data; + size_t size; + size_t alignment; + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; + struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; + size_t max_size; + bool measure; + int parse_seq[GGML_MAX_CONCUR]; + int parse_seq_len; + +#ifdef GGML_ALLOCATOR_DEBUG + struct ggml_tensor * allocated_tensors[1024]; +#endif +}; + +#ifdef GGML_ALLOCATOR_DEBUG +static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == NULL) { + alloc->allocated_tensors[i] = tensor; + return; + } + } + GGML_ASSERT(!"out of allocated_tensors"); +} +static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == tensor || + (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { + alloc->allocated_tensors[i] = NULL; + return; + } + } + printf("tried to free tensor %s not found\n", tensor->name); + GGML_ASSERT(!"tensor not found"); +} +#endif + +static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + return ggml_nbytes(tensor); + + UNUSED(alloc); +} + +// check if a tensor is allocated by this buffer +static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) { + void * ptr = tensor->data; + return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size; +} + +void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { +#ifdef GGML_ALLOCATOR_DEBUG + GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources + GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated +#endif + size_t size = ggml_allocr_get_alloc_size(alloc, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + + AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); + + size_t max_avail = 0; + + // find the best fitting free block besides the last block + int best_fit_block = -1; + size_t best_fit_size = SIZE_MAX; + for (int i = 0; i < alloc->n_free_blocks - 1; i++) { + struct free_block * block = &alloc->free_blocks[i]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size && block->size <= best_fit_size) { + best_fit_block = i; + best_fit_size = block->size; + } + } + + AT_PRINTF("block %d\n", best_fit_block); + + if (best_fit_block == -1) { + // 
the last block is our last resort + struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size) { + best_fit_block = alloc->n_free_blocks - 1; + } else { + fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", + __func__, size, max_avail); + GGML_ASSERT(!"not enough space in the buffer"); + return; + } + } + struct free_block * block = &alloc->free_blocks[best_fit_block]; + void * addr = block->addr; + block->addr = (char*)block->addr + size; + block->size -= size; + if (block->size == 0) { + // remove block if empty + alloc->n_free_blocks--; + for (int j = best_fit_block; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + + tensor->data = addr; + +#ifdef GGML_ALLOCATOR_DEBUG + add_allocated_tensor(alloc, tensor); + size_t cur_max = (char*)addr - (char*)alloc->data + size; + if (cur_max > alloc->max_size) { + printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i]) { + printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0); + } + } + printf("\n"); + } +#endif + + alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); +} + +// this is a very naive implementation, but for our case the number of free blocks should be very small +static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + void * ptr = tensor->data; + + if (ggml_allocr_is_own(alloc, tensor) == false) { + // the tensor was not allocated in this buffer + // this can happen because the graph allocator will try to free weights and other tensors from different buffers + // the easiest way to deal with this is just to ignore it + return; + } + + size_t size = ggml_allocr_get_alloc_size(alloc, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); + +#ifdef GGML_ALLOCATOR_DEBUG + remove_allocated_tensor(alloc, tensor); +#endif + + // see if we can merge with an existing block + for (int i = 0; i < alloc->n_free_blocks; i++) { + struct free_block * block = &alloc->free_blocks[i]; + // check if ptr is at the end of the block + if ((char*)block->addr + block->size == ptr) { + block->size += size; + // check if we can merge with the next block + if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) { + block->size += alloc->free_blocks[i+1].size; + alloc->n_free_blocks--; + for (int j = i+1; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + // check if ptr is at the beginning of the block + if ((char*)ptr + size == block->addr) { + block->addr = ptr; + block->size += size; + // check if we can merge with the previous block + if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) { + alloc->free_blocks[i-1].size += block->size; + alloc->n_free_blocks--; + for (int j = i; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + } + // otherwise, add a new block + GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); + // insert the new block in the correct position to keep the array sorted by address (to make merging blocks 
faster) + int insert_pos = 0; + while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) { + insert_pos++; + } + // shift all blocks from insert_pos onward to make room for the new block + for (int i = alloc->n_free_blocks; i > insert_pos; i--) { + alloc->free_blocks[i] = alloc->free_blocks[i-1]; + } + // insert the new block + alloc->free_blocks[insert_pos].addr = ptr; + alloc->free_blocks[insert_pos].size = size; + alloc->n_free_blocks++; +} + +void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) { + for (int i = 0; i < n; i++) { + alloc->parse_seq[i] = list[i]; + } + alloc->parse_seq_len = n; +} + +void ggml_allocr_reset(struct ggml_allocr * alloc) { + alloc->n_free_blocks = 1; + size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); + alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; + alloc->free_blocks[0].size = alloc->size - align_offset; +} + +struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); + + *alloc = (struct ggml_allocr){ + /*.data = */ data, + /*.size = */ size, + /*.alignment = */ alignment, + /*.n_free_blocks = */ 0, + /*.free_blocks = */ {{0}}, + /*.hash_table = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ false, + /*.parse_seq = */ {0}, + /*.parse_seq_len = */ 0, +#ifdef GGML_ALLOCATOR_DEBUG + /*.allocated_tensors = */ {0}, +#endif + }; + + ggml_allocr_reset(alloc); + + return alloc; +} + +// OS specific functions to allocate and free uncommitted virtual memory +static void * alloc_vmem(size_t size) { +#if defined(_WIN32) + return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS); +#elif defined(_POSIX_MAPPED_FILES) + void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (ptr == MAP_FAILED) { + return NULL; + } + return ptr; +#else + // use a fixed address for other platforms + uintptr_t base_addr = (uintptr_t)-size - 0x100; + return (void *)base_addr; +#endif +} + +static void free_vmem(void * base_addr, size_t size) { +#if defined(_WIN32) + VirtualFree(base_addr, 0, MEM_RELEASE); + UNUSED(size); +#elif defined(_POSIX_MAPPED_FILES) + munmap(base_addr, size); +#else + // nothing to do + UNUSED(base_addr); + UNUSED(size); +#endif +} + +// allocate uncommitted virtual memory to measure the size of the graph +static void alloc_measure_vmem(void ** base_addr, size_t * size) { + // 1TB for 64-bit, 1GB for 32-bit + *size = sizeof(void *) == 4 ? 
1ULL<<30 : 1ULL<<40; + do { + *base_addr = alloc_vmem(*size); + if (*base_addr != NULL) { + AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr); + return; + } + // try again with half the size + *size /= 2; + } while (*size > 0); + + GGML_ASSERT(!"failed to allocate virtual memory for measure buffer"); +} + +static void free_measure_vmem(void * base_addr, size_t size) { + free_vmem(base_addr, size); +} + +struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); + + void * base_addr; + size_t size; + + alloc_measure_vmem(&base_addr, &size); + + *alloc = (struct ggml_allocr){ + /*.data = */ base_addr, + /*.size = */ size, + /*.alignment = */ alignment, + /*.n_free_blocks = */ 0, + /*.free_blocks = */ {{0}}, + /*.hash_table = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ true, + /*.parse_seq = */ {0}, + /*.parse_seq_len = */ 0, +#ifdef GGML_ALLOCATOR_DEBUG + /*.allocated_tensors = */ {0}, +#endif + }; + + ggml_allocr_reset(alloc); + + return alloc; +} + +void ggml_allocr_free(struct ggml_allocr * alloc) { + if (alloc->measure) { + free_measure_vmem(alloc->data, alloc->size); + } + free(alloc); +} + +bool ggml_allocr_is_measure(struct ggml_allocr * alloc) { + return alloc->measure; +} + +//////////// compute graph allocator + +static bool ggml_is_view(struct ggml_tensor * t) { + return t->view_src != NULL; +} + +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +static bool ggml_op_can_inplace(enum ggml_op op) { + switch (op) { + case GGML_OP_SCALE: + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_UNARY: + case GGML_OP_ROPE: + case GGML_OP_RMS_NORM: + case GGML_OP_SOFT_MAX: + case GGML_OP_CONT: + return true; + + default: + return false; + } +} + +static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) { + struct hash_node * ht = alloc->hash_table; + if (node->data == NULL) { + if (ggml_is_view(node)) { + assert(node->view_src->data != NULL); + node->data = (char *)node->view_src->data + node->view_offs; + } else { + // see if we can reuse a parent's buffer (inplace) + if (ggml_op_can_inplace(node->op)) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + struct ggml_tensor * parent = node->src[i]; + if (parent == NULL) { + break; + } + + // if the node's data is external, then we cannot re-use it + if (ggml_allocr_is_own(alloc, parent) == false) { + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + continue; + } + + struct hash_node * p_hn = hash_get(ht, parent); + if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = hash_get(ht, view_src); + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { + // TODO: the offset of the view parent must be 
kept to ensure that the op doesn't overwrite + // the parent's data that it will need later (same layout requirement). the problem is that then + // we cannot free the tensor because the original address of the allocation is lost. + // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views + // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) + AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + node->data = parent->data; + return; + } + } + else { + AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); + node->data = parent->data; + return; + } + } + } + } + ggml_allocr_alloc(alloc, node); + } + } +} + +static size_t ggml_allocr_alloc_graph_tensors_n( + struct ggml_allocr * alloc, + struct ggml_cgraph ** graphs, int n_graphs, + struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { + + // reset hash table + struct hash_node * ht = alloc->hash_table; + memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE); + + // count number of children and views + for (int g = 0; g < n_graphs; g++) { + struct ggml_cgraph * gf = graphs[g]; + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + if (ggml_is_view(node)) { + struct ggml_tensor * view_src = node->view_src; + hash_get(ht, view_src)->n_views += 1; + } + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + hash_get(ht, parent)->n_children += 1; + } + } + } + + // allocate tensors + for (int g = 0; g < n_graphs; g++) { + struct ggml_cgraph * gf = graphs[g]; + AT_PRINTF("####### graph %d/%d\n", g, n_graphs); + // graph inputs are allocated first to ensure that they are not overwritten by each other + if (inputs != NULL && inputs[g] != NULL) { + for (int i = 0; inputs[g][i] != NULL; i++) { + struct ggml_tensor * input = inputs[g][i]; + AT_PRINTF("input: %s\n", input->name); + allocate_node(alloc, input); + } + } + // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers + int last_barrier_pos = 0; + int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes; + + for (int ind = 0; ind < n_nodes; ind++) { + // allocate a node if there is no parse_seq or this is not a barrier + if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) { + int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind; + struct ggml_tensor * node = gf->nodes[i]; + + // allocate parents (leafs) + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + allocate_node(alloc, parent); + } + + // allocate node + allocate_node(alloc, node); + + AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + AT_PRINTF("%s", parent->name); + if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); + } + } + AT_PRINTF("\n"); + } + + // update parents + // update immediately if there is no parse_seq + // update only at barriers if there is parse_seq + if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) { + int update_start = alloc->parse_seq_len ? last_barrier_pos : ind; + int update_end = alloc->parse_seq_len ? ind : ind + 1; + for (int i = update_start; i < update_end; i++) { + int node_i = alloc->parse_seq_len ? 
alloc->parse_seq[i] : i; + struct ggml_tensor * node = gf->nodes[node_i]; + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + struct hash_node * p_hn = hash_get(ht, parent); + p_hn->n_children -= 1; + + //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); + + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = hash_get(ht, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { + ggml_allocr_free_tensor(alloc, view_src); + } + } + else { + if (parent->data != node->data) { + ggml_allocr_free_tensor(alloc, parent); + } + } + } + } + } + AT_PRINTF("\n"); + if (alloc->parse_seq_len) { + last_barrier_pos = ind + 1; + } + } + } + // free graph outputs here that wouldn't be freed otherwise because they have no children + if (outputs != NULL && outputs[g] != NULL) { + for (int i = 0; outputs[g][i] != NULL; i++) { + struct ggml_tensor * output = outputs[g][i]; + AT_PRINTF("output: %s\n", output->name); + ggml_allocr_free_tensor(alloc, output); + } + } + } + + return alloc->max_size; +} + +size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { + return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); +} diff --git a/seamless_communication/ggml/src/ggml-cuda.cu b/seamless_communication/ggml/src/ggml-cuda.cu new file mode 100644 index 0000000..e0163ae --- /dev/null +++ b/seamless_communication/ggml/src/ggml-cuda.cu @@ -0,0 +1,6814 @@ +#include +#include +#include +#include +#include +#include +#include + +#if defined(GGML_USE_HIPBLAS) +#include +#include +#include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, 
hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0) +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#else +#include +#include +#include +#endif + +#include "ggml-cuda.h" +#include "ggml.h" + +#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#ifndef CC_TURING +#define CC_TURING 700 +#endif + +#if defined(GGML_USE_HIPBLAS) +#define __CUDA_ARCH__ 1300 + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) +} + +static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) + c = __builtin_amdgcn_sdot4(a, b, c, false); +#elif defined(__gfx1100__) + c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); +#elif defined(__gfx1010__) || defined(__gfx900__) + int tmp1; + int tmp2; + asm("\n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + " + : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) + : "v"(a), "v"(b) + ); +#else + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; +#endif + return c; +} +#endif + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); + +#define CUDA_CHECK(err) \ + do { \ + cudaError_t err_ = (err); \ + if (err_ != cudaSuccess) { \ + fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ + cudaGetErrorString(err_)); \ + exit(1); \ + } \ + } while (0) + +#if CUDART_VERSION >= 12000 +#define CUBLAS_CHECK(err) \ + do { \ + 
cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \ + err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \ + exit(1); \ + } \ + } while (0) +#else +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) +#endif // CUDART_VERSION >= 11 + +#ifdef GGML_CUDA_F16 +typedef half dfloat; // dequantize float +typedef half2 dfloat2; +#else +typedef float dfloat; // dequantize float +typedef float2 dfloat2; +#endif //GGML_CUDA_F16 + +static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) { + const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) { + const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) { + return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) { + return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); +typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream); +typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); +typedef void (*cpy_kernel_t)(const char * cx, char * cdst); +typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (*ggml_cuda_op_t)( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, + float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main); + +// QK = number of values after dequantization +// QR = QK / number of values before dequantization +// QI = number of 32 bit integers before dequantization + +#define QK4_0 32 +#define QR4_0 2 +#define QI4_0 (QK4_0 / (4 * QR4_0)) +typedef struct { + half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +#define QR4_1 2 +#define QI4_1 (QK4_1 / (4 * QR4_1)) +typedef struct { + half2 dm; // dm.x = delta, dm.y = min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +#define QR5_0 2 +#define QI5_0 (QK5_0 / (4 * QR5_0)) +typedef struct { + half d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +#define QR5_1 2 +#define QI5_1 (QK5_1 / (4 * QR5_1)) +typedef 
struct { + half2 dm; // dm.x = delta, dm.y = min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +#define QR8_0 1 +#define QI8_0 (QK8_0 / (4 * QR8_0)) +typedef struct { + half d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +#define QR8_1 1 +#define QI8_1 (QK8_1 / (4 * QR8_1)) +typedef struct { + half2 ds; // ds.x = delta, ds.y = sum + int8_t qs[QK8_0]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding"); + +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); +typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); +typedef void (*load_tiles_cuda_t)( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row); +typedef float (*vec_dot_q_mul_mat_cuda_t)( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k); + +//================================= k-quants + +#ifdef GGML_QKK_64 +#define QK_K 64 +#define K_SCALE_SIZE 4 +#else +#define QK_K 256 +#define K_SCALE_SIZE 12 +#endif + +#define QR2_K 4 +#define QI2_K (QK_K / (4*QR2_K)) +typedef struct { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + half2 dm; // super-block scale for quantized scales/mins +} block_q2_K; +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); + +#define QR3_K 4 +#define QI3_K (QK_K / (4*QR3_K)) +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#ifdef GGML_QKK_64 + uint8_t scales[2]; // scales, quantized with 8 bits +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + half d; // super-block scale +} block_q3_K; +//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); + +#define QR4_K 2 +#define QI4_K (QK_K / (4*QR4_K)) +#ifdef GGML_QKK_64 +typedef struct { + half dm[2]; // super-block scales/mins + uint8_t scales[2]; // 4-bit block scales/mins + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding"); +#else +typedef struct { + half2 dm; // super-block scale for quantized scales/mins + uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); +#endif + +#define QR5_K 2 +#define QI5_K (QK_K / (4*QR5_K)) +#ifdef GGML_QKK_64 +typedef struct { + half d; // super-block scale + int8_t scales[QK_K/16]; // block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == 
sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); +#else +typedef struct { + half2 dm; // super-block scale for quantized scales/mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +#endif + +#define QR6_K 2 +#define QI6_K (QK_K / (4*QR6_K)) +typedef struct { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales + half d; // delta +} block_q6_K; +static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); + +#define WARP_SIZE 32 +#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses + +#define CUDA_ADD_BLOCK_SIZE 256 +#define CUDA_MUL_BLOCK_SIZE 256 +#define CUDA_GELU_BLOCK_SIZE 256 +#define CUDA_SILU_BLOCK_SIZE 256 +#define CUDA_CPY_BLOCK_SIZE 32 +#define CUDA_SCALE_BLOCK_SIZE 256 +#define CUDA_ROPE_BLOCK_SIZE 256 +#define CUDA_ALIBI_BLOCK_SIZE 32 +#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 +#define CUDA_QUANTIZE_BLOCK_SIZE 256 +#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 + +// dmmv = dequantize_mul_mat_vec +#ifndef GGML_CUDA_DMMV_X +#define GGML_CUDA_DMMV_X 32 +#endif +#ifndef GGML_CUDA_MMV_Y +#define GGML_CUDA_MMV_Y 1 +#endif + +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 2 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + +struct ggml_tensor_extra_gpu { + void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors + cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs +}; + +static int g_device_count = -1; +static int g_main_device = 0; +static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; +static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static bool g_mul_mat_q = true; + +static void * g_scratch_buffer = nullptr; +static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default +static size_t g_scratch_offset = 0; + +static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + +static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr }; + +static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= kx) { + return; + } + dst[i] = x[i] + y[i%ky]; +} + +static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = __hadd(x[i], __float2half(y[i])); +} + +static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= kx) { + return; + } + dst[i] = x[i] * y[i%ky]; +} + +static __global__ void gelu_f32(const float * x, float * dst, const int k) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + float xi = x[i]; + dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))); +} + +static __global__ void 
silu_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = x[i] / (1.0f + expf(-x[i])); +} + +static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32); + a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32); + } + return a; +} + +template +static __global__ void norm_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + const float eps = 1e-5f; + + float2 mean_var = make_float2(0.f, 0.f); + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + mean_var.x += xi; + mean_var.y += xi * xi; + } + + // sum up partial sums + mean_var = warp_reduce_sum(mean_var); + if (block_size > WARP_SIZE) { + __shared__ float2 s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = mean_var; + } + __syncthreads(); + mean_var = s_sum[lane_id]; + mean_var = warp_reduce_sum(mean_var); + } + + const float mean = mean_var.x / ncols; + const float var = mean_var.y / ncols - mean * mean; + const float inv_std = rsqrtf(var + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std; + } +} + +static __device__ __forceinline__ float warp_reduce_sum(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, mask, 32); + } + return x; +} + +template +static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + float tmp = 0.0f; // partial sum for thread in warp + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + tmp += xi * xi; + } + + // sum up partial sums + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + __shared__ float s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + __syncthreads(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + const float mean = tmp / ncols; + const float scale = rsqrtf(mean + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = scale * x[row*ncols + col]; + } +} + +static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q4_0 * x = (const block_q4_0 *) vx; + + const dfloat d = x[ib].d; + + const int vui = x[ib].qs[iqs]; + + v.x = vui & 0xF; + v.y = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {8.0f, 8.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 8.0f) * d; + v.y = (v.y - 8.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q4_1 * x = (const block_q4_1 *) vx; + + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); + + const int vui = x[ib].qs[iqs]; + + v.x = vui & 0xF; + v.y = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void 
dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q5_0 * x = (const block_q5_0 *) vx; + + const dfloat d = x[ib].d; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {16.0f, 16.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 16.0f) * d; + v.y = (v.y - 16.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q5_1 * x = (const block_q5_1 *) vx; + + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const block_q8_0 * x = (const block_q8_0 *) vx; + + const dfloat d = x[ib].d; + + v.x = x[ib].qs[iqs + 0]; + v.y = x[ib].qs[iqs + 1]; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); +#else + v.x *= d; + v.y *= d; +#endif // GGML_CUDA_F16 +} + +//================================== k-quants + +static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) { + + const int i = blockIdx.x; + const block_q2_K * x = (const block_q2_K *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int n = tid/32; + const int l = tid - 32*n; + const int is = 8*n + l/16; + + const uint8_t q = x[i].qs[32*n + l]; + float * y = yy + i*QK_K + 128*n; + + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); + y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); + y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); + y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); +#else + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const uint8_t q = x[i].qs[il] >> (2*is); + float * y = yy + i*QK_K + 16*is + il; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); + y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); +#endif + +} + +static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) { + + const int i = blockIdx.x; + const block_q3_K * x = (const block_q3_K *) vx; + +#if QK_K == 256 + const int r = threadIdx.x/4; + const int tid = r/2; + const int is0 = r%2; + const int l0 = 16*is0 + 4*(threadIdx.x%4); + const int n = tid / 4; + const int j = tid - 4*n; + + uint8_t m = 1 << (4*n + j); + int is = 8*n + 2*j + is0; + int shift = 2*j; + + int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : + is < 8 ? 
(x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : + is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : + (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); + float d_all = x[i].d; + float dl = d_all * (us - 32); + + float * y = yy + i*QK_K + 128*n + 32*j; + const uint8_t * q = x[i].qs + 32*n; + const uint8_t * hm = x[i].hmask; + + for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); +#else + const int tid = threadIdx.x; + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const int im = il/8; // 0...1 + const int in = il%8; // 0...7 + + float * y = yy + i*QK_K + 16*is + il; + + const uint8_t q = x[i].qs[il] >> (2*is); + const uint8_t h = x[i].hmask[in] >> (2*is + im); + const float d = (float)x[i].d; + + if (is == 0) { + y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } else { + y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } +#endif + +} + +#if QK_K == 256 +static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { + if (j < 4) { + d = q[j] & 63; m = q[j + 4] & 63; + } else { + d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} +#endif + +static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) { + const block_q4_K * x = (const block_q4_K *) vx; + + const int i = blockIdx.x; + +#if QK_K == 256 + // assume 32 threads + const int tid = threadIdx.x; + const int il = tid/8; + const int ir = tid%8; + const int is = 2*il; + const int n = 4; + + float * y = yy + i*QK_K + 64*il + n*ir; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint8_t * q = x[i].qs + 32*il + n*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + for (int l = 0; l < n; ++l) { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l +32] = d2 * (q[l] >> 4) - m2; + } +#else + const int tid = threadIdx.x; + const uint8_t * q = x[i].qs; + float * y = yy + i*QK_K; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); + y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); +#endif +} + +static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) { + const block_q5_K * x = (const block_q5_K *) vx; + + const int i = blockIdx.x; + +#if QK_K == 256 + // assume 64 threads - this is very slightly better than the one below + const int tid = threadIdx.x; + const int il = tid/16; // il is in 0...3 + const int ir = tid%16; // ir is in 0...15 + const int is = 2*il; // is is in 0...6 + + float * y = yy + i*QK_K + 64*il + 2*ir; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint8_t * ql = x[i].qs + 32*il + 2*ir; + const uint8_t * qh = x[i].qh + 2*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + 
get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + + uint8_t hm = 1 << (2*il); + y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; + y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; +#else + const int tid = threadIdx.x; + const uint8_t q = x[i].qs[tid]; + const int im = tid/8; // 0...3 + const int in = tid%8; // 0...7 + const int is = tid/16; // 0 or 1 + const uint8_t h = x[i].qh[in] >> im; + const float d = x[i].d; + float * y = yy + i*QK_K + tid; + y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); + y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); +#endif +} + +static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) { + const block_q6_K * x = (const block_q6_K *) vx; + + const int i = blockIdx.x; +#if QK_K == 256 + + // assume 64 threads - this is very slightly better than the one below + const int tid = threadIdx.x; + const int ip = tid/32; // ip is 0 or 1 + const int il = tid - 32*ip; // 0...32 + const int is = 8*ip + il/16; + + float * y = yy + i*QK_K + 128*ip + il; + + const float d = x[i].d; + + const uint8_t * ql = x[i].ql + 64*ip + il; + const uint8_t qh = x[i].qh[32*ip + il]; + const int8_t * sc = x[i].scales + is; + + y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +#else + + // assume 32 threads + const int tid = threadIdx.x; + const int ip = tid/16; // 0 or 1 + const int il = tid - 16*ip; // 0...15 + + float * y = yy + i*QK_K + 16*ip + il; + + const float d = x[i].d; + + const uint8_t ql = x[i].ql[16*ip + il]; + const uint8_t qh = x[i].qh[il] >> (2*ip); + const int8_t * sc = x[i].scales; + + y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); +#endif +} + +static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q2_K * x = (const block_q2_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
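+    // in/l0 below give this thread's position inside the chosen 128-value half of the q2_K super-block; q_offset, s_offset and y_offset translate that position into offsets into the 2-bit quants, the 8 per-half scale/min bytes and the activation vector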
+ const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp += dall * sum1 - dmin * sum2; + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; + + uint32_t uaux[2]; + const uint8_t * d = (const uint8_t *)uaux; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint32_t * s = (const uint32_t *)x[i].scales; + + uaux[0] = s[0] & 0x0f0f0f0f; + uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; + + const float2 dall = __half22float2(x[i].dm); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t ql = q[l]; + sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) + + y[l+16] * d[1] * ((ql >> 2) & 3) + + y[l+32] * d[2] * ((ql >> 4) & 3) + + y[l+48] * d[3] * ((ql >> 6) & 3); + sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; + } + tmp += dall.x * sum1 - dall.y * sum2; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q3_K * x = (const block_q3_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
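+    // in locates the thread inside that half; m is the base hmask bit for the half and is shifted by 0..3 below to recover the high bit that extends each 2-bit quant group to 3 bits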
+ const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + const uint8_t * h = x[i].hmask + l0; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = x[i].d; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + + } +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 + const int in = offset/8; // 0 or 1 + const int im = offset%8; // 0...7 + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint8_t * s = x[i].scales; + + const float dall = (float)x[i].d; + + float sum = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t hl = x[i].hmask[im+l] >> in; + const uint8_t ql = q[l]; + sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) + + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) + + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) + + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 
0 : 4)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q4_K * x = (const block_q4_K *)vx + ib0; + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + + const int il = tid/step; // 0...3 + const int ir = tid - step*il; // 0...7 or 0...3 + const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + +#if K_QUANTS_PER_ITERATION == 2 + uint32_t q32[4]; + const uint8_t * q4 = (const uint8_t *)q32; +#else + uint16_t q16[4]; + const uint8_t * q4 = (const uint8_t *)q16; +#endif + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4]; + s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#else + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#endif + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + + const int step = tid * 
K_QUANTS_PER_ITERATION; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + float tmp = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const float * y = yy + i*QK_K + step; + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) + + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) + + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) + + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); + } + tmp += sum; + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) { + + const int row = blockIdx.x; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = threadIdx.x/2; // 0...15 + const int ix = threadIdx.x%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + uint16_t q16[8]; + const uint8_t * q4 = (const uint8_t *)q16; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * ql1 = x[i].qs + q_offset; + const uint8_t * qh = x[i].qh + l0; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; + for (int l = 0; l < n; ++l) { + sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) + + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) + + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) + + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 
16 : 0)) + + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; + } + +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + const int step = tid * K_QUANTS_PER_ITERATION; + const int im = step/8; + const int in = step%8; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const int8_t * s = x[i].scales; + const float * y = yy + i*QK_K + step; + const float d = x[i].d; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + const uint8_t h = x[i].qh[in+j] >> im; + sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) + + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) + + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q6_K * x = (const block_q6_K *)vx + ib0; + +#if QK_K == 256 + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
+ const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + + const float d = x[i].d; + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; +#endif + + } + +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3 + + const int step = tid * K_QUANTS_PER_ITERATION; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + step; + const uint8_t * ql = x[i].ql + step; + const uint8_t * qh = x[i].qh + step; + const int8_t * s = x[i].scales; + + const float d = x[i+0].d; + + float sum = 0; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) + + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) + + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) + + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); + } + tmp += sum; + + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const half * x = (const half *) vx; + + // automatic half -> float type cast if dfloat == float + v.x = x[ib + iqs + 0]; + v.y = x[ib + iqs + 1]; +} + +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) { + const int ix = blockDim.x*blockIdx.x + threadIdx.x; + + if (ix >= kx_padded) { + return; + } + + const int iy = blockDim.y*blockIdx.y + 
threadIdx.y; + + const int i_padded = iy*kx_padded + ix; + + block_q8_1 * y = (block_q8_1 *) vy; + + const int ib = i_padded / QK8_1; // block index + const int iqs = i_padded % QK8_1; // quant index + + const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; + float amax = fabsf(xi); + float sum = xi; + +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32)); + sum += __shfl_xor_sync(0xffffffff, sum, mask, 32); + } + + const float d = amax / 127; + const int8_t q = amax == 0.0f ? 0 : roundf(xi / d); + + y[ib].qs[iqs] = q; + + if (iqs > 0) { + return; + } + + reinterpret_cast(y[ib].ds.x) = d; + reinterpret_cast(y[ib].ds.y) = sum; +} + +template +static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) { + const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; + + if (i >= k) { + return; + } + + const int ib = i/qk; // block index + const int iqs = (i%qk)/qr; // quant index + const int iybs = i - i%qk; // y block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(vx, ib, iqs, v); + + y[iybs + iqs + 0] = v.x; + y[iybs + iqs + y_offset] = v.y; +} + +// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called +// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q + +#define VDR_Q4_0_Q8_1_MMVQ 2 +#define VDR_Q4_0_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( + const int * v, const int * u, const float & d4, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 8 from each quant value + return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_1_Q8_1_MMVQ 2 +#define VDR_Q4_1_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( + const int * v, const int * u, const half2 & dm4, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm4, ds8)); + const float d4d8 = tmp.x; + const float m4s8 = tmp.y; +#else + const float2 dm4f = __half22float2(dm4); + const float2 ds8f = __half22float2(ds8); + const float d4d8 = dm4f.x * ds8f.x; + const float m4s8 = dm4f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it + return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_0_Q8_1_MMVQ 2 +#define VDR_Q5_0_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( + 
const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 16 from each quant value + return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_1_Q8_1_MMVQ 2 +#define VDR_Q5_1_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( + const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm5, ds8)); + const float d5d8 = tmp.x; + const float m5s8 = tmp.y; +#else + const float2 dm5f = __half22float2(dm5); + const float2 ds8f = __half22float2(ds8); + const float d5d8 = dm5f.x * ds8f.x; + const float m5s8 = dm5f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it + return sumi*d5d8 + m5s8 / (QI5_1 / vdr); + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q8_0_Q8_1_MMVQ 2 +#define VDR_Q8_0_Q8_1_MMQ 8 + +template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( + const int * v, const int * u, const float & d8_0, const float & d8_1) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + + return d8_0*d8_1 * sumi; +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +template static __device__ __forceinline__ 
float vec_dot_q8_1_q8_1_impl( + const int * v, const int * u, const half2 & dm8, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm8, ds8)); + const float d8d8 = tmp.x; + const float m8s8 = tmp.y; +#else + const float2 dm8f = __half22float2(dm8); + const float2 ds8f = __half22float2(ds8); + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it + return sumi*d8d8 + m8s8 / (QI8_1 / vdr); +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q2_K_Q8_1_MMVQ 1 +#define VDR_Q2_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( + const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR2_K; ++i) { + const int sc = scales[2*i]; + + const int vi = (v >> (2*i)) & 0x03030303; + + sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values + } + + const float2 dm2f = __half22float2(dm2); + + return dm2f.x*sumf_d - dm2f.y*sumf_m; +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float & d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi_d = 0; + int sumi_m = 0; + +#pragma unroll + for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { + int sumi_d_sc = 0; + + const int sc = scales[i0 / (QI8_1/2)]; + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + +#pragma unroll + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product + sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m + } + + sumi_d += sumi_d_sc * (sc & 0xF); + } + + const float2 dm2f = __half22float2(dm2); + + return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q3_K_Q8_1_MMVQ 1 +#define VDR_Q3_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const int & scale_offset, const float & d3, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 
4 * (isc / (QK_K/32)); + const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = __vsubss4(vil, vih); + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d3 * sumf; +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d3, const float & d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { + int sumi_sc = 0; + + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product + } + + sumi += sumi_sc * scales[i0 / (QI8_1/2)]; + } + + return d3*d8 * sumi; +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_K_Q8_1_MMVQ 2 +#define VDR_Q4_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K; ++i) { + const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; + const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; + + const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + 
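+// Note on the vec_dot_*_impl helpers above and below: __dp4a(a, b, c) is the
+// integer SIMD intrinsic that multiplies the four packed 8-bit lanes of a and b
+// and adds the four products to the accumulator c, which is why every helper is
+// guarded by __CUDA_ARCH__ >= MIN_CC_DP4A. Example: __dp4a(0x01020304, 0x01010101, 0)
+// evaluates to 1 + 2 + 3 + 4 = 10. For the asymmetric K-quants the half2 dm packs
+// the super-block scale in .x and the super-block minimum in .y, so these helpers
+// accumulate a scaled dot product (sumf_d) and a minimum correction built from the
+// sums of the q8_1 values (sumf_m) and return dm.x*sumf_d - dm.y*sumf_m.
+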
+#define VDR_Q5_K_Q8_1_MMVQ 2 +#define VDR_Q5_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( + const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; + const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; + + const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; + const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; + + const int v0i = vl0i | vh0i; + const int v1i = vl1i | vh1i; + + const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); + + } + + const float2 dm5f = __half22float2(dm5); + + return dm5f.x*sumf_d - dm5f.y*sumf_m; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q6_K_Q8_1_MMVQ 1 +#define VDR_Q6_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4*i]; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4*i)) << 4) & 0x30303030; + + const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc, + const float & d6, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + +#pragma unroll + 
for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { + int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale + +#pragma unroll + for (int i = i0; i < i0 + 2; ++i) { + sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product + sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product + + sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product + sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product + } + + sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y); + } + + return d6 * sumf_d; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q4_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + + int v[VDR_Q4_0_Q8_1_MMVQ]; + int u[2*VDR_Q4_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); + } + + return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); +} + +template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; + + *x_ql = tile_x_qs; + *x_dm = (half2 *) tile_x_d; +} + +template static __device__ __forceinline__ void load_tiles_q4_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < nwarps); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI4_0; + const int kqsx = k % QI4_0; + + const block_q4_0 * bx0 = (block_q4_0 *) vx; + + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } +} + +static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const float * x_dmf = (float *) x_dm; + + int u[2*VDR_Q4_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + } + + return 
vec_dot_q4_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __device__ __forceinline__ float vec_dot_q4_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; + + int v[VDR_Q4_1_Q8_1_MMVQ]; + int u[2*VDR_Q4_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); + } + + return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); +} + +template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; + + *x_ql = tile_x_qs; + *x_dm = tile_x_dm; +} + +template static __device__ __forceinline__ void load_tiles_q4_1( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < nwarps); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI4_1; + const int kqsx = k % QI4_1; + + const block_q4_1 * bx0 = (block_q4_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { + int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + } +} + +static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + + int u[2*VDR_Q4_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + } + + return vec_dot_q4_1_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; + + int vl[VDR_Q5_0_Q8_1_MMVQ]; + int vh[VDR_Q5_0_Q8_1_MMVQ]; + int u[2*VDR_Q5_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); + vh[i] = 
get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); + } + + return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); +} + +template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; + + *x_ql = tile_x_ql; + *x_dm = (half2 *) tile_x_d; +} + +template static __device__ __forceinline__ void load_tiles_q5_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < nwarps); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI5_0; + const int kqsx = k % QI5_0; + + const block_q5_0 * bx0 = (block_q5_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8(bxi->qs, kqsx); + const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + qs0 = __vsubss4(qs0, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + qs1 = __vsubss4(qs1, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { + int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; + } +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + int u[2*VDR_Q5_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + } + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ 
bq8_1, const int & iqs) { + + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; + + int vl[VDR_Q5_1_Q8_1_MMVQ]; + int vh[VDR_Q5_1_Q8_1_MMVQ]; + int u[2*VDR_Q5_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); + vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); + } + + return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); +} + +template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; +} + +template static __device__ __forceinline__ void load_tiles_q5_1( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < nwarps); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI5_1; + const int kqsx = k % QI5_1; + + const block_q5_1 * bx0 = (block_q5_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { + int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + } +} + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; + + int u[2*VDR_Q5_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + } + + return vec_dot_q8_1_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __device__ 
__forceinline__ float vec_dot_q8_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; + + int v[VDR_Q8_0_Q8_1_MMVQ]; + int u[VDR_Q8_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_int8(bq8_0->qs, iqs + i); + u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + } + + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); +} + +template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; + + *x_ql = tile_x_qs; + *x_dm = (half2 *) tile_x_d; +} + +template static __device__ __forceinline__ void load_tiles_q8_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < nwarps); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI8_0; + const int kqsx = k % QI8_0; + float * x_dmf = (float *) x_dm; + + const block_q8_0 * bx0 = (block_q8_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } +} + +static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], + y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const uint8_t * scales = bq2_K->scales + scale_offset; + + const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); + int u[QR2_K]; + float d8[QR2_K]; + +#pragma unroll + for (int i = 0; i < QR2_K; ++ i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); + } + + return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); +} + +template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int 
tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q2_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < nwarps); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI2_K; + const int kqsx = k % QI2_K; + + const block_q2_K * bx0 = (block_q2_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); + } +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kbx = k / QI2_K; + const int ky = (k % QI2_K) * QR2_K; + const float * y_df = (const float *) y_ds; + + int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; + + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); + +#pragma unroll + for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { + v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; + } + + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; + return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const float d = bq3_K->d; + + const int vl = get_int_from_uint8(bq3_K->qs, iqs); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) 
{ + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = __low2half(bq8_1[bq8_offset + i].ds); + } + + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); +} + +template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; + __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_qh = tile_x_qh; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q3_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < nwarps); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI3_K; + const int kqsx = k % QI3_K; + + const block_q3_K * bx0 = (block_q3_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { + int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); + + const int ksc = k % (QI3_K/4); + + const int ksc_low = ksc % (QI3_K/8); + const int shift_low = 4 * (ksc / (QI3_K/8)); + const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; + + const int ksc_high = QI3_K/8; + const int shift_high = 2 * ksc; + const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; + + const int sc = __vsubss4(sc_low | sc_high, 0x20202020); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; + } +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kbx = k / QI3_K; + const int ky = (k % QI3_K) * QR3_K; + const float * x_dmf = 
(const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + + int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int shift = 2 * ((ky % 32) / 8); + const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; + + const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vlh = (vh << 2) & 0x04040404; + + v[l] = __vsubss4(vll, vlh); + } + + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; + return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + +#ifndef GGML_QKK_64 + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + int v[2]; + int u[2*QR4_K]; + float d8[QR4_K]; + + // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); + + // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 + // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 + // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 + // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 + + const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + v[0] = q4[0]; + v[1] = q4[4]; + + const uint16_t * scales = (const uint16_t *)bq4_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + + for (int i = 0; i < QR4_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = __low2half(bq8i->ds); + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + const uint16_t * a = (const uint16_t *)bq4_K->scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const float dall = bq4_K->dm[0]; + const float dmin = bq4_K->dm[1]; + + const float d8_1 = __low2float(bq8_1[0].ds); + const float d8_2 = __low2float(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * q4 = (const int *)bq4_K->qs + (iqs/2); + const int v1 = q4[0]; + const int v2 = q4[4]; + + const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0)); + const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); + const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); + const int dot4 = __dp4a(0x01010101, ui4, 
__dp4a(0x01010101, ui3, 0)); + + sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); + sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); + + return dall * sumf_d - dmin * sumf_m; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q4_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < nwarps); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI4_K; // == 0 if QK_K == 256 + const int kqsx = k % QI4_K; // == k if QK_K == 256 + + const block_q4_K * bx0 = (block_q4_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; +#else + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); + + const int * scales = (int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); + + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1( + const void * __restrict__ 
vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + +#ifndef GGML_QKK_64 + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + int vl[2]; + int vh[2]; + int u[2*QR5_K]; + float d8[QR5_K]; + + const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); + const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); + + vl[0] = ql[0]; + vl[1] = ql[4]; + + vh[0] = qh[0] >> bq8_offset; + vh[1] = qh[4] >> bq8_offset; + + const uint16_t * scales = (const uint16_t *)bq5_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = __low2float(bq8i->ds); + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int8_t * s = bq5_K->scales; + + const float d = bq5_K->d; + + const float d8_1 = __low2half(bq8_1[0].ds); + const float d8_2 = __low2half(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * ql = (const int *)bq5_K->qs + (iqs/2); + const int vl1 = ql[0]; + const int vl2 = ql[4]; + + const int step = 4 * (iqs/2); // 0, 4, 8, 12 + const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 + const int in = step%8; // 0, 4, 0, 4 + const int vh = (*((const int *)(bq5_K->qh + in))) >> im; + + const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); + const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); + const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); + const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); + + const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1]) + + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]); + + return d * sumf_d; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q5_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < nwarps); + __builtin_assume(k >= 
0); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI5_K; // == 0 if QK_K == 256 + const int kqsx = k % QI5_K; // == k if QK_K == 256 + + const block_q5_K * bx0 = (block_q5_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR5_K*kqsx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); + const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; + const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; + + const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; + const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); + + const int * scales = (int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); + + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; + return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + const int vl = get_int_from_uint8(bq6_K->ql, iqs); + const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; + + const int8_t * scales = bq6_K->scales + 
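+ /* q6_K stores 16 int8 scales per 256-weight super-block; scale_offset picks the
+    scales covering the slice selected by iqs. Each 6-bit quant is rebuilt from 4
+    low bits in ql plus 2 high bits in qh and re-centered by 32, which is what the
+    0x20202020 adjustments in the impl and tile-loading code account for. */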
scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); + d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds); + } + + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); +} + +template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q6_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + __builtin_assume(i_offset >= 0); + __builtin_assume(i_offset < nwarps); + __builtin_assume(k >= 0); + __builtin_assume(k < WARP_SIZE); + + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + const block_q6_K * bx0 = (block_q6_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR6_K*kqsx; + + const int ql = get_int_from_uint8(bxi->ql, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); + const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; + const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; + + const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; + const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); + x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { + int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; + + x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); + } +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); + + const int index_x 
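+ /* In the *_mul_mat (MMQ) dot products the quants were already unpacked into
+    shared-memory tiles by load_tiles_*, so only index math remains: index_x and
+    index_y map the tile coordinates (i, j, k) onto the padded row layout (the
+    extra +1 int per row is padding, presumably to avoid bank conflicts). */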
= i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; + return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); +} + +template +static __device__ __forceinline__ void mul_mat_q( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + const int blocks_per_warp = WARP_SIZE / qi; + + const int & ncols_dst = ncols_y; + + const int row_dst_0 = blockIdx.x*mmq_y; + const int & row_x_0 = row_dst_0; + + const int col_dst_0 = blockIdx.y*mmq_x; + const int & col_y_0 = col_dst_0; + + int * tile_x_ql = nullptr; + half2 * tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + + allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + + __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; + __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; + + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f}; + + for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { + + load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, + threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x); + +#pragma unroll + for (int ir = 0; ir < qr; ++ir) { + const int kqs = ir*WARP_SIZE + threadIdx.x; + const int kbxd = kqs / QI8_1; + +#pragma unroll + for (int i = 0; i < mmq_x; i += nwarps) { + const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses + + const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; + + const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE; + tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); + } + +#pragma unroll + for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { + const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; + const int kby = threadIdx.x % (WARP_SIZE/QI8_1); + const int col_y_eff = min(col_y_0 + ids, ncols_y-1); + + // if the sum is not needed it's faster to transform the scale to f32 ahead of time + const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds; + half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = __low2half(*dsi_src); + } + } + + __syncthreads(); + +// #pragma unroll // unrolling this loop causes too much register pressure + for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + sum[i/WARP_SIZE][j/nwarps] += vec_dot( + tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, + threadIdx.x + i, threadIdx.y + j, k); + } + } + } + + __syncthreads(); + } + } + +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { + const int col_dst = col_dst_0 + j + threadIdx.y; + + if (col_dst >= ncols_dst) { + return; + } + +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + const int row_dst = row_dst_0 + threadIdx.x + i; + + if (row_dst >= nrows_dst) { + continue; + } + + dst[col_dst*nrows_dst 
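+ /* mul_mat_q: each thread block computes an mmq_y x mmq_x tile of dst. Per outer
+    iteration it loads blocks_per_warp quant blocks of x into shared memory via
+    load_tiles, copies the matching q8_1 slice of y into tile_y_qs/tile_y_ds,
+    accumulates with vec_dot, and finally writes the per-thread sums here; dst is
+    indexed column-major, hence col_dst*nrows_dst + row_dst. */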
+ row_dst] = sum[i/WARP_SIZE][j/nwarps]; + } + } +} + +#define MMQ_X_Q4_0_AMPERE 64 +#define MMQ_Y_Q4_0_AMPERE 128 +#define NWARPS_Q4_0_AMPERE 4 +#define MMQ_X_Q4_0_PASCAL 64 +#define MMQ_Y_Q4_0_PASCAL 64 +#define NWARPS_Q4_0_PASCAL 8 + +template static __global__ void mul_mat_q4_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q4_0_AMPERE; + const int mmq_y = MMQ_Y_Q4_0_AMPERE; + const int nwarps = NWARPS_Q4_0_AMPERE; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_0_PASCAL; + const int mmq_y = MMQ_Y_Q4_0_PASCAL; + const int nwarps = NWARPS_Q4_0_PASCAL; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_0_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q4_1_AMPERE 64 +#define MMQ_Y_Q4_1_AMPERE 128 +#define NWARPS_Q4_1_AMPERE 4 +#define MMQ_X_Q4_1_PASCAL 64 +#define MMQ_Y_Q4_1_PASCAL 64 +#define NWARPS_Q4_1_PASCAL 8 + +template static __global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q4_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q4_1_AMPERE; + const int mmq_y = MMQ_Y_Q4_1_AMPERE; + const int nwarps = NWARPS_Q4_1_AMPERE; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_1_PASCAL; + const int mmq_y = MMQ_Y_Q4_1_PASCAL; + const int nwarps = NWARPS_Q4_1_PASCAL; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_1_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q5_0_AMPERE 128 +#define MMQ_Y_Q5_0_AMPERE 64 +#define NWARPS_Q5_0_AMPERE 4 +#define MMQ_X_Q5_0_PASCAL 64 +#define MMQ_Y_Q5_0_PASCAL 64 +#define NWARPS_Q5_0_PASCAL 8 + +template static __global__ void mul_mat_q5_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q5_0_AMPERE; + const int mmq_y = MMQ_Y_Q5_0_AMPERE; + const int nwarps = NWARPS_Q5_0_AMPERE; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_0_PASCAL; + const int mmq_y = MMQ_Y_Q5_0_PASCAL; + const int nwarps = NWARPS_Q5_0_PASCAL; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_0_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define 
MMQ_X_Q5_1_AMPERE 128 +#define MMQ_Y_Q5_1_AMPERE 64 +#define NWARPS_Q5_1_AMPERE 4 +#define MMQ_X_Q5_1_PASCAL 64 +#define MMQ_Y_Q5_1_PASCAL 64 +#define NWARPS_Q5_1_PASCAL 8 + +template static __global__ void mul_mat_q5_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q5_1_AMPERE; + const int mmq_y = MMQ_Y_Q5_1_AMPERE; + const int nwarps = NWARPS_Q5_1_AMPERE; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_1_PASCAL; + const int mmq_y = MMQ_Y_Q5_1_PASCAL; + const int nwarps = NWARPS_Q5_1_PASCAL; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_1_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q8_0_AMPERE 128 +#define MMQ_Y_Q8_0_AMPERE 64 +#define NWARPS_Q8_0_AMPERE 4 +#define MMQ_X_Q8_0_PASCAL 64 +#define MMQ_Y_Q8_0_PASCAL 64 +#define NWARPS_Q8_0_PASCAL 8 + +template static __global__ void mul_mat_q8_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q8_0_AMPERE; + const int mmq_y = MMQ_Y_Q8_0_AMPERE; + const int nwarps = NWARPS_Q8_0_AMPERE; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q8_0_PASCAL; + const int mmq_y = MMQ_Y_Q8_0_PASCAL; + const int nwarps = NWARPS_Q8_0_PASCAL; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q8_0_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q2_K_AMPERE 64 +#define MMQ_Y_Q2_K_AMPERE 128 +#define NWARPS_Q2_K_AMPERE 4 +#define MMQ_X_Q2_K_PASCAL 64 +#define MMQ_Y_Q2_K_PASCAL 64 +#define NWARPS_Q2_K_PASCAL 8 + +template static __global__ void mul_mat_q2_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q2_K_AMPERE; + const int mmq_y = MMQ_Y_Q2_K_AMPERE; + const int nwarps = NWARPS_Q2_K_AMPERE; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q2_K_PASCAL; + const int mmq_y = MMQ_Y_Q2_K_PASCAL; + const int nwarps = NWARPS_Q2_K_PASCAL; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q2_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q3_K_AMPERE 128 +#define MMQ_Y_Q3_K_AMPERE 128 +#define NWARPS_Q3_K_AMPERE 4 +#define MMQ_X_Q3_K_PASCAL 64 +#define MMQ_Y_Q3_K_PASCAL 64 +#define NWARPS_Q3_K_PASCAL 8 + +template static 
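+ /* Each quantization type gets a thin __global__ wrapper like the ones above: it
+    only selects the tile shape per architecture (the *_AMPERE constants for
+    CC_TURING and newer, *_PASCAL for anything with DP4A support) and forwards to
+    the mul_mat_q template; older GPUs just assert. The __launch_bounds__ on the
+    Pascal path caps register use so at least two blocks fit per SM. */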
__global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q3_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q3_K_AMPERE; + const int mmq_y = MMQ_Y_Q3_K_AMPERE; + const int nwarps = NWARPS_Q3_K_AMPERE; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q3_K_PASCAL; + const int mmq_y = MMQ_Y_Q3_K_PASCAL; + const int nwarps = NWARPS_Q3_K_PASCAL; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q3_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q4_K_AMPERE 64 +#define MMQ_Y_Q4_K_AMPERE 128 +#define NWARPS_Q4_K_AMPERE 4 +#define MMQ_X_Q4_K_PASCAL 64 +#define MMQ_Y_Q4_K_PASCAL 64 +#define NWARPS_Q4_K_PASCAL 8 + +template static __global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q4_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q4_K_AMPERE; + const int mmq_y = MMQ_Y_Q4_K_AMPERE; + const int nwarps = NWARPS_Q4_K_AMPERE; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_K_PASCAL; + const int mmq_y = MMQ_Y_Q4_K_PASCAL; + const int nwarps = NWARPS_Q4_K_PASCAL; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q5_K_AMPERE 64 +#define MMQ_Y_Q5_K_AMPERE 128 +#define NWARPS_Q5_K_AMPERE 4 +#define MMQ_X_Q5_K_PASCAL 64 +#define MMQ_Y_Q5_K_PASCAL 64 +#define NWARPS_Q5_K_PASCAL 8 + +template static __global__ void mul_mat_q5_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q5_K_AMPERE; + const int mmq_y = MMQ_Y_Q5_K_AMPERE; + const int nwarps = NWARPS_Q5_K_AMPERE; + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_K_PASCAL; + const int mmq_y = MMQ_Y_Q5_K_PASCAL; + const int nwarps = NWARPS_Q5_K_PASCAL; + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q6_K_AMPERE 64 +#define MMQ_Y_Q6_K_AMPERE 64 +#define NWARPS_Q6_K_AMPERE 4 +#define MMQ_X_Q6_K_PASCAL 64 +#define MMQ_Y_Q6_K_PASCAL 
64 +#define NWARPS_Q6_K_PASCAL 8 + +template static __global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q6_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q6_K_AMPERE; + const int mmq_y = MMQ_Y_Q6_K_AMPERE; + const int nwarps = NWARPS_Q6_K_AMPERE; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q6_K_PASCAL; + const int mmq_y = MMQ_Y_Q6_K_PASCAL; + const int nwarps = NWARPS_Q6_K_PASCAL; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q6_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +template +static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = 0; i < blocks_per_row; i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index + + const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +template +static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + + const int tid = threadIdx.x; + + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 
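+ /* dequantize_mul_mat_vec is the mat-vec path that dequantizes on the fly: each
+    thread handles vals_per_iter weights per outer iteration, dequantize_kernel
+    yields a pair of values, and for qr == 2 formats the second value of the pair
+    multiplies the y element qk/2 positions further on, which is what y_offset
+    encodes; with GGML_CUDA_F16 the accumulation stays in half2. */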
1 : qk/2; + +// partial sum for each thread +#ifdef GGML_CUDA_F16 + half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_CUDA_F16 + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_CUDA_F16 + tmp += __hmul2(v, { + y[iybs + iqs + j/qr + 0], + y[iybs + iqs + j/qr + y_offset] + }); +#else + tmp += v.x * y[iybs + iqs + j/qr + 0]; + tmp += v.y * y[iybs + iqs + j/qr + y_offset]; +#endif // GGML_CUDA_F16 + } + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { +#ifdef GGML_CUDA_F16 + dst[row] = tmp.x + tmp.y; +#else + dst[row] = tmp; +#endif // GGML_CUDA_F16 + } +} + +static __global__ void mul_mat_p021_f16_f32( + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) { + + const half * x = (const half *) vx; + + const int row_x = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int channel_x = channel / (nchannels_y / nchannels_x); + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) { + const int col_x = col_x0 + threadIdx.x; + + if (col_x >= ncols_x) { + break; + } + + // x is transposed and permuted + const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x; + const float xi = __half2float(x[ix]); + + const int row_y = col_x; + + + // y is not transposed but permuted + const int iy = channel*nrows_y + row_y; + + tmp += xi * y[iy]; + } + + // dst is not transposed and not permuted + const int idst = channel*nrows_dst + row_dst; + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[idst] = tmp; + } +} + +static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, + const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) { + + const half * x = (const half *) vx; + + const int row_x = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int channel_x = channel / channel_x_divisor; + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + const int idst = channel*nrows_dst + row_dst; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) { + const int col_x = col_x0 + threadIdx.x; + + if (col_x >= ncols_x) { + break; 
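+ /* mul_mat_p021_f16_f32 above and mul_mat_vec_nc_f16_f32 here handle permuted /
+    non-contiguous f16 src0 tensors (the attention K and V cases): one warp
+    reduces each (row, channel) pair over ncols_x, and channel_x_divisor
+    (nchannels_y/nchannels_x) broadcasts one x channel across several y channels. */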
+ } + + const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; + const float xi = __half2float(x[ix]); + + const int row_y = col_x; + + const int iy = channel*nrows_y + row_y; + + tmp += xi * y[iy]; + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[idst] = tmp; + } +} + +static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + half * dsti = (half *) cdsti; + + *dsti = __float2half(*xi); +} + +template +static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= ne) { + return; + } + + // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor + // then combine those indices with the corresponding byte offsets to get the total offsets + const int i02 = i / (ne00*ne01); + const int i01 = (i - i02*ne01*ne00) / ne00; + const int i00 = i - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + + const int i12 = i / (ne10*ne11); + const int i11 = (i - i12*ne10*ne11) / ne10; + const int i10 = i - i12*ne10*ne11 - i11*ne10; + const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + + cpy_1(cx + x_offset, cdst + dst_offset); +} + +// rope == RoPE == rotary positional embedding +static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0, + const float p_delta, const int p_delta_rows, const float theta_scale) { + const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); + + if (col >= ncols) { + return; + } + + const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int i = row*ncols + col; + + const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + 1]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + 1] = x0*sin_theta + x1*cos_theta; +} + +static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0, + const float p_delta, const int p_delta_rows, const float theta_scale) { + const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); + + if (col >= ncols) { + return; + } + + const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int i = row*ncols + col/2; + + const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + ncols/2]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + ncols/2] = x0*sin_theta + x1*cos_theta; +} + +static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0, + const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + const int half_n_dims = ncols/4; + + if (col >= half_n_dims) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + 
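+ /* The GLM RoPE variant applies two independent 2-D rotations per row: the pair
+    (x[i], x[i + ncols/4]) uses an angle whose position term is clamped at
+    p_delta*(n_ctx - 2), and the pair (x[i + ncols/2], x[i + 3*ncols/4]) uses the
+    leftover "block" angle max(p - p_delta*(n_ctx - 2), 0). */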
const int i = row*ncols + col; + + const float col_theta_scale = powf(theta_scale, col); + const float p = p0 + p_delta*(row/p_delta_rows); + + const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale; + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + half_n_dims]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; + + const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale; + const float sin_block_theta = sinf(block_theta); + const float cos_block_theta = cosf(block_theta); + + const float x2 = x[i + half_n_dims * 2]; + const float x3 = x[i + half_n_dims * 3]; + + dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; + dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; +} + +static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows, + const int n_heads_log2_floor, const float m0, const float m1) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + + if (col >= ncols) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int i = row*ncols + col; + + const int k = row/k_rows; + + float m_k; + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + dst[i] = col * m_k + x[i]; +} + +static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { + const int col = blockDim.y*blockIdx.y + threadIdx.y; + const int row = blockDim.x*blockIdx.x + threadIdx.x; + + if (col >= ncols) { + return; + } + + const int i = row*ncols + col; + // dst[i] = col > n_past + row ? 
-INFINITY : x[i]; + dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU +} + +// the CUDA soft max implementation differs from the CPU implementation +// instead of doubles floats are used +static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) { + const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int block_size = blockDim.y; + const int tid = threadIdx.y; + + float max_val = -INFINITY; + + for (int col = tid; col < ncols; col += block_size) { + const int i = row*ncols + col; + max_val = max(max_val, x[i]); + } + + // find the max value in the block +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32)); + } + + float tmp = 0.f; + + for (int col = tid; col < ncols; col += block_size) { + const int i = row*ncols + col; + const float val = expf(x[i] - max_val); + tmp += val; + dst[i] = val; + } + + // sum up partial sums +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + const float inv_tmp = 1.f / tmp; + + for (int col = tid; col < ncols; col += block_size) { + const int i = row*ncols + col; + dst[i] *= inv_tmp; + } +} + +static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = scale * x[i]; +} + +static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { + const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f32<<>>(x, y, dst, kx, ky); +} + +static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f16_f32_f16<<>>(x, y, dst, k); +} + +static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { + const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE; + mul_f32<<>>(x, y, dst, kx, ky); +} + +static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; + gelu_f32<<>>(x, dst, k); +} + +static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE; + silu_f32<<>>(x, dst, k); +} + +static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + if (ncols < 1024) { + const dim3 block_dims(WARP_SIZE, 1, 1); + norm_f32<<>>(x, dst, ncols); + } else { + const dim3 block_dims(1024, 1, 1); + norm_f32<1024><<>>(x, dst, ncols); + } +} + +static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + if (ncols < 1024) { + const dim3 block_dims(WARP_SIZE, 1, 1); + rms_norm_f32<<>>(x, dst, ncols, eps); + } else { + const dim3 block_dims(1024, 1, 1); + rms_norm_f32<1024><<>>(x, dst, ncols, eps); + } +} + +static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) { + const int block_num_x = (kx_padded + 
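+ /* From here on these are host-side launchers that only compute grid and block
+    dimensions. quantize_row_q8_1_cuda quantizes the activation matrix to q8_1 for
+    the *_q8_1 kernels; kx_padded is the row length rounded up so every MMQ tile
+    reads whole q8_1 blocks. */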
CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const dim3 num_blocks(block_num_x, ky, 1); + const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1); + quantize_q8_1<<>>(x, vy, kx, kx_padded); +} + +static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + dequantize_block<<>>(vx, y, k); +} + +static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + dequantize_block<<>>(vx, y, k); +} + +static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + dequantize_block<<>>(vx, y, k); +} + +static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + dequantize_block<<>>(vx, y, k); +} + +static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + dequantize_block<<>>(vx, y, k); +} + +static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q2_K<<>>(vx, y); +#else + dequantize_block_q2_K<<>>(vx, y); +#endif +} + +static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q3_K<<>>(vx, y); +#else + dequantize_block_q3_K<<>>(vx, y); +#endif +} + +static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_q4_K<<>>(vx, y); +} + +static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q5_K<<>>(vx, y); +#else + dequantize_block_q5_K<<>>(vx, y); +#endif +} + +static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q6_K<<>>(vx, y); +#else + dequantize_block_q6_K<<>>(vx, y); +#endif +} + +static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % 
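+ /* The dequantize_mul_mat_vec_* wrappers share one geometry: a block of
+    (WARP_SIZE, GGML_CUDA_MMV_Y) threads covers GGML_CUDA_MMV_Y rows, so the grid
+    is (1, ceil(nrows / GGML_CUDA_MMV_Y)); the K-quant variants below size the
+    rows per block from K_QUANTS_PER_ITERATION instead. */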
GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q2_k<<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q3_k<<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q4_k<<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const dim3 block_dims(32, 1, 1); + dequantize_mul_mat_vec_q5_k<<>>(vx, y, dst, ncols); +} + +static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q6_k<<>>(vx, y, dst, ncols, nrows); +} + +static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, 
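+ /* The mul_mat_vec_q*_q8_1_cuda wrappers use the same launch geometry but
+    dispatch to the integer mul_mat_vec_q kernel, instantiated with each type's
+    block size (qk), ints of quant data per block (qi), VDR (contiguous ints
+    handled per thread per dot product) and vec_dot_* function, so both operands
+    stay quantized. */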
nrows); +} + +static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK4_1 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK5_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK5_1 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 
block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + dequantize_block<1, 1, convert_f16><<>>(vx, y, k); +} + +static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec<1, 1, convert_f16> + <<>>(vx, y, dst, ncols, nrows); +} + +static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_row_q4_0_cuda; + case GGML_TYPE_Q4_1: + return dequantize_row_q4_1_cuda; + case GGML_TYPE_Q5_0: + return dequantize_row_q5_0_cuda; + case GGML_TYPE_Q5_1: + return dequantize_row_q5_1_cuda; + case GGML_TYPE_Q8_0: + return dequantize_row_q8_0_cuda; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + case GGML_TYPE_F16: + return convert_fp16_to_fp32_cuda; + default: + return nullptr; + } +} + +static void ggml_mul_mat_q4_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q4_0_AMPERE; + mmq_y = MMQ_Y_Q4_0_AMPERE; + nwarps = NWARPS_Q4_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_0_PASCAL; + mmq_y = MMQ_Y_Q4_0_PASCAL; + nwarps = NWARPS_Q4_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q4_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q4_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q4_1_AMPERE; + mmq_y = MMQ_Y_Q4_1_AMPERE; + nwarps = NWARPS_Q4_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_1_PASCAL; + mmq_y = MMQ_Y_Q4_1_PASCAL; + nwarps = NWARPS_Q4_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 
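+ /* The ggml_mul_mat_q*_q8_1_cuda launchers all follow one pattern: query the
+    current device's compute capability, pick the same AMPERE/PASCAL tile shape
+    the kernel will compile to, set the grid to
+    (ceil(nrows_x/mmq_y), ceil(ncols_y/mmq_x)), and use the need_check=false
+    specialization only when nrows_x is a multiple of mmq_y, so the tile loads can
+    skip the row-bounds clamp. */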
block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q5_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q5_0_AMPERE; + mmq_y = MMQ_Y_Q5_0_AMPERE; + nwarps = NWARPS_Q5_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_0_PASCAL; + mmq_y = MMQ_Y_Q5_0_PASCAL; + nwarps = NWARPS_Q5_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q5_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q5_1_AMPERE; + mmq_y = MMQ_Y_Q5_1_AMPERE; + nwarps = NWARPS_Q5_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_1_PASCAL; + mmq_y = MMQ_Y_Q5_1_PASCAL; + nwarps = NWARPS_Q5_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q8_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q8_0_AMPERE; + mmq_y = MMQ_Y_Q8_0_AMPERE; + nwarps = NWARPS_Q8_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q8_0_PASCAL; + mmq_y = MMQ_Y_Q8_0_PASCAL; + nwarps = NWARPS_Q8_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 
1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q2_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q2_K_AMPERE; + mmq_y = MMQ_Y_Q2_K_AMPERE; + nwarps = NWARPS_Q2_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q2_K_PASCAL; + mmq_y = MMQ_Y_Q2_K_PASCAL; + nwarps = NWARPS_Q2_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q3_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + +#if QK_K == 256 + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q3_K_AMPERE; + mmq_y = MMQ_Y_Q3_K_AMPERE; + nwarps = NWARPS_Q3_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q3_K_PASCAL; + mmq_y = MMQ_Y_Q3_K_PASCAL; + nwarps = NWARPS_Q3_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +#endif +} + +static void ggml_mul_mat_q4_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q4_K_AMPERE; + mmq_y = MMQ_Y_Q4_K_AMPERE; + nwarps = NWARPS_Q4_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_K_PASCAL; + mmq_y = MMQ_Y_Q4_K_PASCAL; + nwarps = NWARPS_Q4_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check 
= false; + mul_mat_q4_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q4_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q5_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q5_K_AMPERE; + mmq_y = MMQ_Y_Q5_K_AMPERE; + nwarps = NWARPS_Q5_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_K_PASCAL; + mmq_y = MMQ_Y_Q5_K_PASCAL; + nwarps = NWARPS_Q5_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q6_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q6_K_AMPERE; + mmq_y = MMQ_Y_Q6_K_AMPERE; + nwarps = NWARPS_Q6_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q6_K_PASCAL; + mmq_y = MMQ_Y_Q6_K_PASCAL; + nwarps = NWARPS_Q6_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q6_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q6_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_p021_f16_f32_cuda( + const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, + const int nchannels_x, const int nchannels_y, cudaStream_t stream) { + + const dim3 block_nums(1, nrows_x, nchannels_y); + const dim3 block_dims(WARP_SIZE, 1, 1); + mul_mat_p021_f16_f32<<>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y); +} + +static void ggml_mul_mat_vec_nc_f16_f32_cuda( + const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x, + const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) { + + const dim3 block_nums(1, nrows_x, nchannels_y); + const dim3 block_dims(WARP_SIZE, 1, 1); + mul_mat_vec_nc_f16_f32<<>> + (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x); +} + +static void ggml_cpy_f32_f32_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const 
int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void ggml_cpy_f32_f16_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; + scale_f32<<>>(x, dst, scale, k); +} + +static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, + const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { + GGML_ASSERT(ncols % 2 == 0); + const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(nrows, num_blocks_x, 1); + rope_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); +} + +static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, + const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { + GGML_ASSERT(ncols % 2 == 0); + const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(nrows, num_blocks_x, 1); + rope_neox_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); +} + +static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, + const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) { + GGML_ASSERT(ncols % 4 == 0); + const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); + const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; + const dim3 block_nums(num_blocks_x, nrows, 1); + rope_glm_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx); +} + +static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, + const int k_rows, const int n_heads_log2_floor, const float m0, + const float m1, cudaStream_t stream) { + const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1); + const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE); + const dim3 block_nums(num_blocks_x, nrows, 1); + alibi_f32<<>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1); +} + +static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { + const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1); + const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; + const dim3 block_nums(nrows_x, block_num_x, 1); + diag_mask_inf_f32<<>>(x, dst, ncols_x, rows_per_channel, n_past); +} + +static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int 
nrows_x, cudaStream_t stream) { + const dim3 block_dims(1, WARP_SIZE, 1); + const dim3 block_nums(nrows_x, 1, 1); + soft_max_f32<<>>(x, dst, ncols_x); +} + +// buffer pool for cuda +#define MAX_CUDA_BUFFERS 256 + +struct scoped_spin_lock { + std::atomic_flag& lock; + scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { + while (lock.test_and_set(std::memory_order_acquire)) { + ; // spin + } + } + ~scoped_spin_lock() { + lock.clear(std::memory_order_release); + } + scoped_spin_lock(const scoped_spin_lock&) = delete; + scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; +}; + +struct cuda_buffer { + void * ptr = nullptr; + size_t size = 0; +}; + +static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS]; +static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; + +static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { + scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK(cudaGetDevice(&id)); +#ifdef DEBUG_CUDA_MALLOC + int nnz = 0; + size_t max_size = 0, tot_size = 0; +#endif + size_t best_diff = 1ull << 36; + int ibest = -1; + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { + cuda_buffer& b = g_cuda_buffer_pool[id][i]; + if (b.ptr != nullptr) { +#ifdef DEBUG_CUDA_MALLOC + ++nnz; + tot_size += b.size; + if (b.size > max_size) max_size = b.size; +#endif + if (b.size >= size) { + size_t diff = b.size - size; + if (diff < best_diff) { + best_diff = diff; + ibest = i; + if (!best_diff) { + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + } + } + } + } + if (ibest >= 0) { + cuda_buffer& b = g_cuda_buffer_pool[id][ibest]; + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } +#ifdef DEBUG_CUDA_MALLOC + fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz, + (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024)); +#endif + void * ptr; + size_t look_ahead_size = (size_t) (1.05 * size); + look_ahead_size = 256 * ((look_ahead_size + 255)/256); + CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size)); + *actual_size = look_ahead_size; + return ptr; +} + +static void ggml_cuda_pool_free(void * ptr, size_t size) { + scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { + cuda_buffer& b = g_cuda_buffer_pool[id][i]; + if (b.ptr == nullptr) { + b.ptr = ptr; + b.size = size; + return; + } + } + fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); + CUDA_CHECK(cudaFree(ptr)); +} + + +void ggml_init_cublas() { + static bool initialized = false; + + if (!initialized) { + +#ifdef __HIP_PLATFORM_AMD__ + // Workaround for a rocBLAS bug when using multiple graphics cards: + // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 + rocblas_initialize(); + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + + CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); + GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); + int64_t total_vram = 0; + fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); + for (int id = 0; id < g_device_count; ++id) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); + fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); + + g_tensor_split[id] = total_vram; + total_vram += prop.totalGlobalMem; + + g_compute_capabilities[id] = 100*prop.major 
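/* The compute capability is stored as a single integer, 100*major + 10*minor,
   e.g. a CC 8.6 device is recorded as 860 and a CC 6.1 device as 610. The
   quantized mat-mul launchers above compare this value against CC_TURING and
   MIN_CC_DP4A to choose between the *_AMPERE and *_PASCAL tile sizes. */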
+ 10*prop.minor; + } + for (int id = 0; id < g_device_count; ++id) { + g_tensor_split[id] /= total_vram; + } + + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(cudaSetDevice(id)); + + // create main stream + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking)); + + // create cublas handle + CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id])); + CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH)); + } + + // configure logging to stdout + // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); + + initialized = true; + } +} + +void ggml_cuda_set_tensor_split(const float * tensor_split) { + if (tensor_split == nullptr) { + return; + } + bool all_zero = true; + for (int i = 0; i < g_device_count; ++i) { + if (tensor_split[i] != 0.0f) { + all_zero = false; + break; + } + } + if (all_zero) { + return; + } + float split_sum = 0.0f; + for (int i = 0; i < g_device_count; ++i) { + g_tensor_split[i] = split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < g_device_count; ++i) { + g_tensor_split[i] /= split_sum; + } +} + +void * ggml_cuda_host_malloc(size_t size) { + if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { + return nullptr; + } + + void * ptr = nullptr; + cudaError_t err = cudaMallocHost((void **) &ptr, size); + if (err != cudaSuccess) { + // The allocation error can be bypassed. A null ptr will assigned out of this function. + // This can fixed the OOM error in WSL. + cudaGetLastError(); + fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + size/1024.0/1024.0, cudaGetErrorString(err)); + return nullptr; + } + + return ptr; +} + +void ggml_cuda_host_free(void * ptr) { + CUDA_CHECK(cudaFreeHost(ptr)); +} + +static cudaError_t ggml_cuda_cpy_tensor_2d( + void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) { + + cudaMemcpyKind kind; + char * src_ptr; + if (src->backend == GGML_BACKEND_CPU) { + kind = cudaMemcpyHostToDevice; + src_ptr = (char *) src->data; + } else if (src->backend == GGML_BACKEND_GPU) { + kind = cudaMemcpyDeviceToDevice; + struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + src_ptr = (char *) extra->data_device[id]; + } else { + GGML_ASSERT(false); + } + char * dst_ptr = (char *) dst; + + const int64_t ne0 = src->ne[0]; + const int64_t nb0 = src->nb[0]; + const int64_t nb1 = src->nb[1]; + const int64_t nb2 = src->nb[2]; + const int64_t nb3 = src->nb[3]; + const enum ggml_type type = src->type; + const int64_t ts = ggml_type_size(type); + const int64_t bs = ggml_blck_size(type); + int64_t i1_diff = i1_high - i1_low; + + const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3; + if (nb0 == ts && nb1 == ts*ne0/bs) { + return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream); + } else if (nb0 == ts) { + return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream); + } else { + for (int64_t i1 = 0; i1 < i1_diff; i1++) { + const void * rx = (const void *) ((const char *) x + i1*nb1); + void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); + // pretend the row is a matrix with cols=1 + cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream); + if (r != cudaSuccess) return r; + } + return cudaSuccess; + } +} + +inline void ggml_cuda_op_add( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * 
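/* Naming convention shared by all ggml_cuda_op_* callbacks (cf. the
   "dd = data device" comment further down): *_ddq_i points at quantized device
   data, *_ddf_i at f32 device data, and [i01_low, i01_high) is the slice of
   src0 rows assigned to the current device for this iteration. */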
dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + // compute + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main); + } else { + GGML_ASSERT(false); + } + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_mul( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); + + (void) dst; + (void) src0_ddq_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_gelu( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_silu( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_norm( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_rms_norm( + const ggml_tensor * src0, 
const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + // compute + rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_mul_mat_q( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddq_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t i01_diff = i01_high - i01_low; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into + const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff; + + const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ? + ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; + size_t as; + void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as); + quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main); + + switch (src0->type) { + case GGML_TYPE_Q4_0: + ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q4_1: + ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q5_0: + ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q5_1: + ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q8_0: + ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q2_K: + ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q3_K: + ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q4_K: + ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q5_K: + ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + break; + case GGML_TYPE_Q6_K: + ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, 
nrows_dst, cudaStream_main); + break; + default: + GGML_ASSERT(false); + break; + } + + ggml_cuda_pool_free(src1_q8_1, as); + + (void) src1; + (void) dst; + (void) src0_ddf_i; + (void) i02; + (void) i1; +} + +static int64_t get_row_rounding(ggml_type type) { + int max_compute_capability = INT_MIN; + for (int id = 0; id < g_device_count; ++id) { + if (max_compute_capability < g_compute_capabilities[id] + && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + max_compute_capability = g_compute_capabilities[id]; + } + } + + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return max_compute_capability >= CC_TURING ? 128 : 64; + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 64; + case GGML_TYPE_F16: + return 1; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return max_compute_capability >= CC_TURING ? 128 : 64; + case GGML_TYPE_Q6_K: + return 64; + default: + GGML_ASSERT(false); + } +} + +inline void ggml_cuda_op_mul_mat_vec( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddq_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = i01_high - i01_low; + +#ifdef GGML_CUDA_FORCE_DMMV + const bool use_mul_mat_vec_q = false; + (void) g_compute_capabilities[0]; +#else + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + bool mul_mat_vec_q_implemented = + src0->type == GGML_TYPE_Q4_0 || + src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || + src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0; +#if QK_K == 256 + mul_mat_vec_q_implemented = mul_mat_vec_q_implemented || + src0->type == GGML_TYPE_Q2_K || + src0->type == GGML_TYPE_Q3_K || + src0->type == GGML_TYPE_Q4_K || + src0->type == GGML_TYPE_Q5_K || + src0->type == GGML_TYPE_Q6_K; +#endif // QK_K == 256 + + const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented; +#endif + + if (use_mul_mat_vec_q) { + const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ? 
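/* The ternary below rounds ne00 up to the next multiple of MATRIX_ROW_PADDING,
   so the q8_1 quantization buffer covers whole blocks and the mat-vec kernels
   do not read out of bounds (see the row-padding comment later in this file).
   Illustrative arithmetic, assuming a padding of 512: ne00 = 1000 gives
   1000 - 488 + 512 = 1024, while ne00 = 1024 is left unchanged. */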
+ ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; + size_t as; + void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as); + quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main); + + switch (src0->type) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + default: + GGML_ASSERT(false); + break; + } + + ggml_cuda_pool_free(src1_q8_1, as); + } else { + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_CUDA_F16 + size_t ash; + dfloat * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); + ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00, + ne00, 1, sizeof(float), 0, 0, + ne00, 1, sizeof(half), 0, 0, cudaStream_main); + } +#else + dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_CUDA_F16 + + switch (src0->type) { + case GGML_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_K: + 
dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + default: + GGML_ASSERT(false); + break; + } + +#ifdef GGML_CUDA_F16 + if (src1_convert_f16) { + ggml_cuda_pool_free(src1_dfloat, ash); + } +#endif // GGML_CUDA_F16 + } + + (void) src1; + (void) dst; + (void) src0_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_mul_mat_cublas( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const float alpha = 1.0f; + const float beta = 0.0f; + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int64_t ne0 = dst->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + // the main device has a larger memory buffer to hold the results from all GPUs + // ldc == nrows of the matrix that cuBLAS writes into + int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main)); + CUBLAS_CHECK( + cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + i01_diff, ne11, ne10, + &alpha, src0_ddf_i, ne00, + src1_ddf_i, ne10, + &beta, dst_ddf_i, ldc)); + + (void) dst; + (void) src0_ddq_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_rope( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t i01_diff = i01_high - i01_low; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + // RoPE alteration for extended context + + float freq_base, freq_scale; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float p0 = (((mode & 1) == 0 ? 
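/* RoPE parameterisation: theta_scale = freq_base^(-2/n_dims) is the ratio
   between the rotation frequencies of consecutive dimension pairs, so pair i is
   rotated by an angle proportional to position * freq_base^(-2*i/n_dims)
   (the standard RoPE schedule). p0 is the starting position for this slice:
   n_past scaled by freq_scale in the usual mode, 0 otherwise. */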
n_past : 0)) * freq_scale; + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + // compute + if (is_glm) { + rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, n_ctx, cudaStream_main); + } else if (is_neox) { + GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"); + rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main); + } else { + rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main); + } + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_alibi( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t i01_diff = i01_high - i01_low; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + GGML_ASSERT(ne01 + n_past == ne00); + GGML_ASSERT(n_head == ne02); + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + // compute + alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main); + + (void) src1; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_diag_mask_inf( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t i01_diff = i01_high - i01_low; + + const int n_past = ((int32_t *) dst->op_params)[0]; + + // compute + diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_soft_max( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_scale( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & 
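/* ggml_cuda_op_scale below multiplies every element of the src0 slice by a
   single scalar, read on the host from ((float *) src1->data)[0]; src1 stays
   on the host for this op (see src1_stays_on_host in ggml_cuda_op). */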
cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const float scale = ((float *) src1->data)[0]; + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + const int64_t nrows0 = ggml_nrows(src0); + + const bool use_src1 = src1 != nullptr; + const int64_t ne10 = use_src1 ? src1->ne[0] : 1; + const int64_t ne11 = use_src1 ? src1->ne[1] : 1; + const int64_t ne12 = use_src1 ? src1->ne[2] : 1; + const int64_t ne13 = use_src1 ? src1->ne[3] : 1; + const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; + + GGML_ASSERT(ne03 == ne13); + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); + + // strides for iteration over dims 3 and 2 + const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13; + const int64_t num_iters = flatten_rows ? 1 : num_iters_0; + const int64_t stride_mod = flatten_rows ? num_iters_0 : 1; + const int64_t src0_stride = ne00 * ne01 * stride_mod; + const int64_t src1_stride = ne10 * ne11 * stride_mod; + const int64_t dst_stride = ne0 * ne1 * stride_mod; + + const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01; + const int64_t i03_max = flatten_rows ? 1 : ne03; + const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12); + const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02; + GGML_ASSERT(!(flatten_rows && ne02 < ne12)); + + const size_t src0_ts = ggml_type_size(src0->type); + const size_t src0_bs = ggml_blck_size(src0->type); + + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? 
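/* flatten_rows recap: when it is true, the loops over dims 2 and 3 collapse to
   a single iteration (i03_max = i02_max = 1) and the per-iteration strides are
   multiplied by num_iters_0, so the whole tensor is processed as one 2D slab of
   nrows0 rows; when it is false, each (i03, i02) plane of ne01 rows is handled
   separately. */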
(ggml_tensor_extra_gpu *) src1->extra : nullptr; + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_is_contiguous = ggml_is_contiguous(src0); + const bool src0_is_f32 = src0->type == GGML_TYPE_F32; + + const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1); + const bool src1_stays_on_host = use_src1 && ( + dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE); + + const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + GGML_ASSERT(!(split && ne02 < ne12)); + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); + + // dd = data device + char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized + float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float + float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; + float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; + + // asq = actual size quantized, asf = actual size float + size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; + size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0}; + + // if multiple devices are used they need to wait for the main device + // here an event is recorded that signifies that the main device has finished calculating the input data + if (split && g_device_count > 1) { + CUDA_CHECK(cudaSetDevice(g_main_device)); + CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device])); + } + + for (int id = 0; id < g_device_count; ++id) { + if (!split && id != g_main_device) { + continue; + } + + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + + int64_t row_low, row_high; + if (split) { + const int64_t rounding = get_row_rounding(src0->type); + + row_low = id == 0 ? 
0 : nrows0*g_tensor_split[id]; + row_low -= row_low % rounding; + + if (id == g_device_count - 1) { + row_high = nrows0; + } else { + row_high = nrows0*g_tensor_split[id + 1]; + row_high -= row_high % rounding; + } + } else { + row_low = 0; + row_high = nrows0*i02_divisor; + } + if (row_low == row_high) { + continue; + } + + int64_t row_diff = row_high - row_low; + + cudaSetDevice(id); + cudaStream_t cudaStream_main = g_cudaStreams_main[id]; + + // wait for main GPU data if necessary + if (split && id != g_main_device) { + CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device])); + } + + if (src0_on_device && src0_is_contiguous) { + if (src0_is_f32) { + src0_ddf[id] = (float *) src0_extra->data_device[id]; + } else { + src0_ddq[id] = (char *) src0_extra->data_device[id]; + } + } else { + if (src0_is_f32) { + src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]); + } else { + src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]); + } + } + + if (src0_needs_f32 && !src0_is_f32) { + src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]); + } + + if (use_src1 && !src1_stays_on_host) { + if (src1_on_device && src1_is_contiguous) { + src1_ddf[id] = (float *) src1_extra->data_device[id]; + } else { + src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]); + } + } + if (dst_on_device) { + dst_ddf[id] = (float *) dst_extra->data_device[id]; + } else { + size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float); + dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]); + } + + for (int64_t i03 = 0; i03 < i03_max; i03++) { + const int64_t i13 = i03 % ne13; + for (int64_t i02 = 0; i02 < i02_max; i02++) { + const int64_t i12 = i02 % ne12; + + const int64_t i0 = i03*i02_max + i02; + + // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs + const int64_t i0_offset_low = row_low/rows_per_iter; + const int64_t i0_offset_high = row_high/rows_per_iter; + + int64_t i01_low = 0; + int64_t i01_high = rows_per_iter; + if (split) { + if (i0 < i0_offset_low || i0 > i0_offset_high) { + continue; + } + if (i0 == i0_offset_low) { + i01_low = row_low % rows_per_iter; + } + if (i0 == i0_offset_high) { + i01_high = row_high % rows_per_iter; + } + } + + // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables. + // Removing the first assert or changing the order of the arguments causes the second assert to fail. + // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output. + // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU). 
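/* Worked example of the split row ranges computed above (illustrative values,
   not from the source): with 2 devices, g_tensor_split = {0.0f, 0.6f},
   nrows0 = 4096 and rounding = 64, device 0 covers rows [0, 2432) and device 1
   covers rows [2432, 4096): 4096 * 0.6 = 2457 (truncated) is rounded down to
   the nearest multiple of 64, giving 2432, and the last device always ends at
   nrows0. */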
+ GGML_ASSERT(i01_low == 0 || g_device_count > 1); + GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1); + + const int64_t i01_diff = i01_high - i01_low; + if (i01_diff == 0) { + continue; + } + const int64_t i11 = i13*ne12 + i12; + + // for split tensors the data begins at i0 == i0_offset_low + char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs; + float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride; + float * src1_ddf_i = src1_ddf[id] + i11*src1_stride; + float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride; + + // for split tensors the data pointer needs to be rounded down + // to the bin edge for i03, i02 bins beyond the first + if (i0 - i0_offset_low > 0) { + GGML_ASSERT(!flatten_rows); + src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs; + src0_ddf_i -= (row_low % ne01)*ne00; + dst_ddf_i -= (row_low % ne0)*ne1; + } + + // the main device memory buffer can be on VRAM scratch, with space for all partial results + // in that case an offset on dst_ddf_i is needed + if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) { + dst_ddf_i += i01_low; // offset is 0 if no tensor split + } + + // copy src0, src1 to device if necessary + if (use_src1 && !src1_stays_on_host) { + if (src1->backend == GGML_BACKEND_CPU) { + GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1)); + int64_t nrows1 = flatten_rows ? nrows0 : ne11; + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main)); + } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (id != g_main_device) { + GGML_ASSERT(!flatten_rows); + float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; + src1_ddf_i_source += i11*src1_stride; + CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float), + cudaMemcpyDeviceToDevice, cudaStream_main)); + } + } else if (src1_on_device && !src1_is_contiguous) { + GGML_ASSERT(!split); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main)); + } else { + GGML_ASSERT(false); + } + } + + if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { + if (src0_is_f32) { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main)); + } else { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main)); + } + } + + // convert src0 to f32 if it is necessary for the ggml_cuda_op + if (src0_needs_f32 && !src0_is_f32) { + to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); + } + + // do the computation + op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); + + // copy dst to host or other device if necessary + if (!dst_on_device) { + void * dst_off_device; + cudaMemcpyKind kind; + if (dst->backend == GGML_BACKEND_CPU) { + dst_off_device = dst->data; + kind = cudaMemcpyDeviceToHost; + } else if (dst->backend == GGML_BACKEND_GPU) { + dst_off_device = dst_extra->data_device[g_main_device]; + kind = cudaMemcpyDeviceToDevice; + } else { + GGML_ASSERT(false); + } + if (split) { + // src0 = weight matrix is saved as a transposed matrix for better memory layout. + // dst is NOT transposed. + // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. 
+ // Instead they need to be copied to the correct slice in ne0 = dst row index. + // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. + float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float), + i01_diff*sizeof(float), ne1, kind, cudaStream_main)); + } else { + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main)); + } + } + + // signify to main device that other device is done + if (split && g_device_count > 1 && id != g_main_device) { + CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main)); + } + } + } + } + + // wait until each device is finished, then free their buffers + for (int id = 0; id < g_device_count; ++id) { + if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) { + continue; + } + + CUDA_CHECK(cudaSetDevice(id)); + + if (src0_asq[id] > 0) { + ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]); + } + if (src0_asf[id] > 0) { + ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]); + } + if (src1_asf[id] > 0) { + ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]); + } + if (dst_asf[id] > 0) { + ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]); + } + } + + // main device waits for all other devices to be finished + if (split && g_device_count > 1) { + CUDA_CHECK(cudaSetDevice(g_main_device)); + for (int id = 0; id < g_device_count; ++id) { + if (id != g_main_device && src0_extra->events[id]) { + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id])); + } + } + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(cudaSetDevice(g_main_device)); + CUDA_CHECK(cudaDeviceSynchronize()); + } +} + +void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op. + // Due to flatten_rows == true this does in practice not make a difference however. + // Better solution would be nice but right now that would require disproportionate changes. 
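/* The two trailing booleans passed to ggml_cuda_op here and in the wrappers
   below are src0_needs_f32 (dequantize/convert src0 to f32 before invoking the
   op) and flatten_rows (process the whole tensor as a single 2D slab, see
   ggml_cuda_op above). ggml_cuda_add passes false/true because add_f32_cuda and
   add_f16_f32_f16_cuda consume src0 in its stored type and the op is purely
   element-wise. */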
+ GGML_ASSERT( + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && + src1->type == GGML_TYPE_F32 && + (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16)); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true); +} + +void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten +} + +void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true); +} + +void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true); +} + +void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true); +} + +void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); +} + +bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + return true; + } + + return false; +} + +void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ + GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation + GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t ne12 = src1->ne[2]; + + CUDA_CHECK(cudaSetDevice(g_main_device)); + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; + + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main); +} + +void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ + GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)); 
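/* This path handles an f16 src0 that is non-contiguous but not permuted (for
   example a strided view): the kernel runs one block per (row, channel) pair
   and receives explicit row and channel strides, computed below from nb[1] and
   nb[2] in units of half. */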
+ GGML_ASSERT(!ggml_is_permuted(src0)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t ne12 = src1->ne[2]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + CUDA_CHECK(cudaSetDevice(g_main_device)); + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; + + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + const int row_stride_x = nb01 / sizeof(half); + const int channel_stride_x = nb02 / sizeof(half); + + ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main); +} + +void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && + src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; + + if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + ggml_cuda_mul_mat_vec_p021(src0, src1, dst); + } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { + ggml_cuda_mul_mat_vec_nc(src0, src1, dst); + }else if (src0->type == GGML_TYPE_F32) { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); + } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { + if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false); + } else { + int min_compute_capability = INT_MAX; + for (int id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_compute_capabilities[id] + && g_tensor_split[id] < (id + 1 < g_device_count ? 
g_tensor_split[id + 1] : 1.0f)) { + min_compute_capability = g_compute_capabilities[id]; + } + } + + if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false); + } else { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); + } + } + } else { + GGML_ASSERT(false); + } +} + +void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true); +} + +void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne == ggml_nelements(src1)); + + GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + + GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); + GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + GGML_ASSERT(src0->ne[3] == 1); + + const int64_t nb00 = src0->nb[0]; + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_ASSERT(src1->ne[3] == 1); + + const int64_t nb10 = src1->nb[0]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + + CUDA_CHECK(cudaSetDevice(g_main_device)); + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; + + const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; + + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, cudaStream_main); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, cudaStream_main); + } else { + GGML_ASSERT(false); + } + + (void) dst; +} + +void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_cpy(src0, dst, nullptr); + (void) src1; +} + +void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true); +} + +void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true); +} + +void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented + + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, true); +} + +void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, 
ggml_cuda_op_alibi, true, true); +} + +void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; +} + +void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { + int nrows = ggml_nrows(tensor); + + const int64_t ne0 = tensor->ne[0]; + + const size_t nb1 = tensor->nb[1]; + + ggml_backend backend = tensor->backend; + struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); + + for (int id = 0; id < g_device_count; ++id) { + if (backend == GGML_BACKEND_GPU && id != g_main_device) { + continue; + } + + cudaSetDevice(id); + + int row_low, row_high; + if (backend == GGML_BACKEND_GPU) { + row_low = 0; + row_high = nrows; + } else if (backend == GGML_BACKEND_GPU_SPLIT) { + const int64_t rounding = get_row_rounding(tensor->type); + + row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; + row_low -= row_low % rounding; + + if (id == g_device_count - 1) { + row_high = nrows; + } else { + row_high = nrows*g_tensor_split[id + 1]; + row_high -= row_high % rounding; + } + } else { + GGML_ASSERT(false); + } + if (row_low == row_high) { + continue; + } + + int64_t nrows_split = row_high - row_low; + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } + + char * buf; + CUDA_CHECK(cudaMalloc(&buf, size)); + char * buf_host = (char*)data + offset_split; + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + } + + + CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); + + extra->data_device[id] = buf; + + if (backend == GGML_BACKEND_GPU_SPLIT) { + CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming)); + } + } + + tensor->extra = extra; +} + +void ggml_cuda_free_data(struct ggml_tensor * tensor) { + if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { + return; + } + + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + + for (int id = 0; id < g_device_count; ++id) { + if (extra->data_device[id] != nullptr) { + CUDA_CHECK(cudaSetDevice(id)); + CUDA_CHECK(cudaFree(extra->data_device[id])); + } + + if (extra->events[id] != nullptr) { + CUDA_CHECK(cudaSetDevice(id)); + CUDA_CHECK(cudaEventDestroy(extra->events[id])); + } + } + + delete extra; +} + +static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; +static size_t g_temp_tensor_extra_index = 0; + +static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (g_temp_tensor_extras == nullptr) { + g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; + } + + size_t alloc_index = g_temp_tensor_extra_index; + g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES; + struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; +} + +void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { + if (scratch && g_scratch_size == 0) { + return; + } + + // recursively assign CUDA buffers 
until a compute tensor is found + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { + const ggml_op src0_op = tensor->src[0]->op; + if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { + ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); + } + } + if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { + ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); + } + + tensor->backend = GGML_BACKEND_GPU; + + if (scratch && no_alloc) { + return; + } + + struct ggml_tensor_extra_gpu * extra; + + const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || + tensor->op == GGML_OP_VIEW || + force_inplace; + const size_t size = ggml_nbytes(tensor); + + CUDA_CHECK(cudaSetDevice(g_main_device)); + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t offset = 0; + if (tensor->op == GGML_OP_VIEW) { + memcpy(&offset, tensor->op_params, sizeof(size_t)); + } + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = src0_ddc + offset; + } else if (tensor->op == GGML_OP_CPY) { + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; + void * src1_ddv = src1_extra->data_device[g_main_device]; + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = src1_ddv; + } else if (scratch) { + GGML_ASSERT(size <= g_scratch_size); + if (g_scratch_offset + size > g_scratch_size) { + g_scratch_offset = 0; + } + + char * data = (char *) g_scratch_buffer; + if (data == nullptr) { + CUDA_CHECK(cudaMalloc(&data, g_scratch_size)); + g_scratch_buffer = data; + } + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = data + g_scratch_offset; + + g_scratch_offset += size; + + GGML_ASSERT(g_scratch_offset <= g_scratch_size); + } else { // allocate new buffers outside of scratch + void * data; + CUDA_CHECK(cudaMalloc(&data, size)); + CUDA_CHECK(cudaMemset(data, 0, size)); + extra = new ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); + extra->data_device[g_main_device] = data; + } + + tensor->extra = extra; +} + +void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) { + if (g_scratch_size == 0) { + return; + } + if (g_scratch_buffer == nullptr) { + CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); + } + + struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); + + const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || + tensor->op == GGML_OP_VIEW; + + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t view_offset = 0; + if (tensor->op == GGML_OP_VIEW) { + memcpy(&view_offset, tensor->op_params, sizeof(size_t)); + } + extra->data_device[g_main_device] = src0_ddc + view_offset; + } else { + extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset; + } + + tensor->extra = extra; +} + +void ggml_cuda_assign_buffers(struct 
ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, true, false, false); +} + +void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, true, false, true); +} + +void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, false, false, false); +} + +void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, false, true, false); +} + +void ggml_cuda_set_main_device(int main_device) { + if (main_device >= g_device_count) { + fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", + main_device, g_device_count, g_main_device); + return; + } + g_main_device = main_device; + if (g_device_count > 1) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); + fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); + } +} + +void ggml_cuda_set_mul_mat_q(bool mul_mat_q) { + g_mul_mat_q = mul_mat_q; +} + +void ggml_cuda_set_scratch_size(size_t scratch_size) { + g_scratch_size = scratch_size; +} + +void ggml_cuda_free_scratch() { + if (g_scratch_buffer == nullptr) { + return; + } + + CUDA_CHECK(cudaFree(g_scratch_buffer)); + g_scratch_buffer = nullptr; +} + +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ + ggml_cuda_func_t func; + const bool any_on_device = tensor->backend == GGML_BACKEND_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + + switch (tensor->op) { + case GGML_OP_DUP: + if (!any_on_device) { + return false; + } + func = ggml_cuda_dup; + break; + case GGML_OP_ADD: + if (!any_on_device) { + return false; + } + func = ggml_cuda_add; + break; + case GGML_OP_MUL: + if (!any_on_device) { + return false; + } + func = ggml_cuda_mul; + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_GELU: + if (!any_on_device) { + return false; + } + func = ggml_cuda_gelu; + break; + case GGML_UNARY_OP_SILU: + if (!any_on_device) { + return false; + } + func = ggml_cuda_silu; + break; + default: + return false; + } break; + case GGML_OP_NORM: + if (!any_on_device) { + return false; + } + func = ggml_cuda_norm; + break; + case GGML_OP_RMS_NORM: + if (!any_on_device) { + return false; + } + func = ggml_cuda_rms_norm; + break; + case GGML_OP_MUL_MAT: + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { + return false; + } + func = ggml_cuda_mul_mat; + break; + case GGML_OP_SCALE: + if (!any_on_device) { + return false; + } + func = ggml_cuda_scale; + break; + case GGML_OP_CPY: + if (!any_on_device) { + return false; + } + func = ggml_cuda_cpy; + break; + case GGML_OP_CONT: + if (!any_on_device) { + return false; + } + func = ggml_cuda_dup; + break; + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + if (!any_on_device) { + return false; + } + func = ggml_cuda_nop; + break; + case GGML_OP_DIAG_MASK_INF: + if (!any_on_device) { + return false; + } + func = ggml_cuda_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + if (!any_on_device) { + return false; + } + func = ggml_cuda_soft_max; + break; + case GGML_OP_ROPE: + if (!any_on_device) { + return false; + } + func = 
ggml_cuda_rope; + break; + case GGML_OP_ALIBI: + if (!any_on_device) { + return false; + } + func = ggml_cuda_alibi; + break; + default: + return false; + } + + if (params->ith != 0) { + return true; + } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return true; + } + func(tensor->src[0], tensor->src[1], tensor); + return true; +} + +int ggml_cuda_get_device_count() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + return device_count; +} + +void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + snprintf(description, description_size, "%s", prop.name); +} diff --git a/seamless_communication/ggml/src/ggml-cuda.h b/seamless_communication/ggml/src/ggml-cuda.h new file mode 100644 index 0000000..a72e820 --- /dev/null +++ b/seamless_communication/ggml/src/ggml-cuda.h @@ -0,0 +1,46 @@ +#pragma once + +#include "ggml.h" + +#ifdef GGML_USE_HIPBLAS +#define GGML_CUDA_NAME "ROCm" +#define GGML_CUBLAS_NAME "hipBLAS" +#else +#define GGML_CUDA_NAME "CUDA" +#define GGML_CUBLAS_NAME "cuBLAS" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_CUDA_MAX_DEVICES 16 + +GGML_API void ggml_init_cublas(void); +GGML_API void * ggml_cuda_host_malloc(size_t size); +GGML_API void ggml_cuda_host_free(void * ptr); + +GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split); +GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); +GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor); + +GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); + +GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); + +GGML_API void ggml_cuda_set_main_device(int main_device); +GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); +GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size); +GGML_API void ggml_cuda_free_scratch(void); +GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); + +GGML_API int ggml_cuda_get_device_count(void); +GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size); + +#ifdef __cplusplus +} +#endif diff --git a/seamless_communication/ggml/src/ggml-metal.h b/seamless_communication/ggml/src/ggml-metal.h new file mode 100644 index 0000000..fca28d3 --- /dev/null +++ b/seamless_communication/ggml/src/ggml-metal.h @@ -0,0 +1,85 @@ +// An interface allowing to compute ggml_cgraph with Metal +// +// This is a fully functional interface that extends ggml with GPU support for Apple devices. +// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.) +// +// How it works? +// +// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this +// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you +// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.) 
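+// As a rough usage sketch (illustrative only; "buf", "buf_size", "max_tensor_size",
+// "gf", "input" and "output" are placeholders, not names defined by this header):
+//
+//   struct ggml_metal_context * ctx = ggml_metal_init(1);           // 1 command buffer
+//   ggml_metal_add_buffer(ctx, "data", buf, buf_size, max_tensor_size); // map host memory
+//   ggml_metal_set_tensor(ctx, input);        // copy input data to the device
+//   ggml_metal_graph_compute(ctx, &gf);       // evaluate the ggml_cgraph on the GPU
+//   ggml_metal_get_tensor(ctx, output);       // copy the result back to host memory
+//   ggml_metal_free(ctx);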
+// +// You only need to make sure that all memory buffers that you used during the graph creation +// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is +// used during the graph evaluation to determine the arguments of the compute kernels. +// +// Synchronization between device and host memory (for example for input and output tensors) +// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions. +// + +#pragma once + +#include +#include + +// max memory buffers that can be mapped to the device +#define GGML_METAL_MAX_BUFFERS 16 +#define GGML_METAL_MAX_COMMAND_BUFFERS 32 + +struct ggml_tensor; +struct ggml_cgraph; + +#ifdef __cplusplus +extern "C" { +#endif + +struct ggml_metal_context; + +// number of command buffers to use +struct ggml_metal_context * ggml_metal_init(int n_cb); +void ggml_metal_free(struct ggml_metal_context * ctx); + +void * ggml_metal_host_malloc(size_t n); +void ggml_metal_host_free (void * data); + +// set the number of command buffers to use +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); + +// creates a mapping between a host memory buffer and a device memory buffer +// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute +// - the mapping is used during computation to determine the arguments of the compute kernels +// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal +// - max_size specifies the maximum size of a tensor and is used to create shared views such +// that it is guaranteed that the tensor will fit in at least one of the views +// +bool ggml_metal_add_buffer( + struct ggml_metal_context * ctx, + const char * name, + void * data, + size_t size, + size_t max_size); + +// set data from host memory into the device +void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); + +// get data from the device into host memory +void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); + +// try to find operations that can be run concurrently in the graph +// you should run it again if the topology of your graph changes +void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem); + +// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized +int ggml_metal_if_optimized(struct ggml_metal_context * ctx); + +// output the concur_list for ggml_alloc +int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); + +// same as ggml_graph_compute but uses Metal +// creates gf->n_threads command buffers in parallel +void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); + +#ifdef __cplusplus +} +#endif + diff --git a/seamless_communication/ggml/src/ggml-metal.m b/seamless_communication/ggml/src/ggml-metal.m new file mode 100644 index 0000000..7e2355c --- /dev/null +++ b/seamless_communication/ggml/src/ggml-metal.m @@ -0,0 +1,1226 @@ +#import "ggml-metal.h" + +#import "ggml.h" + +#import + +#import + +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +// TODO: temporary - reuse llama.cpp logging +#ifdef GGML_METAL_NDEBUG +#define metal_printf(...) +#else +#define metal_printf(...) 
fprintf(stderr, __VA_ARGS__) +#endif + +#define UNUSED(x) (void)(x) + +#define GGML_MAX_CONCUR (2*GGML_MAX_NODES) + +struct ggml_metal_buffer { + const char * name; + + void * data; + size_t size; + + id metal; +}; + +struct ggml_metal_context { + int n_cb; + + id device; + id queue; + id library; + + id command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS]; + id command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS]; + + dispatch_queue_t d_queue; + + int n_buffers; + struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; + + int concur_list[GGML_MAX_CONCUR]; + int concur_list_len; + + // custom kernels +#define GGML_METAL_DECL_KERNEL(name) \ + id function_##name; \ + id pipeline_##name + + GGML_METAL_DECL_KERNEL(add); + GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast + GGML_METAL_DECL_KERNEL(mul); + GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast + GGML_METAL_DECL_KERNEL(scale); + GGML_METAL_DECL_KERNEL(silu); + GGML_METAL_DECL_KERNEL(relu); + GGML_METAL_DECL_KERNEL(gelu); + GGML_METAL_DECL_KERNEL(soft_max); + GGML_METAL_DECL_KERNEL(diag_mask_inf); + GGML_METAL_DECL_KERNEL(get_rows_f16); + GGML_METAL_DECL_KERNEL(get_rows_q4_0); + GGML_METAL_DECL_KERNEL(get_rows_q4_1); + GGML_METAL_DECL_KERNEL(get_rows_q8_0); + GGML_METAL_DECL_KERNEL(get_rows_q2_K); + GGML_METAL_DECL_KERNEL(get_rows_q3_K); + GGML_METAL_DECL_KERNEL(get_rows_q4_K); + GGML_METAL_DECL_KERNEL(get_rows_q5_K); + GGML_METAL_DECL_KERNEL(get_rows_q6_K); + GGML_METAL_DECL_KERNEL(rms_norm); + GGML_METAL_DECL_KERNEL(norm); + GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); + GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row); + GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_DECL_KERNEL(rope); + GGML_METAL_DECL_KERNEL(alibi_f32); + GGML_METAL_DECL_KERNEL(cpy_f32_f16); + GGML_METAL_DECL_KERNEL(cpy_f32_f32); + GGML_METAL_DECL_KERNEL(cpy_f16_f16); + +#undef GGML_METAL_DECL_KERNEL +}; + +// MSL code +// TODO: move the contents here when ready +// for now it is easier to work in a separate file +static NSString * const msl_library_source = @"see metal.metal"; + +// Here to assist with NSBundle Path Hack +@interface GGMLMetalClass : NSObject +@end +@implementation GGMLMetalClass +@end + +struct ggml_metal_context * ggml_metal_init(int n_cb) { + metal_printf("%s: allocating\n", __func__); + + // Show all the Metal device instances in the system + NSArray * devices = MTLCopyAllDevices(); + id device; + NSString * s; + for (device in devices) { + s = [device name]; + metal_printf("%s: found device: %s\n", __func__, [s UTF8String]); + } + + // Pick and show default Metal device + device = MTLCreateSystemDefaultDevice(); + s = [device name]; + metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]); + + // Configure 
context + struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + ctx->device = device; + ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); + ctx->queue = [ctx->device newCommandQueue]; + ctx->n_buffers = 0; + ctx->concur_list_len = 0; + + ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT); + +#if 0 + // compile from source string and show compile log + { + NSError * error = nil; + + ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; + if (error) { + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } + } +#else + UNUSED(msl_library_source); + + // read the source from "ggml-metal.metal" into a string and use newLibraryWithSource + { + NSError * error = nil; + + //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; + NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; + NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); + + NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; + if (error) { + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } + +#ifdef GGML_QKK_64 + MTLCompileOptions* options = [MTLCompileOptions new]; + options.preprocessorMacros = @{ @"QK_K" : @(64) }; + ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; +#else + ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; +#endif + if (error) { + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } + } +#endif + + // load kernels + { + NSError * error = nil; +#define GGML_METAL_ADD_KERNEL(name) \ + ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ + ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ + metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ + (int) ctx->pipeline_##name.threadExecutionWidth); \ + if (error) { \ + metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + return NULL; \ + } + + GGML_METAL_ADD_KERNEL(add); + GGML_METAL_ADD_KERNEL(add_row); + GGML_METAL_ADD_KERNEL(mul); + GGML_METAL_ADD_KERNEL(mul_row); + GGML_METAL_ADD_KERNEL(scale); + GGML_METAL_ADD_KERNEL(silu); + GGML_METAL_ADD_KERNEL(relu); + GGML_METAL_ADD_KERNEL(gelu); + GGML_METAL_ADD_KERNEL(soft_max); + GGML_METAL_ADD_KERNEL(diag_mask_inf); + GGML_METAL_ADD_KERNEL(get_rows_f16); + GGML_METAL_ADD_KERNEL(get_rows_q4_0); + GGML_METAL_ADD_KERNEL(get_rows_q4_1); + GGML_METAL_ADD_KERNEL(get_rows_q8_0); + GGML_METAL_ADD_KERNEL(get_rows_q2_K); + GGML_METAL_ADD_KERNEL(get_rows_q3_K); + GGML_METAL_ADD_KERNEL(get_rows_q4_K); + GGML_METAL_ADD_KERNEL(get_rows_q5_K); + GGML_METAL_ADD_KERNEL(get_rows_q6_K); + GGML_METAL_ADD_KERNEL(rms_norm); + GGML_METAL_ADD_KERNEL(norm); + GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row); + GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32); + 
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_ADD_KERNEL(rope); + GGML_METAL_ADD_KERNEL(alibi_f32); + GGML_METAL_ADD_KERNEL(cpy_f32_f16); + GGML_METAL_ADD_KERNEL(cpy_f32_f32); + GGML_METAL_ADD_KERNEL(cpy_f16_f16); + +#undef GGML_METAL_ADD_KERNEL + } + + metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + if (ctx->device.maxTransferRate != 0) { + metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + } else { + metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); + } + + return ctx; +} + +void ggml_metal_free(struct ggml_metal_context * ctx) { + metal_printf("%s: deallocating\n", __func__); +#define GGML_METAL_DEL_KERNEL(name) \ + [ctx->function_##name release]; \ + [ctx->pipeline_##name release]; + + GGML_METAL_DEL_KERNEL(add); + GGML_METAL_DEL_KERNEL(add_row); + GGML_METAL_DEL_KERNEL(mul); + GGML_METAL_DEL_KERNEL(mul_row); + GGML_METAL_DEL_KERNEL(scale); + GGML_METAL_DEL_KERNEL(silu); + GGML_METAL_DEL_KERNEL(relu); + GGML_METAL_DEL_KERNEL(gelu); + GGML_METAL_DEL_KERNEL(soft_max); + GGML_METAL_DEL_KERNEL(diag_mask_inf); + GGML_METAL_DEL_KERNEL(get_rows_f16); + GGML_METAL_DEL_KERNEL(get_rows_q4_0); + GGML_METAL_DEL_KERNEL(get_rows_q4_1); + GGML_METAL_DEL_KERNEL(get_rows_q8_0); + GGML_METAL_DEL_KERNEL(get_rows_q2_K); + GGML_METAL_DEL_KERNEL(get_rows_q3_K); + GGML_METAL_DEL_KERNEL(get_rows_q4_K); + GGML_METAL_DEL_KERNEL(get_rows_q5_K); + GGML_METAL_DEL_KERNEL(get_rows_q6_K); + GGML_METAL_DEL_KERNEL(rms_norm); + GGML_METAL_DEL_KERNEL(norm); + GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row); + GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_DEL_KERNEL(rope); + GGML_METAL_DEL_KERNEL(alibi_f32); + GGML_METAL_DEL_KERNEL(cpy_f32_f16); + GGML_METAL_DEL_KERNEL(cpy_f32_f32); + GGML_METAL_DEL_KERNEL(cpy_f16_f16); + +#undef GGML_METAL_DEL_KERNEL + + for (int i = 0; i < ctx->n_buffers; ++i) { + [ctx->buffers[i].metal release]; + } + + [ctx->library release]; + [ctx->queue release]; + [ctx->device release]; + + dispatch_release(ctx->d_queue); + + free(ctx); +} + +void * ggml_metal_host_malloc(size_t n) { + void * data = NULL; + const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + if (result != 0) { + 
metal_printf("%s: error: posix_memalign failed\n", __func__); + return NULL; + } + + return data; +} + +void ggml_metal_host_free(void * data) { + free(data); +} + +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { + ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); +} + +int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { + return ctx->concur_list_len; +} + +int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { + return ctx->concur_list; +} + +// finds the Metal buffer that contains the tensor data on the GPU device +// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the +// Metal buffer based on the host memory pointer +// +static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { + //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + + const int64_t tsize = ggml_nbytes(t); + + // find the view that contains the tensor fully + for (int i = 0; i < ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; + + if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { + *offs = (size_t) ioffs; + + //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + + return ctx->buffers[i].metal; + } + } + + metal_printf("%s: error: buffer is nil\n", __func__); + + return nil; +} + +bool ggml_metal_add_buffer( + struct ggml_metal_context * ctx, + const char * name, + void * data, + size_t size, + size_t max_size) { + if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { + metal_printf("%s: too many buffers\n", __func__); + return false; + } + + if (data) { + // verify that the buffer does not overlap with any of the existing buffers + for (int i = 0; i < ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; + + if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { + metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + return false; + } + } + + const size_t size_page = sysconf(_SC_PAGESIZE); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + // the buffer fits into the max buffer size allowed by the device + if (size_aligned <= ctx->device.maxBufferLength) { + ctx->buffers[ctx->n_buffers].name = name; + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; + + ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + return false; + } + + metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + + ++ctx->n_buffers; + } else { + // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into + // one of the views + const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case + const size_t size_step = ctx->device.maxBufferLength - size_ovlp; + const size_t size_view = ctx->device.maxBufferLength; + + for (size_t i = 0; i < size; i += 
size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].name = name; + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + + ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + return false; + } + + metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + if (i + size_step < size) { + metal_printf("\n"); + } + + ++ctx->n_buffers; + } + } + + metal_printf(", (%8.2f / %8.2f)", + ctx->device.currentAllocatedSize / 1024.0 / 1024.0, + ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + + if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { + metal_printf(", warning: current allocated size is greater than the recommended max working set size\n"); + } else { + metal_printf("\n"); + } + } + + return true; +} + +void ggml_metal_set_tensor( + struct ggml_metal_context * ctx, + struct ggml_tensor * t) { + size_t offs; + id id_dst = ggml_metal_get_buffer(ctx, t, &offs); + + memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t)); +} + +void ggml_metal_get_tensor( + struct ggml_metal_context * ctx, + struct ggml_tensor * t) { + size_t offs; + id id_src = ggml_metal_get_buffer(ctx, t, &offs); + + memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t)); +} + +void ggml_metal_graph_find_concurrency( + struct ggml_metal_context * ctx, + struct ggml_cgraph * gf, bool check_mem) { + int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time + int nodes_unused[GGML_MAX_CONCUR]; + + for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; } + for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; } + ctx->concur_list_len = 0; + + int n_left = gf->n_nodes; + int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list + int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos + + while (n_left > 0) { + // number of nodes at a layer (that can be issued concurrently) + int concurrency = 0; + for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) { + if (nodes_unused[i]) { + // if the requirements for gf->nodes[i] are satisfied + int exe_flag = 1; + + // scan all srcs + for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) { + struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind]; + if (src_cur) { + // if is leaf nodes it's satisfied. + // TODO: ggml_is_leaf() + if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) { + continue; + } + + // otherwise this src should be the output from previous nodes. + int is_found = 0; + + // scan 2*search_depth back because we inserted barrier. + //for (int j = ((level_pos - 2*search_depth) < 0 ? 
0 : (level_pos - 2*search_depth)); j < level_pos; j++) { + for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) { + if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) { + is_found = 1; + break; + } + } + if (is_found == 0) { + exe_flag = 0; + break; + } + } + } + if (exe_flag && check_mem) { + // check if nodes[i]'s data will be overwritten by a node before nodes[i]. + // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] + int64_t data_start = (int64_t) gf->nodes[i]->data; + int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]); + for (int j = n_start; j < i; j++) { + if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \ + && gf->nodes[j]->op != GGML_OP_VIEW \ + && gf->nodes[j]->op != GGML_OP_TRANSPOSE \ + && gf->nodes[j]->op != GGML_OP_PERMUTE) { + if (((int64_t)gf->nodes[j]->data) >= data_start + length || \ + ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) { + continue; + } + + exe_flag = 0; + } + } + } + if (exe_flag) { + ctx->concur_list[level_pos + concurrency] = i; + nodes_unused[i] = 0; + concurrency++; + ctx->concur_list_len++; + } + } + } + n_left -= concurrency; + // adding a barrier different layer + ctx->concur_list[level_pos + concurrency] = -1; + ctx->concur_list_len++; + // jump all sorted nodes at nodes_bak + while (!nodes_unused[n_start]) { + n_start++; + } + level_pos += concurrency + 1; + } + + if (ctx->concur_list_len > GGML_MAX_CONCUR) { + metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__); + } +} + +void ggml_metal_graph_compute( + struct ggml_metal_context * ctx, + struct ggml_cgraph * gf) { + @autoreleasepool { + + // if there is ctx->concur_list, dispatch concurrently + // else fallback to serial dispatch + MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; + + const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR; + + const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes; + edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial; + + // create multiple command buffers and enqueue them + // then, we encode the graph into the command buffers in parallel + + const int n_cb = ctx->n_cb; + + for (int i = 0; i < n_cb; ++i) { + ctx->command_buffers[i] = [ctx->queue commandBuffer]; + + // enqueue the command buffers in order to specify their execution order + [ctx->command_buffers[i] enqueue]; + + ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc]; + } + + for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { + const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; + + dispatch_async(ctx->d_queue, ^{ + size_t offs_src0 = 0; + size_t offs_src1 = 0; + size_t offs_dst = 0; + + id command_buffer = ctx->command_buffers[cb_idx]; + id encoder = ctx->command_encoders[cb_idx]; + + const int node_start = (cb_idx + 0) * n_nodes_per_cb; + const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes); + + for (int ind = node_start; ind < node_end; ++ind) { + const int i = has_concur ? 
ctx->concur_list[ind] : ind; + + if (i == -1) { + [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; + continue; + } + + //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + + struct ggml_tensor * src0 = gf->nodes[i]->src[0]; + struct ggml_tensor * src1 = gf->nodes[i]->src[1]; + struct ggml_tensor * dst = gf->nodes[i]; + + const int64_t ne00 = src0 ? src0->ne[0] : 0; + const int64_t ne01 = src0 ? src0->ne[1] : 0; + const int64_t ne02 = src0 ? src0->ne[2] : 0; + const int64_t ne03 = src0 ? src0->ne[3] : 0; + + const uint64_t nb00 = src0 ? src0->nb[0] : 0; + const uint64_t nb01 = src0 ? src0->nb[1] : 0; + const uint64_t nb02 = src0 ? src0->nb[2] : 0; + const uint64_t nb03 = src0 ? src0->nb[3] : 0; + + const int64_t ne10 = src1 ? src1->ne[0] : 0; + const int64_t ne11 = src1 ? src1->ne[1] : 0; + const int64_t ne12 = src1 ? src1->ne[2] : 0; + const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); + + const uint64_t nb10 = src1 ? src1->nb[0] : 0; + const uint64_t nb11 = src1 ? src1->nb[1] : 0; + const uint64_t nb12 = src1 ? src1->nb[2] : 0; + const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); + + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; + + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; + + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; + + id id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; + id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; + id id_dst = dst ? 
ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; + + //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //if (src0) { + // metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // ggml_is_contiguous(src0), src0->name); + //} + //if (src1) { + // metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // ggml_is_contiguous(src1), src1->name); + //} + //if (dst) { + // metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // dst->name); + //} + + switch (dst->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + // noop + } break; + case GGML_OP_ADD: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // utilize float4 + GGML_ASSERT(ne00 % 4 == 0); + const int64_t nb = ne00/4; + + if (ggml_nelements(src1) == ne10) { + // src1 is a row + [encoder setComputePipelineState:ctx->pipeline_add_row]; + } else { + [encoder setComputePipelineState:ctx->pipeline_add]; + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; + + const int64_t n = ggml_nelements(dst)/4; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_MUL: + { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // utilize float4 + GGML_ASSERT(ne00 % 4 == 0); + const int64_t nb = ne00/4; + + if (ggml_nelements(src1) == ne10) { + // src1 is a row + [encoder setComputePipelineState:ctx->pipeline_mul_row]; + } else { + [encoder setComputePipelineState:ctx->pipeline_mul]; + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; + + const int64_t n = ggml_nelements(dst)/4; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SCALE: + { + const float scale = *(const float *) src1->data; + + [encoder setComputePipelineState:ctx->pipeline_scale]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(gf->nodes[i])) { + case GGML_UNARY_OP_SILU: + { + [encoder setComputePipelineState:ctx->pipeline_silu]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_RELU: + { + [encoder setComputePipelineState:ctx->pipeline_relu]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_UNARY_OP_GELU: + { + [encoder 
setComputePipelineState:ctx->pipeline_gelu]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + default: + { + metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); + } + } break; + case GGML_OP_SOFT_MAX: + { + const int nth = 32; + + [encoder setComputePipelineState:ctx->pipeline_soft_max]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_DIAG_MASK_INF: + { + const int n_past = ((int32_t *)(dst->op_params))[0]; + + [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_MUL_MAT: + { + // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224 + + GGML_ASSERT(ne00 == ne10); + // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere + uint gqa = ne12/ne02; + GGML_ASSERT(ne03 == ne13); + + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1t == GGML_TYPE_F32 && + [ctx->device supportsFamily:MTLGPUFamilyApple7] && + ne00%32 == 0 && + ne11 > 1) { + switch (src0->type) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; + case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break; + case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break; + case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break; + case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break; + case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break; + case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break; + case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break; + case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break; + default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&nb01 length:sizeof(nb01) 
atIndex:5]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9]; + [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10]; + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } else { + int nth0 = 32; + int nth1 = 1; + + // use custom matrix x vector kernel + switch (src0t) { + case GGML_TYPE_F16: + { + nth0 = 32; + nth1 = 1; + if (ne11 * ne12 < 4) { + [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row]; + } else { + [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; + } + } break; + case GGML_TYPE_Q4_0: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; + } break; + case GGML_TYPE_Q4_1: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32]; + } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32]; + } break; + case GGML_TYPE_Q2_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32]; + } break; + case GGML_TYPE_Q3_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32]; + } break; + case GGML_TYPE_Q4_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 4; //1; + nth1 = 8; //32; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; + } break; + case GGML_TYPE_Q5_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32]; + } break; + case GGML_TYPE_Q6_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32]; + } break; + default: + { + metal_printf("Asserting on type %d\n",(int)src0t); + GGML_ASSERT(false && "not implemented"); + } + }; + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; + [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; + + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == 
GGML_TYPE_Q8_0 || + src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q3_K) { +#ifdef GGML_QKK_64 + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; +#else + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; +#endif + } + else if (src0t == GGML_TYPE_Q5_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q6_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else { + int64_t ny = (ne11 + 3)/4; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + } + } break; + case GGML_OP_GET_ROWS: + { + switch (src0->type) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; + case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; + case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; + case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break; + case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break; + case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break; + case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break; + case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break; + case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break; + default: GGML_ASSERT(false && "not implemented"); + } + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4]; + [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5]; + + const int64_t n = ggml_nelements(src1); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_RMS_NORM: + { + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + const int nth = 512; + + [encoder setComputePipelineState:ctx->pipeline_rms_norm]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_NORM: + { + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + const int nth = 256; + + [encoder setComputePipelineState:ctx->pipeline_norm]; + [encoder 
setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ALIBI: + { + GGML_ASSERT((src0t == GGML_TYPE_F32)); + + const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + if (__builtin_popcount(n_head) != 1) { + GGML_ASSERT(false && "only power-of-two n_head implemented"); + } + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + + [encoder setComputePipelineState:ctx->pipeline_alibi_f32]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&m0 length:sizeof( float) atIndex:18]; + + const int nth = 32; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ROPE: + { + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + + float freq_base; + float freq_scale; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + + [encoder setComputePipelineState:ctx->pipeline_rope]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) 
atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; + [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; + [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + } break; + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + { + const int nth = 32; + + switch (src0t) { + case GGML_TYPE_F32: + { + switch (dstt) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; + default: GGML_ASSERT(false && "not implemented"); + }; + } break; + case GGML_TYPE_F16: + { + switch (dstt) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break; + case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break; + default: GGML_ASSERT(false && "not implemented"); + }; + } break; + default: GGML_ASSERT(false && "not implemented"); + } + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + default: + { + metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); + } + } + } + + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + [command_buffer commit]; + }); + } + + // wait for all threads to finish + dispatch_barrier_sync(ctx->d_queue, ^{}); + + // check status of command buffers + // needed to detect if the device ran out-of-memory for example (#1881) + for (int i = 0; i < n_cb; i++) { + [ctx->command_buffers[i] waitUntilCompleted]; + + MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; + if (status != MTLCommandBufferStatusCompleted) { + metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status); 
+ GGML_ASSERT(false); + } + } + + } +} diff --git a/seamless_communication/ggml/src/ggml-metal.metal b/seamless_communication/ggml/src/ggml-metal.metal new file mode 100644 index 0000000..5070561 --- /dev/null +++ b/seamless_communication/ggml/src/ggml-metal.metal @@ -0,0 +1,2120 @@ +#include + +using namespace metal; + +#define MAX(x, y) ((x) > (y) ? (x) : (y)) + +#define QK4_0 32 +#define QR4_0 2 +typedef struct { + half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; + +#define QK4_1 32 +typedef struct { + half d; // delta + half m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; + +#define QK8_0 32 +typedef struct { + half d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; + +kernel void kernel_add( + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] + src1[tpig]; +} + +// assumption: src1 is a row +// broadcast src1 into src0 +kernel void kernel_add_row( + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant int64_t & nb, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] + src1[tpig % nb]; +} + +kernel void kernel_mul( + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * src1[tpig]; +} + +// assumption: src1 is a row +// broadcast src1 into src0 +kernel void kernel_mul_row( + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant int64_t & nb, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * src1[tpig % nb]; +} + +kernel void kernel_scale( + device const float * src0, + device float * dst, + constant float & scale, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * scale; +} + +kernel void kernel_silu( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + float x = src0[tpig]; + dst[tpig] = x / (1.0f + exp(-x)); +} + +kernel void kernel_relu( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = max(0.0f, src0[tpig]); +} + +constant float GELU_COEF_A = 0.044715f; +constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + +kernel void kernel_gelu( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + float x = src0[tpig]; + + // BEWARE !!! + // Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs! 
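// ---------------------------------------------------------------------------
// Context for the block_q4_0 / block_q4_1 / block_q8_0 layouts above: each
// block holds 32 weights. q4_0 packs two 4-bit quants per byte plus one
// per-block scale d, so a weight is recovered as d * (q - 8). A minimal CPU
// reference sketch of that layout (plain C++; a float scale stands in for the
// half used by the Metal structs, and the names are illustrative, not ggml API):
#include <cstdint>

struct cpu_block_q4_0 {
    float   d;       // per-block scale ("delta"); the Metal struct stores a half
    uint8_t qs[16];  // 32 4-bit quants, two per byte
};

// Low nibbles hold weights 0..15 of the block, high nibbles weights 16..31,
// mirroring ggml's CPU dequantization of q4_0.
inline void dequantize_block_q4_0_ref(const cpu_block_q4_0 &b, float out[32]) {
    for (int j = 0; j < 16; ++j) {
        out[j]      = b.d * (float)((b.qs[j] & 0x0F) - 8); // 4-bit value biased by 8
        out[j + 16] = b.d * (float)((b.qs[j] >> 4)   - 8);
    }
}
// q4_1 adds a per-block minimum m (w = d*q + m); q8_0 stores signed 8-bit
// quants directly (w = d * qs[j]).
// ---------------------------------------------------------------------------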
+ // This was observed with Falcon 7B and 40B models + // + dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); +} + +kernel void kernel_soft_max( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + threadgroup float * buf [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + // parallel max + buf[tpitg[0]] = -INFINITY; + for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + buf[tpitg[0]] = MAX(buf[tpitg[0]], psrc0[i00]); + } + + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg[0]/2; i > 0; i /= 2) { + if (tpitg[0] < i) { + buf[tpitg[0]] = MAX(buf[tpitg[0]], buf[tpitg[0] + i]); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of + // the loop, and when that is done, buf[0] has the correct (synchronized) value + //if (tpitg[0] == 0) { + // buf[0] = buf[0]; + //} + + //threadgroup_barrier(mem_flags::mem_threadgroup); + + const float max = buf[0]; + + // parallel sum + buf[tpitg[0]] = 0.0f; + for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + const float exp_psrc0 = exp(psrc0[i00] - max); + buf[tpitg[0]] += exp_psrc0; + // Remember the result of exp here. exp is expensive, so we really do not + // whish to compute it twice. + pdst[i00] = exp_psrc0; + } + + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg[0]/2; i > 0; i /= 2) { + if (tpitg[0] < i) { + buf[tpitg[0]] += buf[tpitg[0] + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + // broadcast - not needed, see above + //// broadcast + //if (tpitg[0] == 0) { + // buf[0] = buf[0]; + //} + + //threadgroup_barrier(mem_flags::mem_threadgroup); + + const float sum = buf[0]; + + for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + pdst[i00] /= sum; + } +} + +kernel void kernel_diag_mask_inf( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int & n_past, + uint3 tpig[[thread_position_in_grid]]) { + const int64_t i02 = tpig[2]; + const int64_t i01 = tpig[1]; + const int64_t i00 = tpig[0]; + + if (i00 > n_past + i01) { + dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY; + } else { + dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00]; + } +} + +kernel void kernel_norm( + device const void * src0, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant float & eps, + threadgroup float * sum [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01); + // MEAN + // parallel sum + sum[tpitg] = 0.0f; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + sum[tpitg] += x[i00]; + } + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg/2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + 
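// ---------------------------------------------------------------------------
// kernel_soft_max above is the standard numerically stable softmax: each
// threadgroup handles one row, finds the row max with a tree reduction,
// caches exp(x - max) in dst, reduces the sum, then normalizes. A serial C++
// sketch of the same math (hypothetical helper, not part of ggml):
#include <cmath>
#include <algorithm>

void softmax_row_ref(const float *x, float *y, int ne00) {
    float row_max = -INFINITY;
    for (int i = 0; i < ne00; ++i) row_max = std::max(row_max, x[i]); // parallel max in the kernel
    float sum = 0.0f;
    for (int i = 0; i < ne00; ++i) {
        y[i] = std::exp(x[i] - row_max); // cached so exp is evaluated only once per element
        sum += y[i];
    }
    for (int i = 0; i < ne00; ++i) y[i] /= sum; // final normalization pass
}
// ---------------------------------------------------------------------------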
const float mean = sum[0] / ne00; + + // recenter and VARIANCE + threadgroup_barrier(mem_flags::mem_threadgroup); + device float * y = dst + tgpig*ne00; + sum[tpitg] = 0.0f; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + y[i00] = x[i00] - mean; + sum[tpitg] += y[i00] * y[i00]; + } + + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg/2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + const float variance = sum[0] / ne00; + + const float scale = 1.0f/sqrt(variance + eps); + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + y[i00] = y[i00] * scale; + } +} + +kernel void kernel_rms_norm( + device const void * src0, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant float & eps, + threadgroup float * sum [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01); + device const float * x_scalar = (device const float *) x; + float4 sumf=0; + float all_sum=0; + + // parallel sum + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { + sumf += x[i00] * x[i00]; + } + all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3]; + all_sum = simd_sum(all_sum); + if (tiisg == 0) { + sum[sgitg] = all_sum; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + // broadcast, simd group number is ntg / 32 + for (uint i = ntg / 32 / 2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } + } + if (tpitg == 0) { + for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];} + sum[0] /= ne00; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + const float mean = sum[0]; + const float scale = 1.0f/sqrt(mean + eps); + + device float4 * y = (device float4 *) (dst + tgpig*ne00); + device float * y_scalar = (device float *) y; + for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { + y[i00] = x[i00] * scale; + } + if (tpitg == 0) { + for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;} + } +} + +// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q4 quants begin (0 or QK4_0/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + float2 acc = 0.f; + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2); + for (int i = 0; i < 8; i+=2) { + acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F) + + yl[i + 1] * (qs[i / 2] & 0x0F00); + acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0) + + yl[i + 9] * (qs[i / 2] & 0xF000); + } + return d * (sumy * -8.f + acc[0] + acc[1]); +} + +// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q4 quants begin (0 or QK4_0/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + float 
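// ---------------------------------------------------------------------------
// kernel_norm and kernel_rms_norm above are the usual layer-norm / RMS-norm
// reductions: norm recenters by the mean and divides by sqrt(variance + eps),
// while rms_norm skips the mean and scales by 1/sqrt(mean(x^2) + eps). A
// serial C++ sketch of the RMS-norm math (illustrative only):
#include <cmath>

void rms_norm_row_ref(const float *x, float *y, int ne00, float eps) {
    float sum_sq = 0.0f;
    for (int i = 0; i < ne00; ++i) sum_sq += x[i] * x[i]; // simd/threadgroup reduction in the kernel
    const float scale = 1.0f / std::sqrt(sum_sq / ne00 + eps);
    for (int i = 0; i < ne00; ++i) y[i] = x[i] * scale;
}
// ---------------------------------------------------------------------------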
m = qb_curr->m; + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2); + float2 acc = 0.f; + for (int i = 0; i < 8; i+=2) { + acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F) + + yl[i + 1] * (qs[i / 2] & 0x0F00); + acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0) + + yl[i + 9] * (qs[i / 2] & 0xF000); + } + return d * (acc[0] + acc[1]) + sumy * m; +} + +// putting them in the kernel cause a significant performance penalty +#define N_DST 4 // each SIMD group works on 4 rows +#define N_SIMDGROUP 2 // number of SIMD groups in a thread group +#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 +//Note: This is a template, but strictly speaking it only applies to +// quantizations where the block size is 32. It also does not +// giard against the number of rows not being divisible by +// N_DST, so this is another explicit assumption of the implementation. +template +void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst, + int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa, + uint3 tgpig, uint tiisg, uint sgitg) { + const int nb = ne00/QK4_0; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + const int first_row = (r0 * nsg + sgitg) * nr; + const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + device const block_q_type * x = (device const block_q_type *) src0 + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + float yl[16]; // src1 vector cache + float sumf[nr]={0.f}; + + const int ix = tiisg/2; + const int il = 8*(tiisg%2); + + device const float * yb = y + ix * QK4_0 + il; + + // each thread in a SIMD group deals with half a block. + for (int ib = ix; ib < nb; ib += nw/2) { + float sumy = 0; + for (int i = 0; i < 8; i += 2) { + sumy += yb[i] + yb[i+1]; + yl[i+0] = yb[i+ 0]; + yl[i+1] = yb[i+ 1]/256.f; + sumy += yb[i+16] + yb[i+17]; + yl[i+8] = yb[i+16]/16.f; + yl[i+9] = yb[i+17]/4096.f; + } + + for (int row = 0; row < nr; row++) { + sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il); + } + + yb += QK4_0 * 16; + } + + for (int row = 0; row < nr; ++row) { + const float tot = simd_sum(sumf[row]); + if (tiisg == 0 && first_row + row < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; + } + } +} + +kernel void kernel_mul_mat_q4_0_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); +} + +kernel void kernel_mul_mat_q4_1_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + 
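// ---------------------------------------------------------------------------
// block_q_n_dot_y and mul_vec_q_n_f32 above compute the dot product of
// quantized rows with a float vector. The Metal code avoids bit shifts by
// pre-dividing the y values by 16, 256 or 4096, so a masked-but-unshifted
// nibble (e.g. q & 0x0F00, which is the quant value times 256) already
// contributes the right amount, and the -8 bias of q4_0 is folded into the
// "sumy * -8.f" term. A plain C++ reference for one q4_0 block, written with
// explicit shifts instead (illustrative helper, not ggml code):
#include <cstdint>

inline float dot_block_q4_0_ref(float d, const uint8_t qs[16], const float *y) {
    float acc = 0.0f;
    for (int j = 0; j < 16; ++j) {
        acc += y[j]      * (float)((qs[j] & 0x0F) - 8);
        acc += y[j + 16] * (float)((qs[j] >> 4)   - 8);
    }
    return d * acc;
}
// Identity exploited by the kernel: (q & 0x0F00) * (y / 256.f) equals
// ((q >> 8) & 0x0F) * y, so no per-element shift is needed on the GPU.
// ---------------------------------------------------------------------------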
mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); +} + +#define NB_Q8_0 8 + +kernel void kernel_mul_mat_q8_0_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + const int nr = N_DST; + const int nsg = N_SIMDGROUP; + const int nw = N_SIMDWIDTH; + + const int nb = ne00/QK8_0; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + const int first_row = (r0 * nsg + sgitg) * nr; + const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[NB_Q8_0]; + float sumf[nr]={0.f}; + + const int ix = tiisg/4; + const int il = tiisg%4; + + device const float * yb = y + ix * QK8_0 + NB_Q8_0*il; + + // each thread in a SIMD group deals with NB_Q8_0 quants at a time + for (int ib = ix; ib < nb; ib += nw/4) { + for (int i = 0; i < NB_Q8_0; ++i) { + yl[i] = yb[i]; + } + + for (int row = 0; row < nr; row++) { + device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il; + float sumq = 0.f; + for (int iq = 0; iq < NB_Q8_0; ++iq) { + sumq += qs[iq] * yl[iq]; + } + sumf[row] += sumq*x[ib+row*nb].d; + } + + yb += NB_Q8_0 * nw; + } + + for (int row = 0; row < nr; ++row) { + const float tot = simd_sum(sumf[row]); + if (tiisg == 0 && first_row + row < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; + } + } +} + +kernel void kernel_mul_mat_f16_f32_1row( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + const int64_t im = tgpig.z; + + device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + if (ne00 < 128) { + for (int i = tiisg; i < ne00; i += 32) { + sumf += (float) x[i] * (float) y[i]; + } + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } else { + device const half4 * x4 = (device const half4 *) x; + device const float4 * y4 = (device const float4 *) y; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k]; + } + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + +} + +#define N_F16_F32 4 + +kernel void kernel_mul_mat_f16_f32( + device const char * src0, + device const char * src1, + device float * 
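// ---------------------------------------------------------------------------
// kernel_mul_mat_q8_0_f32 above computes, for each output element, the dot
// product of one q8_0-quantized row of src0 with one float row of src1:
// dst = sum over blocks of d_block * sum_i qs[i] * y[i]. Serial C++ sketch of
// a single row (illustrative types, not ggml's):
#include <cstdint>

float dot_row_q8_0_ref(const float *d, const int8_t *qs, const float *y, int n_blocks) {
    // d[b] is the per-block scale, qs holds 32 signed quants per block.
    float total = 0.0f;
    for (int b = 0; b < n_blocks; ++b) {
        float acc = 0.0f;
        for (int i = 0; i < 32; ++i) acc += (float)qs[b*32 + i] * y[b*32 + i];
        total += d[b] * acc;
    }
    return total;
}
// The Metal version splits the blocks of a row across the 32 lanes of a SIMD
// group and combines the per-lane partial sums with simd_sum at the end.
// ---------------------------------------------------------------------------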
dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int64_t r0 = tgpig.x; + const int64_t rb = tgpig.y*N_F16_F32; + const int64_t im = tgpig.z; + + device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + + if (ne00 < 128) { + for (int row = 0; row < N_F16_F32; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00; i += 32) { + sumf += (float) x[i] * (float) y[i]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } else { + device const half4 * x4 = (device const half4 *)x; + for (int row = 0; row < N_F16_F32; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + device const float4 * y4 = (device const float4 *) y; + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } +} + +kernel void kernel_alibi_f32( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant float & m0, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float m_k = pow(m0, i2 + 1); + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1); + } +} + +kernel void kernel_rope( + device const void * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant 
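// ---------------------------------------------------------------------------
// kernel_alibi_f32 above adds the ALiBi positional bias to a row of attention
// scores: each head gets a slope m_k = m0^(head+1), and position i contributes
// m_k * (i - n + 1), i.e. zero at the newest column and an increasingly
// negative value further back. C++ sketch for one row (illustrative helper):
#include <cmath>

void alibi_row_ref(const float *scores, float *out, int n, float m0, int head) {
    const float m_k = std::pow(m0, (float)(head + 1)); // per-head slope, as in the kernel
    for (int i = 0; i < n; ++i) {
        out[i] = scores[i] + m_k * (float)(i - n + 1);  // 0 bias at the last position
    }
}
// ---------------------------------------------------------------------------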
uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int & n_past, + constant int & n_dims, + constant int & mode, + constant float & freq_base, + constant float & freq_scale, + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int64_t i3 = tgpig[2]; + const int64_t i2 = tgpig[1]; + const int64_t i1 = tgpig[0]; + + const bool is_neox = mode & 2; + + const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + + const float theta_0 = freq_scale * (float)p; + const float inv_ndims = -1.f/n_dims; + + if (!is_neox) { + for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*i0); + const float cos_theta = cos(theta); + const float sin_theta = sin(theta); + + device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[1]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[1] = x0*sin_theta + x1*cos_theta; + } + } else { + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); + const float cos_theta = cos(theta); + const float sin_theta = sin(theta); + + const int64_t i0 = ib*n_dims + ic/2; + + device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } + } + } +} + +kernel void kernel_cpy_f16_f16( + device const half * src0, + device half * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + dst_data[i00] = src[0]; + } +} + +kernel void kernel_cpy_f32_f16( + device const float * src0, + device half * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & 
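// ---------------------------------------------------------------------------
// kernel_rope above applies rotary position embeddings. In the non-NeoX path
// it rotates consecutive pairs (x[2k], x[2k+1]) by an angle
// theta_k = freq_scale * p * freq_base^(-2k/n_dims), where p is the absolute
// position. Serial C++ sketch of that path (illustrative only; the NeoX mode
// instead pairs element i with element i + n_dims/2):
#include <cmath>

void rope_row_ref(const float *x, float *y, int n_dims, int p,
                  float freq_base, float freq_scale) {
    for (int k = 0; k < n_dims / 2; ++k) {
        const float theta = freq_scale * (float)p * std::pow(freq_base, -2.0f * k / n_dims);
        const float c = std::cos(theta), s = std::sin(theta);
        y[2*k + 0] = x[2*k + 0] * c - x[2*k + 1] * s; // 2x2 rotation of each pair
        y[2*k + 1] = x[2*k + 0] * s + x[2*k + 1] * c;
    }
}
// ---------------------------------------------------------------------------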
nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + dst_data[i00] = src[0]; + } +} + +kernel void kernel_cpy_f32_f32( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + dst_data[i00] = src[0]; + } +} + +//============================================ k-quants ====================================================== + +#ifndef QK_K +#define QK_K 256 +#else +static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64"); +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +typedef struct { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + half d; // super-block scale for quantized scales + half dmin; // super-block scale for quantized mins +} block_q2_K; +// 84 bytes / block + +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + half d; // super-block scale +} block_q3_K; + +#if QK_K == 64 +typedef struct { + half d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +typedef struct { + half d; // super-block scale for quantized scales + half dmin; // super-block 
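// ---------------------------------------------------------------------------
// The cpy/dup/cont kernels above (and kernel_alibi_f32 earlier) share one
// trick: compute the flat element index n of the source row, then peel it
// back into destination coordinates (i3, i2, i1, i0) using the destination
// extents ne0..ne3, so a contiguous copy can also reshape. C++ sketch of the
// unflattening (illustrative):
#include <cstdint>

struct coords4 { int64_t i0, i1, i2, i3; };

inline coords4 unflatten_ref(int64_t n, int64_t ne0, int64_t ne1, int64_t ne2) {
    coords4 c;
    c.i3 = n / (ne2 * ne1 * ne0);
    n   -= c.i3 * ne2 * ne1 * ne0;
    c.i2 = n / (ne1 * ne0);
    n   -= c.i2 * ne1 * ne0;
    c.i1 = n / ne0;
    c.i0 = n - c.i1 * ne0;
    return c;
}
// The destination byte offset is then i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0,
// with nb* being per-dimension strides in bytes.
// ---------------------------------------------------------------------------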
scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +#endif + +#if QK_K == 64 +typedef struct { + half d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +#else +typedef struct { + half d; // super-block scale for quantized scales + half dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +// 176 bytes / block +#endif + +typedef struct { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + half d; // super-block scale +} block_q6_K; +// 210 bytes / block + +static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { + uchar4 r; + if (j < 4) { + r[0] = q[j+0] & 63; + r[2] = q[j+1] & 63; + r[1] = q[j+4] & 63; + r[3] = q[j+5] & 63; + } else { + r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4); + r[1] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + r[3] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4); + } + return r; +} + +//====================================== dot products ========================= + +kernel void kernel_mul_mat_q2_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int r2 = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; + + const int step = sizeof(block_q2_K) * nb; + +#if QK_K == 256 + const int ix = tiisg/8; // 0...3 + const int it = tiisg%8; // 0...7 + const int im = it/4; // 0 or 1 + const int ir = it%4; // 0...3 + const int is = (8*ir)/16;// 0 or 1 + + device const float * y4 = y + ix * QK_K + 128 * im + 8 * ir; + + for (int ib = ix; ib < nb; ib += 4) { + + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; + yl[i+ 8] = y4[i+32]; sumy[1] += yl[i+ 8]; + yl[i+16] = y4[i+64]; sumy[2] += yl[i+16]; + yl[i+24] = y4[i+96]; sumy[3] += yl[i+24]; + } + + device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*im + is; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const half * dh = &x[ib].d; + + for (int row = 0; row < N_DST; row++) { + + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003); + acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300); + acc1[1] += yl[i+ 8] * (qs[i/2] 
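// ---------------------------------------------------------------------------
// The K-quant blocks above (block_q2_K .. block_q6_K) share one idea: a
// super-block of QK_K weights carries one or two fp16 super-scales (d, dmin)
// plus small integer per-sub-block scales/mins (unpacked e.g. by
// get_scale_min_k4). A q4_K-style weight is then reconstructed as
// w = d * sc * q - dmin * m. Tiny illustrative helper (not ggml API):
inline float k_quant_weight_ref(float d, float dmin, unsigned sc, unsigned m, unsigned q) {
    // sc and m are the already-unpacked sub-block scale and min,
    // q is the raw quant value (e.g. 0..15 for q4_K).
    return d * (float)sc * (float)q - dmin * (float)m;
}
// q2_K uses 4-bit scales/mins with 2-bit quants; q6_K drops dmin and instead
// stores signed 8-bit scales with quants biased by 32.
// ---------------------------------------------------------------------------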
& 0x000c); + acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00); + acc1[2] += yl[i+16] * (qs[i/2] & 0x0030); + acc2[2] += yl[i+17] * (qs[i/2] & 0x3000); + acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0); + acc2[3] += yl[i+25] * (qs[i/2] & 0xc000); + } + float dall = dh[0]; + float dmin = dh[1] * 1.f/16.f; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f + + (acc1[1] + 1.f/256.f * acc2[1]) * (sc[2] & 0xF) * 1.f/ 4.f + + (acc1[2] + 1.f/256.f * acc2[2]) * (sc[4] & 0xF) * 1.f/16.f + + (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) - + dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0)); + + qs += step/2; + sc += step; + dh += step/2; + } + + y4 += 4 * QK_K; + } +#else + const int ix = tiisg/2; // 0...15 + const int it = tiisg%2; // 0...1 + + device const float * y4 = y + ix * QK_K + 8 * it; + + for (int ib = ix; ib < nb; ib += 16) { + + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; + yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8]; + yl[i+16] = y4[i+32]; sumy[2] += yl[i+16]; + yl[i+24] = y4[i+48]; sumy[3] += yl[i+24]; + } + + device const uint8_t * sc = (device const uint8_t *)x[ib].scales; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it; + device const half * dh = &x[ib].d; + + for (int row = 0; row < N_DST; row++) { + + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003); + acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300); + acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c); + acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00); + acc1[2] += yl[i+16] * (qs[i/2] & 0x0030); + acc2[2] += yl[i+17] * (qs[i/2] & 0x3000); + acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0); + acc2[3] += yl[i+25] * (qs[i/2] & 0xc000); + } + + float dall = dh[0]; + float dmin = dh[1]; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f + + (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f + + (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f + + (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) - + dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4)); + + qs += step/2; + sc += step; + dh += step/2; + } + + y4 += 16 * QK_K; + } +#endif + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; + } + } +} + +#if QK_K == 256 +kernel void kernel_mul_mat_q3_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + const int64_t r2 = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + + float yl[16]; + + 
const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int tid = tiisg/2; + const int ix = tiisg%2; + const int ip = tid/8; // 0 or 1 + const int il = tid/2 - 4*ip; // 0...3 + const int ir = tid%2; + const int n = 8; + const int l0 = n*ir; + + const uint16_t m1 = 1 << (4*ip + il); + const uint16_t m2 = m1 << 8; + + const int shift = 2*il; + const uint16_t qm1 = 0x0003 << shift; + const uint16_t qm2 = 0x0300 << shift; + const int32_t v1 = 4 << shift; + const int32_t v2 = 1024 << shift; + + const uint16_t s_shift1 = 4*ip; + const uint16_t s_shift2 = s_shift1 + 2*(il/2); + const int ik = 4 + (il%2); + + const int q_offset = 32*ip + l0; + const int y_offset = 128*ip + 32*il + l0; + + const int step = sizeof(block_q3_K) * nb / 2; + + device const float * y1 = yy + ix*QK_K + y_offset; + + float sumf1[2] = {0.f}, sumf2[2] = {0.f}; + for (int i = ix; i < nb; i += 2) { + + for (int l = 0; l < 8; ++l) { + yl[l+0] = y1[l+ 0]; + yl[l+8] = y1[l+16]; + } + + device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset); + device const uint16_t * h = (device const uint16_t *)(x[i].hmask + l0); + device const uint16_t * a = (device const uint16_t *)(x[i].scales); + device const half * dh = &x[i].d; + + for (int row = 0; row < 2; ++row) { + + const float d_all = (float)dh[0]; + const char2 scales = as_type((uint16_t)(((a[il] >> s_shift1) & kmask2) | (((a[ik] >> s_shift2) & kmask1) << 4))); + + float s1 = 0, s2 = 0; + for (int l = 0; l < n; l += 2) { + const uint16_t qs = q[l/2]; + s1 += yl[l+0] * ((int32_t)(qs & qm1) - ((h[l/2] & m1) ? 0 : v1)); + s2 += yl[l+1] * ((int32_t)(qs & qm2) - ((h[l/2] & m2) ? 0 : v2)); + } + float d = d_all * (s1 + 1.f/256.f * s2); + sumf1[row] += d * scales[0]; + sumf2[row] += d; + + s1 = s2 = 0; + for (int l = 0; l < n; l += 2) { + const uint16_t qs = q[l/2+8]; + s1 += yl[l+8] * ((int32_t)(qs & qm1) - ((h[l/2+8] & m1) ? 0 : v1)); + s2 += yl[l+9] * ((int32_t)(qs & qm2) - ((h[l/2+8] & m2) ? 
0 : v2)); + } + d = d_all * (s1 + 1.f/256.f * s2); + sumf1[row] += d * scales[1]; + sumf2[row] += d; + + q += step; + h += step; + a += step; + dh += step; + + } + + y1 += 2 * QK_K; + + } + + for (int row = 0; row < 2; ++row) { + const float sumf = (sumf1[row] - 32.f*sumf2[row]) / (1 << shift); + const float tot = simd_sum(sumf); + if (tiisg == 0) { + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot; + } + } +} +#else +kernel void kernel_mul_mat_q3_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + const int64_t r2 = tgpig.z; + + const int row = 2 * r0 + sgitg; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + const int ix = tiisg/4; + const int il = 4 * (tiisg%4);// 0, 4, 8, 12 + const int im = il/8; // 0, 0, 1, 1 + const int in = il%8; // 0, 4, 0, 4 + + float2 sum = {0.f, 0.f}; + + for (int i = ix; i < nb; i += 8) { + + const float d_all = (float)(x[i].d); + + device const uint16_t * q = (device const uint16_t *)(x[i].qs + il); + device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in); + device const uint16_t * s = (device const uint16_t *)(x[i].scales); + device const float * y = yy + i * QK_K + il; + + const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8); + const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f; + const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f; + const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f; + + for (int l = 0; l < 4; l += 2) { + const uint16_t hm = h[l/2] >> im; + sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4)) + + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16)) + + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64)) + + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256)); + sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024)) + + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096)) + + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384)) + + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 
0 : 65536)); + } + + } + const float sumf = sum[0] + sum[1] * 1.f/256.f; + + const float tot = simd_sum(sumf); + if (tiisg == 0) { + dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + } + +} +#endif + +#if QK_K == 256 +kernel void kernel_mul_mat_q4_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int ix = tiisg/8; // 0...3 + const int it = tiisg%8; // 0...7 + const int im = it/4; // 0 or 1 + const int ir = it%4; // 0...3 + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int r2 = tgpig.z; + //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int first_row = r0 * N_DST; + const int ib_row = first_row * nb; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + float yl[16]; + float yh[16]; + float sumf[N_DST]={0.f}, all_sum; + + const int step = sizeof(block_q4_K) * nb / 2; + + device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir; + + uint16_t sc16[4]; + thread const uint8_t * sc8 = (thread const uint8_t *)sc16; + + for (int ib = ix; ib < nb; ib += 4) { + + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+0] = y4[i+ 0]; sumy[0] += yl[i+0]; + yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8]; + yh[i+0] = y4[i+128]; sumy[2] += yh[i+0]; + yh[i+8] = y4[i+160]; sumy[3] += yh[i+8]; + } + + device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im; + device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir; + device const half * dh = &x[ib].d; + + for (int row = 0; row < N_DST; row++) { + + sc16[0] = sc[0] & kmask1; + sc16[1] = sc[2] & kmask1; + sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2); + sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2); + + device const uint16_t * q2 = q1 + 32; + + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+0] * (q1[i/2] & 0x000F); + acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00); + acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0); + acc1[3] += yl[i+9] * (q1[i/2] & 0xF000); + acc2[0] += yh[i+0] * (q2[i/2] & 0x000F); + acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00); + acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0); + acc2[3] += yh[i+9] * (q2[i/2] & 0xF000); + } + + float dall = dh[0]; + float dmin = dh[1]; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] + + (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f + + (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] + + (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) - + dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); + + q1 += step; + sc += step; + dh += step; + } + + y4 += 4 * QK_K; + } + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; + } + } +} +#else +kernel void 
kernel_mul_mat_q4_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int ix = tiisg/4; // 0...7 + const int it = tiisg%4; // 0...3 + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int r2 = tgpig.z; + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + float yl[8]; + float yh[8]; + float sumf[N_DST]={0.f}, all_sum; + + const int step = sizeof(block_q4_K) * nb / 2; + + device const float * y4 = y + ix * QK_K + 8 * it; + + uint16_t sc16[4]; + + for (int ib = ix; ib < nb; ib += 8) { + + float2 sumy = {0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i] = y4[i+ 0]; sumy[0] += yl[i]; + yh[i] = y4[i+32]; sumy[1] += yh[i]; + } + + device const uint16_t * sc = (device const uint16_t *)x[ib].scales; + device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it; + device const half * dh = x[ib].d; + + for (int row = 0; row < N_DST; row++) { + + sc16[0] = sc[0] & 0x000f; + sc16[1] = sc[0] & 0x0f00; + sc16[2] = sc[0] & 0x00f0; + sc16[3] = sc[0] & 0xf000; + + float2 acc1 = {0.f, 0.f}; + float2 acc2 = {0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1[0] += yl[i+0] * (qs[i/2] & 0x000F); + acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00); + acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0); + acc2[1] += yh[i+1] * (qs[i/2] & 0xF000); + } + + float dall = dh[0]; + float dmin = dh[1]; + sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] + + (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) - + dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f); + + qs += step; + sc += step; + dh += step; + } + + y4 += 8 * QK_K; + } + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0+ r2*ne0*ne1 + first_row + row] = all_sum; + } + } +} +#endif + +kernel void kernel_mul_mat_q5_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + const int r2 = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + + float sumf[2]={0.f}; + + const int step = sizeof(block_q5_K) * nb; + +#if QK_K == 256 +# + 
float yl[16], yh[16]; + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = tiisg/4; + const int ix = tiisg%4; + const int im = tid/4; + const int ir = tid%4; + const int n = 8; + + const int l0 = n*ir; + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1u << (2*im); + const uint8_t hm2 = hm1 << 1; + const uint8_t hm3 = hm1 << 4; + const uint8_t hm4 = hm2 << 4; + + uint16_t sc16[4]; + thread const uint8_t * sc8 = (thread const uint8_t *)sc16; + + device const float * y1 = yy + ix*QK_K + y_offset; + + for (int i = ix; i < nb; i += 4) { + + device const uint8_t * q1 = x[i].qs + q_offset; + device const uint8_t * qh = x[i].qh + l0; + device const half * dh = &x[i].d; + device const uint16_t * a = (device const uint16_t *)x[i].scales + im; + + device const float * y2 = y1 + 128; + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < 8; ++l) { + yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0]; + yl[l+8] = y1[l+32]; sumy[1] += yl[l+8]; + yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0]; + yh[l+8] = y2[l+32]; sumy[3] += yh[l+8]; + } + + for (int row = 0; row < 2; ++row) { + + device const uint8_t * q2 = q1 + 64; + + sc16[0] = a[0] & kmask1; + sc16[1] = a[2] & kmask1; + sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2); + sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2); + + float4 acc = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < n; ++l) { + uint8_t h = qh[l]; + acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0)); + acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0)); + acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0)); + acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0)); + } + const float dall = dh[0]; + const float dmin = dh[1]; + sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) - + dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); + + q1 += step; + qh += step; + dh += step/2; + a += step/2; + + } + + y1 += 4 * QK_K; + + } +#else + float yl[8], yh[8]; + + const int il = 4 * (tiisg/8); // 0, 4, 8, 12 + const int ix = tiisg%8; + const int im = il/8; // 0, 0, 1, 1 + const int in = il%8; // 0, 4, 0, 4 + + device const float * y = yy + ix*QK_K + il; + + for (int i = ix; i < nb; i += 8) { + + for (int l = 0; l < 4; ++l) { + yl[l+0] = y[l+ 0]; + yl[l+4] = y[l+16]; + yh[l+0] = y[l+32]; + yh[l+4] = y[l+48]; + } + + device const half * dh = &x[i].d; + device const uint8_t * q = x[i].qs + il; + device const uint8_t * h = x[i].qh + in; + device const int8_t * s = x[i].scales; + + for (int row = 0; row < 2; ++row) { + + const float d = dh[0]; + + float2 acc = {0.f, 0.f}; + for (int l = 0; l < 4; ++l) { + const uint8_t hl = h[l] >> im; + acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16)) + + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16)); + acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256)) + + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 
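// ---------------------------------------------------------------------------
// q5_K above extends q4_K with one extra bit per weight stored in qh: the
// 4-bit nibble gives the low bits and the qh bit contributes +16, so the
// quant range becomes 0..31 before scaling. Illustrative C++ helper:
#include <cstdint>

inline int q5_value_ref(uint8_t nibble, bool high_bit) {
    return (int)(nibble & 0x0F) + (high_bit ? 16 : 0); // 5-bit quant, 0..31
}
// In the kernel this appears as (q1[l] & 0x0F) + (h & hm1 ? 16 : 0), and the
// final weight is d * sc * q5 - dmin * m as for the other K-quants.
// ---------------------------------------------------------------------------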
0 : 256)); + } + sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]); + + q += step; + h += step; + s += step; + dh += step/2; + + } + + y += 8 * QK_K; + } +#endif + + for (int row = 0; row < 2; ++row) { + const float tot = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot; + } + } + +} + +kernel void kernel_mul_mat_q6_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const uint8_t kmask1 = 0x03; + const uint8_t kmask2 = 0x0C; + const uint8_t kmask3 = 0x30; + const uint8_t kmask4 = 0xC0; + + const int nb = ne00/QK_K; + + const int64_t r0 = tgpig.x; + const int64_t r1 = tgpig.y; + const int r2 = tgpig.z; + + const int row = 2 * r0 + sgitg; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; + + float sumf = 0; + +#if QK_K == 256 + const int tid = tiisg/2; + const int ix = tiisg%2; + const int ip = tid/8; // 0 or 1 + const int il = tid%8; + const int n = 4; + const int l0 = n*il; + const int is = 8*ip + l0/16; + + const int y_offset = 128*ip + l0; + const int q_offset_l = 64*ip + l0; + const int q_offset_h = 32*ip + l0; + + for (int i = ix; i < nb; i += 2) { + + device const uint8_t * q1 = x[i].ql + q_offset_l; + device const uint8_t * q2 = q1 + 32; + device const uint8_t * qh = x[i].qh + q_offset_h; + device const int8_t * sc = x[i].scales + is; + + device const float * y = yy + i * QK_K + y_offset; + + const float dall = x[i].d; + + float4 sums = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < n; ++l) { + sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); + sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); + sums[2] += y[l+64] * ((int8_t)((q1[l] >> 4) | ((qh[l] & kmask3) << 0)) - 32); + sums[3] += y[l+96] * ((int8_t)((q2[l] >> 4) | ((qh[l] & kmask4) >> 2)) - 32); + } + + sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]); + + } + +#else + const int ix = tiisg/4; + const int il = 4*(tiisg%4); + + for (int i = ix; i < nb; i += 8) { + device const float * y = yy + i * QK_K + il; + device const uint8_t * ql = x[i].ql + il; + device const uint8_t * qh = x[i].qh + il; + device const int8_t * s = x[i].scales; + + const float d = x[i].d; + + float4 sums = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < 4; ++l) { + sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); + sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); + sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32); + sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32); + } + sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]); + } + +#endif + + const float tot = simd_sum(sumf); + if (tiisg == 0) { + dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + } +} + +//============================= templates and their specializations ============================= + 
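// ---------------------------------------------------------------------------
// q6_K above reassembles a 6-bit quant from two arrays: ql supplies the low
// 4 bits, qh supplies the upper 2 bits, and the result is re-centred by
// subtracting 32 before the signed per-sub-block scale is applied.
// Illustrative C++ helper:
#include <cstdint>

inline int q6_value_ref(uint8_t ql_nibble, uint8_t qh_bits) {
    // ql_nibble: 0..15 (low 4 bits), qh_bits: 0..3 (upper 2 bits)
    return (int)((ql_nibble & 0x0F) | ((qh_bits & 0x03) << 4)) - 32; // signed range -32..31
}
// The kernel's ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32) is
// exactly this reconstruction, fused with the dot product against y.
// ---------------------------------------------------------------------------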
+template +void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { + half4x4 temp = *(((device half4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + +template +void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 1); + const half d = il ? (xb->d / 16.h) : xb->d; + const half m = il ? ( -8.h * 16.h) : -8.h; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = il ? 0xF000 : 0x0F00; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) + m) * d; + reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) + m) * d; + } +} + +template +void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 2); + const half d = il ? (xb->d / 16.h) : xb->d; + const half m = xb->m; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = il ? 0xF000 : 0x0F00; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) * d) + m; + reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) * d) + m; + } +} + +template +void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { + device const int8_t * qs = ((device const int8_t *)xb->qs); + const half d = xb->d; + + for (int i=0;i<16;i++) { + reg[i/4][i%4] = (qs[i + 16*il] * d); + } +} + +template +void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { + const half d = xb->d; + const half min = xb->dmin; + device const uint8_t * q = (device const uint8_t *)xb->qs; + half dl, ml; + uint8_t sc = xb->scales[il]; + +#if QK_K == 256 + q = q + 32*(il/8) + 16*(il&1); + il = (il/2)%4; +#endif + half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template +void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { + const float d_all = (float)(xb->d); + device const uint8_t * q = (device const uint8_t *)xb->qs; + device const uint8_t * h = (device const uint8_t *)xb->hmask; + device const int8_t * scales = (device const int8_t *)xb->scales; + +#if QK_K == 256 + q = q + 32 * (il/8) + 16 * (il&1); + h = h + 16 * (il&1); + uint8_t m = 1 << (il/2); + uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ + ((il/4)>0 ? 12 : 3); + uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; + uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; + int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) : \ + (scale_2&kmask2) | ((scale_1&kmask1) << 4); + float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); + + il = (il/2)%4; + float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i] & m) ? 0 : 4.f/coef)); + } +#else + float kcoef = il&1 ? 1.f/16.f : 1.f; + uint16_t kmask = il&1 ? 0xF0 : 0x0F; + float dl = d_all * ((scales[il/2] & kmask) * kcoef - 8); + float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + uint8_t m = 1<<(il*2); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 
0 : 4.f/coef)); + } +#endif +} + +template +void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { + device const uint8_t * q = xb->qs; + +#if QK_K == 256 + const float d = (float)(xb->d); + const float min = (float)(xb->dmin); + short is = (il/4) * 2; + q = q + (il/4) * 32 + 16 * (il&1); + il = il%4; + const uchar4 sc = get_scale_min_k4(is, xb->scales); + const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; + const float ml = il<2 ? min * sc[1] : min * sc[3]; +#else + q = q + 16 * (il&1); + device const uint8_t * s = xb->scales; + device const half2 * dh = (device const half2 *)xb->d; + const float2 d = (float2)dh[0]; + const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h; + const float ml = il<2 ? d[1] * (s[0]>>4) : d[1 ]* (s[1]>>4); +#endif + const ushort mask = il<2 ? 0x0F : 0xF0; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template +void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) { + device const uint8_t * q = xb->qs; + device const uint8_t * qh = xb->qh; + +#if QK_K == 256 + const float d = (float)(xb->d); + const float min = (float)(xb->dmin); + short is = (il/4) * 2; + q = q + 32 * (il/4) + 16 * (il&1); + qh = qh + 16 * (il&1); + uint8_t ul = 1 << (il/2); + il = il%4; + const uchar4 sc = get_scale_min_k4(is, xb->scales); + const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; + const float ml = il<2 ? min * sc[1] : min * sc[3]; + + const ushort mask = il<2 ? 0x0F : 0xF0; + const float qh_val = il<2 ? 16.f : 256.f; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; + } +#else + q = q + 16 * (il&1); + device const int8_t * s = xb->scales; + const float dl = xb->d * s[il]; + uint8_t m = 1<<(il*2); + const float coef = il<2 ? 1.f : 1.f/16.f; + const ushort mask = il<2 ? 0x0F : 0xF0; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef)); + } +#endif +} + +template +void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { + const float d_all = (float)(xb->d); + device const uint8_t * ql = (device const uint8_t *)xb->ql; + device const uint8_t * qh = (device const uint8_t *)xb->qh; + device const int8_t * scales = (device const int8_t *)xb->scales; + +#if QK_K == 256 + ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); + qh = qh + 32*(il/8) + 16*(il&1); + float sc = scales[(il%2) + 2 * ((il/2))]; + il = (il/2)%4; +#else + ql = ql + 16 * (il&1); + float sc = scales[il]; +#endif + for (int i = 0; i < 16; ++i) { + uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; + const float coef = il>1 ? 1.f/16.f : 1.f; + float q = il&1 ? 
((ql[i]&kmask2)|((qh[i]&kmask1)<<2)) - 32.f/coef : \ + ((ql[i]&kmask2)|((qh[i]&kmask1)<<4)) - 32.f/coef; + reg[i/4][i%4] = d_all * sc * q * coef; + } +} + +template +kernel void kernel_get_rows( + device const void * src0, + device const int * src1, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb1, + uint tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tptg[[threads_per_threadgroup]]) { + const int i = tgpig; + const int r = ((device int32_t *) src1)[i]; + + for (int ind = tiitg; ind < ne00/16; ind += tptg) { + float4x4 temp; + dequantize_func( + ((device const block_q *) ((device char *) src0 + r*nb01)) + ind/nl, ind%nl, temp); + *(((device float4x4 *) ((device char *) dst + i*nb1)) + ind) = temp; + } +} + +#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A +#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A +#define BLOCK_SIZE_K 32 +#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A +#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B +#define THREAD_PER_BLOCK 128 +#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers +#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers +#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8 +#define SG_MAT_ROW 8 + +// each block_q contains 16*nl weights +template +kernel void kernel_mul_mm(device const uchar * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & gqa, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + threadgroup half * sa = ((threadgroup half *)shared_memory); + threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); + + const uint r0 = tgpig.y; + const uint r1 = tgpig.x; + const uint im = tgpig.z; + // if this block is of 64x32 shape or smaller + short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; + short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + // a thread shouldn't load data outside of the matrix + short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? 
((short)tiitg/THREAD_PER_COL) : n_cols - 1; + + simdgroup_half8x8 ma[4]; + simdgroup_float8x8 mb[2]; + simdgroup_float8x8 c_res[8]; + for (int i = 0; i < 8; i++){ + c_res[i] = make_filled_simdgroup_matrix(0.f); + } + + short il = (tiitg % THREAD_PER_ROW); + uint offset0 = im/gqa*nb02; ushort offset1 = il/nl; + device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; + device const float * y = src1 + (r1 * BLOCK_SIZE_N + thread_col) * ne00 \ + + BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL) + im * ne00 * ne1; + + for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { + //load data and store to threadgroup memory + half4x4 temp_a; + dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); + #pragma unroll(16) + for (int i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ + + 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \ + + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + } + *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \ + = *((device float2x4 *)y); + il = (il + 2 < nl) ? il + 2 : il % 2; + x = (il < 2) ? x + (2+nl-1)/nl : x; + y += BLOCK_SIZE_K; + + threadgroup_barrier(mem_flags::mem_threadgroup); + //load matrices from threadgroup memory and conduct outer products + threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); + threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + #pragma unroll(4) + for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + #pragma unroll(4) + for (int i = 0; i < 4; i++) { + simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i); + } + simdgroup_barrier(mem_flags::mem_none); + #pragma unroll(2) + for (int i = 0; i < 2; i++) { + simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i); + } + + lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; + lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; + #pragma unroll(8) + for (int i = 0; i < 8; i++){ + simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]); + } + } + } + + if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) { + device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \ + + (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0; + for (int i = 0; i < 8; i++) { + simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0); + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix + threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ + + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; + for (int i = 0; i < 8; i++) { + simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; + if (sgitg==0) { + for (int i = 0; i < n_rows; i++) { + for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) { + *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); + } + } + } + } +} + +#if QK_K == 256 +#define QK_NL 16 +#else +#define QK_NL 4 +#endif + +typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \ + constant uint64_t &, constant uint64_t &, uint, uint, uint); + +template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows; +template 
[[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows; + +typedef void (mat_mm_t)(device const uchar *, device const float *, device float *, constant int64_t &,\ + constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \ + constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint); + +template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; diff --git a/seamless_communication/ggml/src/ggml-opencl.cpp b/seamless_communication/ggml/src/ggml-opencl.cpp new file mode 100644 index 0000000..777048d --- /dev/null +++ b/seamless_communication/ggml/src/ggml-opencl.cpp @@ -0,0 +1,1865 @@ +#include "ggml-opencl.h" + +#include +#include +#include +#include +#include + +#define CL_TARGET_OPENCL_VERSION 110 +#include + +#include +#include +#include + +#include "ggml.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#define CL_DMMV_BLOCK_SIZE 32 + +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 1 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + +#define MULTILINE_QUOTE(...) 
#__VA_ARGS__ +static std::string program_source = MULTILINE_QUOTE( + +typedef char int8_t; +typedef uchar uint8_t; +typedef short int16_t; +typedef ushort uint16_t; +typedef int int32_t; +typedef uint uint32_t; + +struct __attribute__ ((packed)) block_q4_0 +{ + half d; + uint8_t qs[QK4_0 / 2]; +}; + +struct __attribute__ ((packed)) block_q4_1 +{ + half d; + half m; + uint8_t qs[QK4_1 / 2]; +}; + +struct __attribute__ ((packed)) block_q5_0 +{ + half d; + uint32_t qh; + uint8_t qs[QK5_0 / 2]; +}; + +struct __attribute__ ((packed)) block_q5_1 +{ + half d; + half m; + uint32_t qh; + uint8_t qs[QK5_1 / 2]; +}; + +struct __attribute__ ((packed)) block_q8_0 +{ + half d; + int8_t qs[QK8_0]; +}; + +struct __attribute__((packed)) block_q2_K +{ + uint8_t scales[16]; + uint8_t qs[64]; + half d; + half dmin; +}; + +struct __attribute__((packed)) block_q3_K +{ + uint8_t hmask[32]; + uint8_t qs[64]; + uint8_t scales[12]; + half d; +}; + +struct __attribute__((packed)) block_q4_K +{ + half d; + half dmin; + uint8_t scales[12]; + uint8_t qs[128]; +}; + +struct __attribute__((packed)) block_q5_K +{ + half d; + half dmin; + uint8_t scales[12]; + uint8_t qh[32]; + uint8_t qs[128]; +}; + +struct __attribute__((packed)) block_q6_K +{ + uint8_t ql[128]; + uint8_t qh[64]; + int8_t scales[16]; + half d; +}; + +__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { + const uint i = get_global_id(0); + + y[i] = vload_half(0, &x[i]); +} + +void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = (vi0 - 8)*d; + *v1 = (vi1 - 8)*d; +} +void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + const float m = vload_half(0, &x[ib].m); + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = vi0*d + m; + *v1 = vi1*d + m; +} +void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; + + *v0 = x0*d; + *v1 = x1*d; +} +void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + const float m = vload_half(0, &x[ib].m); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); + + *v0 = x0*d + m; + *v1 = x1*d + m; +} +void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, &x[ib].d); + + const int8_t vi0 = x[ib].qs[iqs + 0]; + const int8_t vi1 = x[ib].qs[iqs + 1]; + + *v0 = vi0*d; + *v1 = vi1*d; +} +void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ + *v0 = vload_half(0, &x[ib + 0]); + *v1 = vload_half(0, &x[ib + 1]); +} +); + +static std::string k_quants_source = MULTILINE_QUOTE( +inline void 
get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) +{ + if (j < 4) + { + *d = q[j] & 63; + *m = q[j + 4] & 63; + } + else + { + *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); + *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); + } +} + +__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy) +{ + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int n = tid / 32; + const int l = tid - 32 * n; + const int is = 8 * n + l / 16; + + const uint8_t q = x[i].qs[32 * n + l]; + __global float *y = yy + i * QK_K + 128 * n; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4); + y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4); + y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4); + y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4); +} + +__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy) +{ + int r = get_local_id(0) / 4; + int i = get_group_id(0); + int tid = r / 2; + int is0 = r % 2; + int l0 = 16 * is0 + 4 * (get_local_id(0) % 4); + int n = tid / 4; + int j = tid - 4 * n; + + uint8_t m = 1 << (4 * n + j); + int is = 8 * n + 2 * j + is0; + int shift = 2 * j; + + int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) + : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) + : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) + : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4); + float d_all = vload_half(0, &x[i].d); + float dl = d_all * (us - 32); + + __global float *y = yy + i * QK_K + 128 * n + 32 * j; + const __global uint8_t *q = x[i].qs + 32 * n; + const __global uint8_t *hm = x[i].hmask; + + for (int l = l0; l < l0 + 4; ++l) + y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 
0 : 4)); +} + +__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy) +{ + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int il = tid / 8; + const int ir = tid % 8; + const int is = 2 * il; + const int n = 4; + + __global float *y = yy + i * QK_K + 64 * il + n * ir; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint8_t *q = x[i].qs + 32 * il + n * ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + float d1 = dall * sc; + float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + float d2 = dall * sc; + float m2 = dmin * m; + for (int l = 0; l < n; ++l) + { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l + 32] = d2 * (q[l] >> 4) - m2; + } +} + +__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy) +{ + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int il = tid / 16; + const int ir = tid % 16; + const int is = 2 * il; + + __global float *y = yy + i * QK_K + 64 * il + 2 * ir; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir; + __global const uint8_t *qh = x[i].qh + 2 * ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + uint8_t hm = 1 << (2 * il); + y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1; + y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2; +} + +__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy) +{ + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int ip = tid / 32; + const int il = tid - 32 * ip; + const int is = 8 * ip + il / 16; + + __global float *y = yy + i * QK_K + 128 * ip + il; + + const float d = vload_half(0, &x[i].d); + + __global const uint8_t *ql = x[i].ql + 64 * ip + il; + const uint8_t qh = x[i].qh[32 * ip + il]; + __global const int8_t *sc = x[i].scales + is; + + y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +} + +__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + + const int row = get_group_id(0); + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + __global const struct block_q2_K * x = xx + ib0; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
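+    // Thread layout (descriptive note): the host code later in this file launches these kernels
+    // with a local size of CL_DMMV_BLOCK_SIZE (32). tid/ix split that into K_QUANTS_PER_ITERATION
+    // interleaved groups: each group walks the row's super-blocks with stride
+    // K_QUANTS_PER_ITERATION, while im/in select which 128-value half of a block and which lane
+    // within that half the work-item covers.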
+ const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + tmp[16 * ix + tid] = 0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * q = x[i].qs + q_offset; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp[16 * ix + tid] += dall * sum1 - dmin * sum2; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int row = get_group_id(0); + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + __global const struct block_q3_K * x = xx + ib0; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
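+    // block_q3_K packs sixteen 6-bit scales into 12 bytes: the utmp[] unpacking in the block
+    // loop below rebuilds them (low 4 bits from scales[0..7], high 2 bits from scales[8..11]),
+    // and hmask supplies the missing third bit of every 3-bit quant, which is why
+    // (h[l] & m) toggles the extra offset of 4.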
+ const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * q = x[i].qs + q_offset; + __global const uint8_t * h = x[i].hmask + l0; + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = vload_half(0, &x[i].d); + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp[16 * ix + tid] += d * sum; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + + //to rename it later, just to test now + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int row = get_group_id(0); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; + + const int step = 8/K_QUANTS_PER_ITERATION; + + const int il = tid/step; // 0...3 + const int ir = tid - step*il;// 0...3 + const int n = 2*K_QUANTS_PER_ITERATION; + + const int im = il/2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + __global const struct block_q4_K * x = xx + ib0; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const uint8_t * q1 = x[i].qs + q_offset; + __global const uint8_t * q2 = q1 + 64; + __global const float * y1 = yy + i*QK_K + y_offset; + __global const float * y2 = y1 + 128; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 s = (float4)(0.f); + float smin = 0; + for (int l = 0; l < n; ++l) { + s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4); + s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4); + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int row = get_group_id(0); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const int tid = get_local_id(0)/2; // 0...15 + const int ix = get_local_id(0)%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + __global const struct block_q5_K * x = xx + ib0; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + __global const uint8_t * ql1 = x[i].qs + q_offset; + __global const uint8_t * ql2 = ql1 + 64; + __global const uint8_t * qh = x[i].qh + l0; + __global const float * y1 = yy + i*QK_K + y_offset; + __global const float * y2 = y1 + 128; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 sum = (float4)(0.f); + float smin = 0; + for (int l = 0; l < n; ++l) { + sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) + + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 
16 : 0)) + + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) + + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) + + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) { + + const int row = get_group_id(0); + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + __global const struct block_q6_K * x = xx + ib0; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + +\n#if K_QUANTS_PER_ITERATION == 1\n + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; + +\n#else\n + + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; + +\n#endif\n + + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + tmp[16 * ix + tid] = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * ql = x[i].ql + ql_offset; + __global const uint8_t * qh = x[i].qh + qh_offset; + __global const int8_t * s = x[i].scales + s_offset; + + const float d = vload_half(0, &x[i].d); + +\n#if K_QUANTS_PER_ITERATION == 1\n + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp[16 * ix + tid] += sum; +\n#else\n + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp[16 * ix + tid] += sum; +\n#endif\n + + } + + // sum up partial sums and write back result + 
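+    // Standard tree reduction in local memory: after each barrier the lower half of the active
+    // work-items folds tmp[tid + s] into tmp[tid], so tmp[0] ends up holding the full dot
+    // product for this row before it is written to dst[row].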
barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +); + + +std::string dequant_template = MULTILINE_QUOTE( +__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { + const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; + + if (i >= get_global_size(0)) { + return; + } + + const uint qk = QUANT_K; + const uint qr = QUANT_R; + + const int ib = i/qk; // block index + const int iqs = (i%qk)/qr; // quant index + const int iybs = i - i%qk; // y block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + float v0, v1; + DEQUANT_FUNC(x, ib, iqs, &v0, &v1); + y[iybs + iqs + 0] = v0; + y[iybs + iqs + y_offset] = v1; +} +); + +std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE( +__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_group_id(0); + const int tid = get_local_id(0); + + const uint qk = QUANT_K; + const uint qr = QUANT_R; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + DEQUANT_FUNC(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} +); + + +std::string mul_template = MULTILINE_QUOTE( +__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { + const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); + + if (i >= get_global_size(0)) { + return; + } + + dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky]; +} +); + +#define CL_CHECK(err) \ + do { \ + cl_int err_ = (err); \ + if (err_ != CL_SUCCESS) { \ + fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ + #err, err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +#define CLBLAST_CHECK(err) \ + do { \ + CLBlastStatusCode err_ = (err); \ + if (err_ != CLBlastSuccess) { \ + fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ + #err, err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +std::array dequant_str_keys = { + "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC" +}; + +std::array dequant_str_values = { + "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0", + "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", + "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", + "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", + "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", + "convert_row_f16", "half", "1", "1", "convert_f16" +}; + +std::array dequant_mul_mat_vec_str_values = { + "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0", + 
"dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", + "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", + "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", + "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", + "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16" +}; + +std::array mul_str_keys = { + "KERNEL_NAME", "TYPE" +}; +std::array mul_str_values = { + "mul_f32", "float" +}; + +std::string& replace(std::string& s, const std::string& from, const std::string& to) { + size_t pos = 0; + while ((pos = s.find(from, pos)) != std::string::npos) { + s.replace(pos, from.length(), to); + pos += to.length(); + } + return s; +} + +std::string generate_kernels() { + std::stringstream src; + src << program_source << '\n'; + src << k_quants_source << '\n'; + for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) { + std::string dequant_kernel = dequant_template; + std::string dmmv_kernel = dequant_mul_mat_vec_template; + for (size_t j = 0; j < dequant_str_keys.size(); j++) { + replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]); + replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]); + } + src << dequant_kernel << '\n'; + src << dmmv_kernel << '\n'; + } + for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) { + std::string mul_kernel = mul_template; + for (size_t j = 0; j < mul_str_keys.size(); j++) { + replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]); + } + src << mul_kernel << '\n'; + } + + return src.str(); +} + +static cl_platform_id platform; +static cl_device_id device; +static cl_context context; +static cl_command_queue queue; +static cl_program program; +static cl_kernel convert_row_f16_cl; +static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; +static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; +static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; +static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; +static cl_kernel mul_f32_cl; +static bool fp16_support; + +static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { + cl_program p; + char *program_log; + size_t program_size; + size_t log_size; + int err; + + program_size = strlen(program_buffer); + + p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); + if(err < 0) { + fprintf(stderr, "OpenCL error creating program"); + exit(1); + } + + std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math " + "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 " + "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION); + + err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL); + if(err < 0) { + + clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + program_log = (char*) malloc(log_size + 1); + 
program_log[log_size] = '\0'; + clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); + fprintf(stderr, "ggml_opencl: kernel compile error:\n\n%s\n", program_log); + free(program_log); + exit(1); + } + + return p; +} + +void ggml_cl_init(void) { + cl_int err; + + struct cl_device; + struct cl_platform { + cl_platform_id id; + unsigned number; + char name[128]; + char vendor[128]; + struct cl_device * devices; + unsigned n_devices; + struct cl_device * default_device; + }; + + struct cl_device { + struct cl_platform * platform; + cl_device_id id; + unsigned number; + cl_device_type type; + char name[128]; + }; + + enum { NPLAT = 16, NDEV = 16 }; + + struct cl_platform platforms[NPLAT]; + unsigned n_platforms = 0; + struct cl_device devices[NDEV]; + unsigned n_devices = 0; + struct cl_device * default_device = NULL; + + platform = NULL; + device = NULL; + + cl_platform_id platform_ids[NPLAT]; + CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms)); + + for (unsigned i = 0; i < n_platforms; i++) { + struct cl_platform * p = &platforms[i]; + p->number = i; + p->id = platform_ids[i]; + CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL)); + CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL)); + + cl_device_id device_ids[NDEV]; + cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices); + if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) { + p->n_devices = 0; + } else { + CL_CHECK(clGetDeviceIDsError); + } + p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL; + p->default_device = NULL; + + for (unsigned j = 0; j < p->n_devices; j++) { + struct cl_device * d = &devices[n_devices]; + d->number = n_devices++; + d->id = device_ids[j]; + d->platform = p; + CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL)); + CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL)); + + if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) { + p->default_device = d; + } + } + + if (default_device == NULL && p->default_device != NULL) { + default_device = p->default_device; + } + } + + if (n_devices == 0) { + fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n"); + exit(1); + } + + char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); + char * user_device_string = getenv("GGML_OPENCL_DEVICE"); + int user_platform_number = -1; + int user_device_number = -1; + + unsigned n; + if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { + user_platform_number = (int)n; + } + if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) { + user_device_number = (int)n; + } + if (user_platform_number != -1 && user_device_number != -1) { + cl_platform* platform = &platforms[user_platform_number]; + if ((unsigned)user_device_number >= platform->n_devices) { + fprintf(stderr, "ggml_opencl: invalid device number %d\n", user_device_number); + exit(1); + } + default_device = &platform->devices[user_device_number]; + } else { + + struct cl_device * selected_devices = devices; + unsigned n_selected_devices = n_devices; + + if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { + for (unsigned i = 0; i < n_platforms; i++) { + struct cl_platform * p = &platforms[i]; + if (strstr(p->name, user_platform_string) != NULL || + strstr(p->vendor, user_platform_string) != NULL) 
{ + user_platform_number = (int)i; + break; + } + } + if (user_platform_number == -1) { + fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string); + exit(1); + } + } + if (user_platform_number != -1) { + struct cl_platform * p = &platforms[user_platform_number]; + selected_devices = p->devices; + n_selected_devices = p->n_devices; + default_device = p->default_device; + if (n_selected_devices == 0) { + fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); + exit(1); + } + } + + if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { + for (unsigned i = 0; i < n_selected_devices; i++) { + struct cl_device * d = &selected_devices[i]; + if (strstr(d->name, user_device_string) != NULL) { + user_device_number = d->number; + break; + } + } + if (user_device_number == -1) { + fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string); + exit(1); + } + } + if (user_device_number != -1) { + selected_devices = &devices[user_device_number]; + n_selected_devices = 1; + default_device = &selected_devices[0]; + } + + GGML_ASSERT(n_selected_devices > 0); + + if (default_device == NULL) { + default_device = &selected_devices[0]; + } + } + + fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name); + fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name); + if (default_device->type != CL_DEVICE_TYPE_GPU) { + fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name); + } + + platform = default_device->platform->id; + device = default_device->id; + + size_t ext_str_size; + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); + char *ext_buffer = (char *)alloca(ext_str_size + 1); + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); + ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated + // Check if ext_buffer contains cl_khr_fp16 + fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL; + fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false"); + + cl_context_properties properties[] = { + (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0 + }; + + CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); + + CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err), + (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? 
err : + (queue = clCreateCommandQueue(context, device, 0, &err), err) + ))); + + const std::string kernel_src = generate_kernels(); + + program = build_program_from_source(context, device, kernel_src.c_str()); + + // FP16 to FP32 kernel + CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err)); + + // Dequantize kernels + CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err)); + CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err)); + CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err)); + CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err)); + CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); + CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); + CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err)); + CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err)); + CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err)); + CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err)); + CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err)); + + // dequant mul mat kernel + CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err)); + CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err)); + + // mul kernel + CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); +} + +static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return &dequantize_row_q4_0_cl; + case GGML_TYPE_Q4_1: + return &dequantize_row_q4_1_cl; + case GGML_TYPE_Q5_0: + return &dequantize_row_q5_0_cl; + case GGML_TYPE_Q5_1: + return &dequantize_row_q5_1_cl; + case GGML_TYPE_Q8_0: + return &dequantize_row_q8_0_cl; + case GGML_TYPE_Q2_K: + return &dequantize_block_q2_k_cl; + case GGML_TYPE_Q3_K: + return &dequantize_block_q3_k_cl; + case GGML_TYPE_Q4_K: + return &dequantize_block_q4_k_cl; + case GGML_TYPE_Q5_K: + return &dequantize_block_q5_k_cl; + case GGML_TYPE_Q6_K: + return &dequantize_block_q6_k_cl; + case GGML_TYPE_F16: + return &convert_row_f16_cl; + default: + return 
nullptr; + } +} + +static size_t ggml_cl_global_denom(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 1; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + return 4; + case GGML_TYPE_Q4_K: + return 8; + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return 4; + case GGML_TYPE_F16: + default: + return 1; + } +} + +static size_t ggml_cl_local_size(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 0; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + return 64; + case GGML_TYPE_Q4_K: + return 32; + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return 64; + case GGML_TYPE_F16: + default: + return 0; + } +} + +static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return &dequantize_mul_mat_vec_q4_0_cl; + case GGML_TYPE_Q4_1: + return &dequantize_mul_mat_vec_q4_1_cl; + case GGML_TYPE_Q5_0: + return &dequantize_mul_mat_vec_q5_0_cl; + case GGML_TYPE_Q5_1: + return &dequantize_mul_mat_vec_q5_1_cl; + case GGML_TYPE_Q8_0: + return &dequantize_mul_mat_vec_q8_0_cl; + case GGML_TYPE_F16: + return &convert_mul_mat_vec_f16_cl; + case GGML_TYPE_Q2_K: + return &dequantize_mul_mat_vec_q2_K_cl; + case GGML_TYPE_Q3_K: + return &dequantize_mul_mat_vec_q3_K_cl; + case GGML_TYPE_Q4_K: + return &dequantize_mul_mat_vec_q4_K_cl; + case GGML_TYPE_Q5_K: + return &dequantize_mul_mat_vec_q5_K_cl; + case GGML_TYPE_Q6_K: + return &dequantize_mul_mat_vec_q6_K_cl; + default: + return nullptr; + } +} + +// buffer pool for cl +#define MAX_CL_BUFFERS 256 + +struct scoped_spin_lock { + std::atomic_flag& lock; + scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { + while (lock.test_and_set(std::memory_order_acquire)) { + ; // spin + } + } + ~scoped_spin_lock() { + lock.clear(std::memory_order_release); + } + scoped_spin_lock(const scoped_spin_lock&) = delete; + scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; +}; + +struct cl_buffer { + cl_mem mem; + size_t size = 0; +}; + +static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS]; +static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT; + +static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) { + scoped_spin_lock lock(g_cl_pool_lock); + cl_int err; + + int best_i = -1; + size_t best_size = std::numeric_limits::max(); //smallest unused buffer that fits our needs + int worst_i = -1; + size_t worst_size = 0; //largest unused buffer seen so far + for (int i = 0; i < MAX_CL_BUFFERS; ++i) { + cl_buffer &b = g_cl_buffer_pool[i]; + if (b.size > 0 && b.size >= size && b.size < best_size) + { + best_i = i; + best_size = b.size; + } + if (b.size > 0 && b.size > worst_size) + { + worst_i = i; + worst_size = b.size; + } + } + if(best_i!=-1) //found the smallest buffer that fits our needs + { + cl_buffer& b = g_cl_buffer_pool[best_i]; + cl_mem mem = b.mem; + *actual_size = b.size; + b.size = 0; + return mem; + } + if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory + { + cl_buffer& b = g_cl_buffer_pool[worst_i]; + cl_mem mem = b.mem; + b.size = 0; + clReleaseMemObject(mem); + } + cl_mem mem; + CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err)); + *actual_size = size; + return mem; +} + +static void ggml_cl_pool_free(cl_mem mem, size_t size) { + scoped_spin_lock lock(g_cl_pool_lock); + + for (int i = 0; i < MAX_CL_BUFFERS; ++i) { + 
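+        // Hand the buffer back to the first empty pool slot; if every slot is already
+        // occupied, the buffer is released outright after the warning below.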
cl_buffer& b = g_cl_buffer_pool[i]; + if (b.size == 0) { + b.mem = mem; + b.size = size; + return; + } + } + fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n"); + clReleaseMemObject(mem); +} + +void ggml_cl_free_data(const struct ggml_tensor* tensor) { + if (tensor->backend != GGML_BACKEND_GPU) { + return; + } + + cl_mem mem = (cl_mem)tensor->extra; + clReleaseMemObject(mem); +} + +static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) { + cl_int err; + const uint64_t ne0 = src->ne[0]; + const uint64_t ne1 = src->ne[1]; + const uint64_t nb0 = src->nb[0]; + const uint64_t nb1 = src->nb[1]; + const uint64_t nb2 = src->nb[2]; + const uint64_t nb3 = src->nb[3]; + const enum ggml_type type = src->type; + const size_t ts = ggml_type_size(type); + const size_t bs = ggml_blck_size(type); + + const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3); + if (nb0 == ts && nb1 == ts*ne0/bs) { + err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev); + return err; + } + if (nb0 == ts) { + const size_t buffer_origin[3] = { offset, 0, 0 }; + const size_t host_origin[3] = { 0, 0, 0 }; + const size_t region[3] = { ts*ne0/bs, ne1, 1 }; + err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev); + return err; + } + for (uint64_t i1 = 0; i1 < ne1; i1++) { + // pretend the row is a matrix with cols=1 + const size_t buffer_origin[3] = { offset, i1, 0 }; + const size_t host_origin[3] = { 0, 0, 0 }; + const size_t region[3] = { ts/bs, ne0, 1 }; + err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev); + if (err != CL_SUCCESS) { + break; + } + } + return err; +} + +static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + const int64_t ne0 = ne00 * ne01 * ne02 * ne03; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + const int64_t nb10 = src1->nb[0]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + size_t x_size; + size_t d_size; + + cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0 + cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. 
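+    // Only src0 and dst go through the scratch pool here: src1 is required to be resident on
+    // the device (asserted above), and broadcasting over the ne12/ne13 dimensions is handled
+    // by the i1 index computed from i02 % ne12 and i03 % ne13 inside the loops.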
+ cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst + + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const int i0 = i03*ne02 + i02; + + cl_event ev; + + // copy src0 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev)); + + if (nb10 == sizeof(float)) { + // Contiguous, avoid overhead from queueing many kernel runs + const int64_t i13 = i03%ne13; + const int64_t i12 = i02%ne12; + const int i1 = i13*ne12*ne11 + i12*ne11; + + cl_int x_offset = 0; + cl_int y_offset = i1*ne10; + cl_int d_offset = 0; + + size_t global = ne00 * ne01; + cl_int ky = ne10; + CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); + CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); + } else { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const int64_t i13 = i03%ne13; + const int64_t i12 = i02%ne12; + const int64_t i11 = i01%ne11; + const int i1 = i13*ne12*ne11 + i12*ne11 + i11; + + cl_int x_offset = i01*ne00; + cl_int y_offset = i1*ne10; + cl_int d_offset = i01*ne00; + + // compute + size_t global = ne00; + cl_int ky = ne10; + CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); + CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); + } + } + + CL_CHECK(clReleaseEvent(ev)); + CL_CHECK(clFinish(queue)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL)); + } + } + ggml_cl_pool_free(d_X, x_size); + ggml_cl_pool_free(d_D, d_size); +} + +void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cl_mul_f32(src0, src1, dst); +} + +static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size; + size_t y_size; + size_t d_size; + cl_mem d_X; + if (src0->backend == GGML_BACKEND_GPU) { // NOLINT + d_X = (cl_mem) src0->extra; + } else { + d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); + } + cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); 
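+    // d_Y and the d_D buffer allocated next hold one (i03, i02) slice of src1 and dst;
+    // d_X is only pool-allocated when src0 still lives on the host, and every pooled
+    // buffer is handed back once the i03/i02 loops finish.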
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + // copy data to device + if (src0->backend != GGML_BACKEND_GPU) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); + } + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL)); + + CL_CHECK(clFinish(queue)); + + // compute + cl_event ev_sgemm; + clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, + clblast::Transpose::kYes, clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + } + } + + if (src0->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_X, x_size); + } + ggml_cl_pool_free(d_Y, y_size); + ggml_cl_pool_free(d_D, d_size); +} + +static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) { + GGML_ASSERT(fp16_support); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f); + const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f); + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size; + size_t y_size; + size_t d_size; + cl_mem d_X; + if (src0->backend == GGML_BACKEND_GPU) { // NOLINT + d_X = (cl_mem) src0->extra; + } else { + d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); + } + cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size); + cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size); + + bool src1_cont_rows = nb10 == sizeof(float); + bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + // copy src0 to device + if (src0->backend != GGML_BACKEND_GPU) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); + } + + // convert src1 to fp16 + // TODO: use multiple threads + ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); + char * src1i = (char *) src1->data + i03*nb13 + i02*nb12; + if (src1_cont_rows) { + if (src1_cont_cols) { + ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); + } + } + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + for (int64_t i00 = 0; i00 < ne10; i00++) { + // very slow due to no inlining + tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); + } + } + } + + // copy src1 to device + CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL)); + + CL_CHECK(clFinish(queue)); + + // compute + cl_event ev_sgemm; + clblast::StatusCode status = 
clblast::Gemm(clblast::Layout::kColMajor, + clblast::Transpose::kYes, clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + + // copy dst to host, then convert to float + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); + + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + ggml_fp16_to_fp32_row(tmp, d, d_ne); + } + } + + if (src0->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_X, x_size); + } + ggml_cl_pool_free(d_Y, y_size); + ggml_cl_pool_free(d_D, d_size); +} + +static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + const ggml_type type = src0->type; + const bool mul_mat_vec = ne11 == 1; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type); + + size_t x_size; + size_t y_size; + size_t d_size; + size_t q_size; + cl_mem d_X; + if (!mul_mat_vec) { + d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); + } + cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + cl_mem d_Q; + if (src0->backend == GGML_BACKEND_CPU) { + d_Q = ggml_cl_pool_malloc(q_sz, &q_size); + } + + cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); + cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); + GGML_ASSERT(to_fp32_cl != nullptr); + + const size_t global_denom = ggml_cl_global_denom(type); + const size_t local = ggml_cl_local_size(type); + + size_t ev_idx = 0; + std::vector events; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + // copy src0 to device if necessary + if (src0->backend == GGML_BACKEND_CPU) { + events.emplace_back(); + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); + } else if (src0->backend == GGML_BACKEND_GPU) { + d_Q = (cl_mem) src0->extra; + } else { + GGML_ASSERT(false); + } + if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel + // copy src1 to device + events.emplace_back(); + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++)); + + // compute + const size_t global = ne01 * CL_DMMV_BLOCK_SIZE; + const size_t local = CL_DMMV_BLOCK_SIZE; + const cl_int ncols = ne00; + events.emplace_back(); + CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); + CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); + CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); + } else { // general dequantization kernel + CLBlast matrix matrix multiplication + // convert src0 to fp32 on device + const size_t global = x_ne / global_denom; + CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, 
sizeof(cl_mem), &d_Q)); + CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); + + // copy src1 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL)); + + events.emplace_back(); + + // wait for conversion + CL_CHECK(clFinish(queue)); + + // compute + clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, + clblast::Transpose::kYes, clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, events.data() + ev_idx++); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); + for (auto *event : events) { + clReleaseEvent(event); + } + + ev_idx = 0; + events.clear(); + } + } + + if (!mul_mat_vec) { + ggml_cl_pool_free(d_X, x_size); + } + ggml_cl_pool_free(d_Y, y_size); + ggml_cl_pool_free(d_D, d_size); + if (src0->backend == GGML_BACKEND_CPU) { + ggml_cl_pool_free(d_Q, q_size); + } +} + + +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) { + return true; + } + + return false; +} + +bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { + // If device doesn't support FP16 + if (!fp16_support) { + return false; + } + + size_t src0_sz = ggml_nbytes(src0); + size_t src1_sz = ggml_nbytes(src1); + + // mul_mat_q: src0 is converted to fp32 on device + size_t mul_mat_q_transfer = src0_sz + src1_sz; + + // mul_mat_f16: src1 is converted to fp16 on cpu + size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1); + + // choose the smaller one to transfer to the device + // TODO: this is not always the best choice due to the overhead of converting to fp16 + return mul_mat_f16_transfer < mul_mat_q_transfer; +} + +void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { + GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + + if (src0->type == GGML_TYPE_F32) { + ggml_cl_mul_mat_f32(src0, src1, dst); + } + else if (src0->type == GGML_TYPE_F16) { + if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { + ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); + } + else { + ggml_cl_mul_mat_q_f32(src0, src1, dst); + } + } + else if (ggml_is_quantized(src0->type)) { + ggml_cl_mul_mat_q_f32(src0, src1, dst); + } + else { + GGML_ASSERT(false); + } +} + +size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { + return ggml_nelements(src1) * sizeof(ggml_fp16_t); + } + return 0; +} + +void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { + const int64_t 
ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; + const int64_t ne2 = tensor->ne[2]; + const int64_t ne3 = tensor->ne[3]; + + const ggml_type type = tensor->type; + const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type); + + size_t q_size; + cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size); + + tensor->data = data; + // copy tensor to device + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + int i = i3*ne2 + i2; + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL)); + } + } + + CL_CHECK(clFinish(queue)); + + tensor->extra = dst; + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); +} diff --git a/seamless_communication/ggml/src/ggml-opencl.h b/seamless_communication/ggml/src/ggml-opencl.h new file mode 100644 index 0000000..a92b445 --- /dev/null +++ b/seamless_communication/ggml/src/ggml-opencl.h @@ -0,0 +1,25 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void ggml_cl_init(void); + +void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); + +void * ggml_cl_host_malloc(size_t size); +void ggml_cl_host_free(void * ptr); + +void ggml_cl_free_data(const struct ggml_tensor* tensor); + +void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); + +#ifdef __cplusplus +} +#endif diff --git a/seamless_communication/ggml/src/ggml.c b/seamless_communication/ggml/src/ggml.c new file mode 100644 index 0000000..06ad462 --- /dev/null +++ b/seamless_communication/ggml/src/ggml.c @@ -0,0 +1,21909 @@ +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows + +#include "ggml.h" + +#ifdef GGML_USE_K_QUANTS +#include "k_quants.h" +#endif + +#if defined(_MSC_VER) || defined(__MINGW32__) +#include // using malloc.h with MSC/MINGW +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef GGML_USE_METAL +#include +#endif + + +// static_assert should be a #define, but if it's not, +// fall back to the _Static_assert C11 keyword. 
+// if C99 - static_assert is noop +// ref: https://stackoverflow.com/a/53923785/4039976 +#ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else +#define static_assert(cond, msg) struct global_scope_noop_trick +#endif +#endif + +#if defined(_MSC_VER) +// disable "possible loss of data" to avoid hundreds of casts +// we should just be careful :) +#pragma warning(disable: 4244 4267) + +// disable POSIX deprecation warnigns +// these functions are never going away, anyway +#pragma warning(disable: 4996) +#endif + +#if defined(_WIN32) + +#include + +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; + +static void atomic_store(atomic_int * ptr, LONG val) { + InterlockedExchange(ptr, val); +} +static LONG atomic_load(atomic_int * ptr) { + return InterlockedCompareExchange(ptr, 0, 0); +} +static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} +static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { + return atomic_fetch_add(ptr, -(dec)); +} + +typedef HANDLE pthread_t; + +typedef DWORD thread_ret_t; +static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { + (void) unused; + HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); + if (handle == NULL) + { + return EAGAIN; + } + + *out = handle; + return 0; +} + +static int pthread_join(pthread_t thread, void * unused) { + (void) unused; + return (int) WaitForSingleObject(thread, INFINITE); +} + +static int sched_yield (void) { + Sleep (0); + return 0; +} +#else +#include +#include + +typedef void * thread_ret_t; + +#include +#include +#include + +#endif +#ifdef GGML_USE_CPU_HBM +#include +#endif + +// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 +#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __FMA__ +#define __FMA__ +#endif +#ifndef __F16C__ +#define __F16C__ +#endif +#ifndef __SSE3__ +#define __SSE3__ +#endif +#endif + +/*#define GGML_PERF*/ +#define GGML_DEBUG 0 +// #define GGML_GELU_FP16 +// #define GGML_GELU_QUICK_FP16 +#define GGML_SILU_FP16 +// #define GGML_CROSS_ENTROPY_EXP_FP16 +// #define GGML_FLASH_ATTN_EXP_FP16 + +#define GGML_SOFT_MAX_UNROLL 4 +#define GGML_VEC_DOT_UNROLL 2 + +// +// logging +// + +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) printf(__VA_ARGS__) + +#ifdef GGML_USE_ACCELERATE +// uncomment to use vDSP for soft max computation +// note: not sure if it is actually faster +// #define GGML_SOFT_MAX_ACCELERATE +#endif + +// +// logging +// + +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) 
printf(__VA_ARGS__) + +// +// end of logging block +// + +#if defined(_MSC_VER) || defined(__MINGW32__) +#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) +#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) +#else +inline static void * ggml_aligned_malloc(size_t size) { + if (size == 0) { + GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); + return NULL; + } + void * aligned_memory = NULL; +#ifdef GGML_USE_CPU_HBM + int result = hbw_posix_memalign(&aligned_memory, 16, size); +#elif GGML_USE_METAL + int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size); +#else + int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); +#endif + if (result != 0) { + // Handle allocation failure + const char *error_desc = "unknown allocation error"; + switch (result) { + case EINVAL: + error_desc = "invalid alignment value"; + break; + case ENOMEM: + error_desc = "insufficient memory"; + break; + } + GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); + return NULL; + } + return aligned_memory; +} +#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) +#ifdef GGML_USE_CPU_HBM +#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr) +#else +#define GGML_ALIGNED_FREE(ptr) free(ptr) +#endif +#endif + +#define UNUSED GGML_UNUSED +#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) + +// +// tensor access macros +// + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + +#if defined(GGML_USE_ACCELERATE) +#include +#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions +#include "ggml-opencl.h" +#endif +#elif defined(GGML_USE_OPENBLAS) +#if defined(GGML_BLAS_USE_MKL) +#include +#else +#include +#endif +#elif defined(GGML_USE_CUBLAS) +#include "ggml-cuda.h" +#elif defined(GGML_USE_CLBLAST) +#include "ggml-opencl.h" +#endif + +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + +// floating point type used to accumulate sums +typedef double ggml_float; + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +#ifdef __ARM_NEON + +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include + +#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) +#define GGML_COMPUTE_FP32_TO_FP16(x) (x) + +#define GGML_FP16_TO_FP32(x) ((float) (x)) +#define GGML_FP32_TO_FP16(x) (x) + +#else + +#ifdef __wasm_simd128__ +#include +#else +#ifdef __POWER9_VECTOR__ +#include +#undef bool +#define bool _Bool +#else +#if defined(_MSC_VER) || defined(__MINGW32__) +#include +#else +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) +#if !defined(__riscv) +#include +#endif +#endif +#endif +#endif +#endif + +#ifdef __riscv_v_intrinsic +#include +#endif + +#ifdef __F16C__ + +#ifdef _MSC_VER +#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) +#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) +#else +#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#endif + +#elif defined(__POWER9_VECTOR__) + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +/* the inline asm below is about 12% faster than the lookup method */ +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + register float f; + register double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + register double d; + register ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; +} + +#else + +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 + +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} + +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? 
fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#endif // __F16C__ + +#endif // __ARM_NEON + +// +// global data +// + +// precomputed gelu table for f16 (128 KB) +static ggml_fp16_t table_gelu_f16[1 << 16]; + +// precomputed quick gelu table for f16 (128 KB) +static ggml_fp16_t table_gelu_quick_f16[1 << 16]; + +// precomputed silu table for f16 (128 KB) +static ggml_fp16_t table_silu_f16[1 << 16]; + +// precomputed exp table for f16 (128 KB) +static ggml_fp16_t table_exp_f16[1 << 16]; + +// precomputed f32 table for f16 (256 KB) +static float table_f32_f16[1 << 16]; + +#if defined(__ARM_NEON) || defined(__wasm_simd128__) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. 
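+// usage sketch (illustrative only): the row helpers defined below convert between fp32 and fp16 buffers, e.g.
+//
+//     float src[4] = { 0.5f, -1.0f, 65504.0f, 3.0f };
+//     ggml_fp16_t tmp[4];
+//     float dst[4];
+//     ggml_fp32_to_fp16_row(src, tmp, 4); // fp32 -> fp16 (uses F16C when available)
+//     ggml_fp16_to_fp32_row(tmp, dst, 4); // fp16 -> fp32 (table lookup or native conversion)
+//
+// 65504.0f is the largest finite fp16 value; magnitudes beyond the fp16 range overflow to +/-inf and very
+// small magnitudes round toward zero, so the round trip is lossy for out-of-range inputs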
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) + +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return table_f32_f16[s]; +} + +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +#endif + +// note: do not use these inside ggml.c +// these are meant to be used via the ggml.h API +float ggml_fp16_to_fp32(ggml_fp16_t x) { + return (float) GGML_FP16_TO_FP32(x); +} + +ggml_fp16_t ggml_fp32_to_fp16(float x) { + return GGML_FP32_TO_FP16(x); +} + +void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) { + for (int i = 0; i < n; i++) { + y[i] = GGML_FP16_TO_FP32(x[i]); + } +} + +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) { + int i = 0; +#if defined(__F16C__) + for (; i + 7 < n; i += 8) { + __m256 x_vec = _mm256_loadu_ps(x + i); + __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128((__m128i *)(y + i), y_vec); + } + for(; i + 3 < n; i += 4) { + __m128 x_vec = _mm_loadu_ps(x + i); + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64((__m128i *)(y + i), y_vec); + } +#endif + for (; i < n; i++) { + y[i] = GGML_FP32_TO_FP16(x[i]); + } +} + +// +// timing +// + +#if defined(_MSC_VER) || defined(__MINGW32__) +static int64_t timer_freq, timer_start; +void ggml_time_init(void) { + LARGE_INTEGER t; + QueryPerformanceFrequency(&t); + timer_freq = t.QuadPart; + + // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq + // and the uptime is high enough. + // We subtract the program start time to reduce the likelihood of that happening. + QueryPerformanceCounter(&t); + timer_start = t.QuadPart; +} +int64_t ggml_time_ms(void) { + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + return ((t.QuadPart-timer_start) * 1000) / timer_freq; +} +int64_t ggml_time_us(void) { + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + return ((t.QuadPart-timer_start) * 1000000) / timer_freq; +} +#else +void ggml_time_init(void) {} +int64_t ggml_time_ms(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000; +} + +int64_t ggml_time_us(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000; +} +#endif + +int64_t ggml_cycles(void) { + return clock(); +} + +int64_t ggml_cycles_per_ms(void) { + return CLOCKS_PER_SEC/1000; +} + +#ifdef GGML_PERF +#define ggml_perf_time_ms() ggml_time_ms() +#define ggml_perf_time_us() ggml_time_us() +#define ggml_perf_cycles() ggml_cycles() +#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() +#else +#define ggml_perf_time_ms() 0 +#define ggml_perf_time_us() 0 +#define ggml_perf_cycles() 0 +#define ggml_perf_cycles_per_ms() 0 +#endif + + +// +// cache line +// + +#if defined(__cpp_lib_hardware_interference_size) +#define CACHE_LINE_SIZE hardware_destructive_interference_size +#else +#if defined(__POWER9_VECTOR__) +#define CACHE_LINE_SIZE 128 +#else +#define CACHE_LINE_SIZE 64 +#endif +#endif + +static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); + +// +// quantization +// + +#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i 
x, const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(x, x); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(y, x); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + const __m128i ones = _mm_set1_epi16(1); + return _mm_madd_epi16(ones, dot); +} + +#if __AVX__ || __AVX2__ || __AVX512F__ +// horizontally add 8 floats +static inline float hsum_float_8(const __m256 x) { + __m128 res = _mm256_extractf128_ps(x, 1); + res = _mm_add_ps(res, _mm256_castps256_ps128(x)); + res = _mm_add_ps(res, _mm_movehl_ps(res, res)); + res = _mm_add_ss(res, _mm_movehdup_ps(res)); + return _mm_cvtss_f32(res); +} + +// horizontally add 8 int32_t +static inline int hsum_i32_8(const __m256i a) { + const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); + const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); + const __m128i sum64 = _mm_add_epi32(hi64, sum128); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +// horizontally add 4 int32_t +static inline int hsum_i32_4(const __m128i a) { + const __m128i hi64 = _mm_unpackhi_epi64(a, a); + const __m128i sum64 = _mm_add_epi32(hi64, a); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +#if defined(__AVX2__) || defined(__AVX512F__) +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m256i shuf_mask = _mm256_set_epi64x( + 0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000); + __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); + const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytes = _mm256_or_si256(bytes, bit_mask); + return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8( 0xF ); + return _mm256_and_si256(lowMask, bytes); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + const __m256i ones = _mm256_set1_epi16(1); + const __m256i summed_pairs = _mm256_madd_epi16(ones, x); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { +#if __AVXVNNI__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + return sum_i16_pairs_float(dot); +#endif +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { +#if __AVXVNNIINT8__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(x, x); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(y, x); + return mul_sum_us8_pairs_float(ax, sy); +#endif +} + +static inline __m128i packNibbles( __m256i bytes ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh +#if __AVX512F__ + const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 + bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh + return _mm256_cvtepi16_epi8(bytes); // abcd_efgh +#else + const __m256i lowByte = _mm256_set1_epi16( 0xFF ); + __m256i high = _mm256_andnot_si256( lowByte, bytes ); + __m256i low = _mm256_and_si256( lowByte, bytes ); + high = _mm256_srli_epi16( high, 4 ); + bytes = _mm256_or_si256( low, high ); + + // Compress uint16_t lanes into bytes + __m128i r0 = _mm256_castsi256_si128( bytes ); + __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); + return _mm_packus_epi16( r0, r1 ); +#endif +} +#elif defined(__AVX__) +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); + __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); + __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); + const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytesl = _mm_or_si128(bytesl, bit_mask); + bytesh = _mm_or_si128(bytesh, bit_mask); + bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); + bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); + return MM256_SET_M128I(bytesh, bytesl); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + // Load 16 bytes from memory + __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); + __m128i tmph = _mm_srli_epi16(tmpl, 4); + const __m128i lowMask = _mm_set1_epi8(0xF); + tmpl = _mm_and_si128(lowMask, tmpl); + tmph = _mm_and_si128(lowMask, tmph); + return MM256_SET_M128I(tmph, tmpl); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { + const __m128i ones = _mm_set1_epi16(1); + const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); + const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); + const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + const __m128i axl = _mm256_castsi256_si128(ax); + const __m128i axh = _mm256_extractf128_si256(ax, 1); + const __m128i syl = _mm256_castsi256_si128(sy); + const __m128i syh = _mm256_extractf128_si256(sy, 1); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + const __m128i xl = _mm256_castsi256_si128(x); + const __m128i xh = _mm256_extractf128_si256(x, 1); + const __m128i yl = _mm256_castsi256_si128(y); + const __m128i yh = _mm256_extractf128_si256(y, 1); + // Get absolute values of x vectors + const __m128i axl = _mm_sign_epi8(xl, xl); + const __m128i axh = _mm_sign_epi8(xh, xh); + // Sign the values of the y vectors + const __m128i syl = _mm_sign_epi8(yl, xl); + const __m128i syh = _mm_sign_epi8(yh, xh); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m128i lowByte = _mm_set1_epi16( 0xFF ); + __m128i high = _mm_andnot_si128( lowByte, bytes1 ); + __m128i low = _mm_and_si128( lowByte, bytes1 ); + high = _mm_srli_epi16( high, 4 ); + bytes1 = _mm_or_si128( low, high ); + high = _mm_andnot_si128( lowByte, bytes2 ); + low = _mm_and_si128( lowByte, bytes2 ); + high = _mm_srli_epi16( high, 4 ); + bytes2 = _mm_or_si128( low, high ); + + return _mm_packus_epi16( bytes1, bytes2); +} +#endif +#elif defined(__SSSE3__) +// horizontally add 4x4 floats +static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { + __m128 res_0 =_mm_hadd_ps(a, b); + __m128 res_1 =_mm_hadd_ps(c, d); + __m128 res =_mm_hadd_ps(res_0, res_1); + res =_mm_hadd_ps(res, res); + res =_mm_hadd_ps(res, res); + + return _mm_cvtss_f32(res); +} +#endif // __AVX__ || __AVX2__ || __AVX512F__ +#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) + +#if defined(__ARM_NEON) + +#if !defined(__aarch64__) + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 
2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +#endif +#endif + +#define QK4_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +typedef struct { + ggml_fp16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +typedef struct { + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[QK8_1]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); + +// reference implementation for deterministic creation of model files +static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_0_reference(x, y, k); +} + +static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { + const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_1_reference(x, y, k); +} + +static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -16; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10) >> 4) << (j + 0); + qh |= ((xi1 & 0x10) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(qh)); + } +} + +static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_0_reference(x, y, k); +} + +static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { + const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 5) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10) >> 4) << (j + 0); + qh |= ((xi1 & 0x10) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); + } +} + +static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q5_1_reference(x, y, k); +} + +// reference implementation for deterministic creation of model files +static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = x[i*QK8_0 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = x[i*QK8_0 + j]*id; + + y[i].qs[j] = roundf(x0); + } + } +} + +static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + } + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + } + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = GGML_FP32_TO_FP16(d); + const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, 
ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#else + // scalar + quantize_row_q8_0_reference(x, y, k); +#endif +} + +// reference implementation for deterministic creation of model files +static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { + assert(QK8_1 == 32); + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_1; j++) { + const float v = x[i*QK8_1 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int sum = 0; + + for (int j = 0; j < QK8_1/2; ++j) { + const float v0 = x[i*QK8_1 + j]*id; + const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; + + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_1/2 + j] = roundf(v1); + + sum += y[i].qs[ j]; + sum += y[i].qs[QK8_1/2 + j]; + } + + y[i].s = sum*d; + } +} + +static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int32x4_t accv = vdupq_n_s32(0); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + + accv = vaddq_s32(accv, vi); + } + + y[i].s = d * vaddvq_s32(accv); + } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
1.0f/d : 0.0f; + + y[i].d = d; + + v128_t accv = wasm_i32x4_splat(0); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + + accv = wasm_i32x4_add(accv, vi); + } + + y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) + + wasm_i32x4_extract_lane(accv, 1) + + wasm_i32x4_extract_lane(accv, 2) + + wasm_i32x4_extract_lane(accv, 3)); + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Compute the sum of the quants and set y[i].s + y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = 
_mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); + const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); + y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#else + // scalar + quantize_row_q8_1_reference(x, y, k); +#endif +} + +static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { + static const int qk = QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F); + const int x1 = (x[i].qs[j] >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { + static const int qk = QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { + static const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int x0 = (x[i].qs[j] & 0x0F) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) { + static const int qk = QK8_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + const block_q8_0 * restrict x = vx; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = 
x[i].qs[j]*d; + } + } +} + +static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); +static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); + +static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, + }, + [GGML_TYPE_I16] = { + .type_name = "i16", + .blck_size = 1, + .type_size = sizeof(int16_t), + .is_quantized = false, + }, + [GGML_TYPE_I32] = { + .type_name = "i32", + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, + }, + [GGML_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, + .vec_dot_type = GGML_TYPE_F32, + }, + [GGML_TYPE_F16] = { + .type_name = "f16", + .blck_size = 1, + .type_size = sizeof(ggml_fp16_t), + .is_quantized = false, + .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, + .vec_dot_type = GGML_TYPE_F16, + }, + [GGML_TYPE_Q4_0] = { + .type_name = "q4_0", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float = quantize_row_q4_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, + .vec_dot = ggml_vec_dot_q4_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + [GGML_TYPE_Q4_1] = { + .type_name = "q4_1", + .blck_size = QK4_1, + .type_size = sizeof(block_q4_1), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_1, + .from_float = quantize_row_q4_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, + .vec_dot = ggml_vec_dot_q4_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, + }, + [GGML_TYPE_Q5_0] = { + .type_name = "q5_0", + .blck_size = QK5_0, + .type_size = sizeof(block_q5_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_0, + .from_float = quantize_row_q5_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, + .vec_dot = ggml_vec_dot_q5_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + [GGML_TYPE_Q5_1] = { + .type_name = "q5_1", + .blck_size = QK5_1, + .type_size = sizeof(block_q5_1), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_1, + .from_float = quantize_row_q5_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, + .vec_dot = ggml_vec_dot_q5_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, + }, + [GGML_TYPE_Q8_0] = { + .type_name = "q8_0", + .blck_size = QK8_0, + .type_size = sizeof(block_q8_0), + .is_quantized = true, + .to_float = dequantize_row_q8_0, + .from_float = 
quantize_row_q8_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, + .vec_dot = ggml_vec_dot_q8_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + [GGML_TYPE_Q8_1] = { + .type_name = "q8_1", + .blck_size = QK8_1, + .type_size = sizeof(block_q8_1), + .is_quantized = true, + .from_float = quantize_row_q8_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, + .vec_dot_type = GGML_TYPE_Q8_1, + }, +#ifdef GGML_USE_K_QUANTS + [GGML_TYPE_Q2_K] = { + .type_name = "q2_K", + .blck_size = QK_K, + .type_size = sizeof(block_q2_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_K, + .from_float = quantize_row_q2_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, + .vec_dot = ggml_vec_dot_q2_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q3_K] = { + .type_name = "q3_K", + .blck_size = QK_K, + .type_size = sizeof(block_q3_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q3_K, + .from_float = quantize_row_q3_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, + .vec_dot = ggml_vec_dot_q3_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q4_K] = { + .type_name = "q4_K", + .blck_size = QK_K, + .type_size = sizeof(block_q4_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_K, + .from_float = quantize_row_q4_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, + .vec_dot = ggml_vec_dot_q4_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q5_K] = { + .type_name = "q5_K", + .blck_size = QK_K, + .type_size = sizeof(block_q5_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_K, + .from_float = quantize_row_q5_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, + .vec_dot = ggml_vec_dot_q5_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q6_K] = { + .type_name = "q6_K", + .blck_size = QK_K, + .type_size = sizeof(block_q6_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q6_K, + .from_float = quantize_row_q6_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, + .vec_dot = ggml_vec_dot_q6_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q8_K] = { + .type_name = "q8_K", + .blck_size = QK_K, + .type_size = sizeof(block_q8_K), + .is_quantized = true, + .from_float = quantize_row_q8_K, + } +#endif +}; + +// For internal test use +ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { + GGML_ASSERT(type < GGML_TYPE_COUNT); + return type_traits[type]; +} + + +// +// simd mappings +// + +// we define a common set of C macros which map to specific intrinsics based on the current architecture +// we then implement the fundamental computation operations below using only these macros +// adding support for new architectures requires to define the corresponding SIMD macros +// +// GGML_F32_STEP / GGML_F16_STEP +// number of elements to process in a single step +// +// GGML_F32_EPR / GGML_F16_EPR +// number of elements to fit in a single register +// + +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) + +#define GGML_SIMD + +// F32 NEON + +#define GGML_F32_STEP 16 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 float32x4_t +#define GGML_F32x4_ZERO vdupq_n_f32(0.0f) +#define GGML_F32x4_SET1(x) vdupq_n_f32(x) +#define GGML_F32x4_LOAD vld1q_f32 +#define GGML_F32x4_STORE vst1q_f32 +#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) +#define 
GGML_F32x4_ADD vaddq_f32 +#define GGML_F32x4_MUL vmulq_f32 +#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ + } \ + res = GGML_F32x4_REDUCE_ONE(x[0]); \ +} + +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 NEON + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #define GGML_F16_STEP 32 + #define GGML_F16_EPR 8 + + #define GGML_F16x8 float16x8_t + #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) + #define GGML_F16x8_SET1(x) vdupq_n_f16(x) + #define GGML_F16x8_LOAD vld1q_f16 + #define GGML_F16x8_STORE vst1q_f16 + #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) + #define GGML_F16x8_ADD vaddq_f16 + #define GGML_F16x8_MUL vmulq_f16 + #define GGML_F16x8_REDUCE(res, x) \ + { \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ + } \ + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ + res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + } + + #define GGML_F16_VEC GGML_F16x8 + #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO + #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 + #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i]) + #define GGML_F16_VEC_FMA GGML_F16x8_FMA + #define GGML_F16_VEC_ADD GGML_F16x8_ADD + #define GGML_F16_VEC_MUL GGML_F16x8_MUL + #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE +#else + // if FP16 vector arithmetic is not supported, we use FP32 instead + // and take advantage of the vcvt_ functions to convert to/from FP16 + + #define GGML_F16_STEP 16 + #define GGML_F16_EPR 4 + + #define GGML_F32Cx4 float32x4_t + #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) + #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) + #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) + #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) + #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) + #define GGML_F32Cx4_ADD vaddq_f32 + #define GGML_F32Cx4_MUL vmulq_f32 + #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE + + #define GGML_F16_VEC GGML_F32Cx4 + #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO + #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 + #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) + #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA + #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD + #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL + #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE +#endif + +#elif defined(__AVX__) + +#define GGML_SIMD + +// F32 AVX + +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 8 + +#define GGML_F32x8 __m256 +#define GGML_F32x8_ZERO _mm256_setzero_ps() 
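+// note: GGML_F32_ARR = GGML_F32_STEP / GGML_F32_EPR is the number of registers processed per
+// step: 32 / 8 = 4 __m256 accumulators on this AVX path, 16 / 4 = 4 float32x4_t on NEON above.
+// A minimal sketch of the accumulation pattern that ggml_vec_dot_f32 further down builds from
+// these macros (np is n rounded down to a multiple of GGML_F32_STEP):
+//
+//   GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+//   for (int i = 0; i < np; i += GGML_F32_STEP) {
+//       for (int j = 0; j < GGML_F32_ARR; j++) {
+//           sum[j] = GGML_F32_VEC_FMA(sum[j], GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR),
+//                                             GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR));
+//       }
+//   }
+//   // followed by GGML_F32_VEC_REDUCE and a scalar loop over the leftover elements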
+#define GGML_F32x8_SET1(x) _mm256_set1_ps(x) +#define GGML_F32x8_LOAD _mm256_loadu_ps +#define GGML_F32x8_STORE _mm256_storeu_ps +#if defined(__FMA__) + #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) +#else + #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) +#endif +#define GGML_F32x8_ADD _mm256_add_ps +#define GGML_F32x8_MUL _mm256_mul_ps +#define GGML_F32x8_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ + } \ + const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ + _mm256_extractf128_ps(x[0], 1)); \ + const __m128 t1 = _mm_hadd_ps(t0, t0); \ + res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ +} +// TODO: is this optimal ? + +#define GGML_F32_VEC GGML_F32x8 +#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x8_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD +#define GGML_F32_VEC_STORE GGML_F32x8_STORE +#define GGML_F32_VEC_FMA GGML_F32x8_FMA +#define GGML_F32_VEC_ADD GGML_F32x8_ADD +#define GGML_F32_VEC_MUL GGML_F32x8_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE + +// F16 AVX + +#define GGML_F16_STEP 32 +#define GGML_F16_EPR 8 + +// F16 arithmetic is not supported by AVX, so we use F32 instead + +#define GGML_F32Cx8 __m256 +#define GGML_F32Cx8_ZERO _mm256_setzero_ps() +#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) + +#if defined(__F16C__) +// the _mm256_cvt intrinsics require F16C +#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) +#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) +#else +static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { + float tmp[8]; + + for (int i = 0; i < 8; i++) { + tmp[i] = GGML_FP16_TO_FP32(x[i]); + } + + return _mm256_loadu_ps(tmp); +} +static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { + float arr[8]; + + _mm256_storeu_ps(arr, y); + + for (int i = 0; i < 8; i++) + x[i] = GGML_FP32_TO_FP16(arr[i]); +} +#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) +#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) +#endif + +#define GGML_F32Cx8_FMA GGML_F32x8_FMA +#define GGML_F32Cx8_ADD _mm256_add_ps +#define GGML_F32Cx8_MUL _mm256_mul_ps +#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE + +#define GGML_F16_VEC GGML_F32Cx8 +#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE + +#elif defined(__POWER9_VECTOR__) + +#define GGML_SIMD + +// F32 POWER9 + +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 vector float +#define GGML_F32x4_ZERO 0.0f +#define GGML_F32x4_SET1 vec_splats +#define GGML_F32x4_LOAD(p) vec_xl(0, p) +#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) +#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) +#define GGML_F32x4_ADD vec_add +#define GGML_F32x4_MUL vec_mul +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < 
offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ + } \ + res = vec_extract(x[0], 0) + \ + vec_extract(x[0], 1) + \ + vec_extract(x[0], 2) + \ + vec_extract(x[0], 3); \ +} + +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 POWER9 +#define GGML_F16_STEP GGML_F32_STEP +#define GGML_F16_EPR GGML_F32_EPR +#define GGML_F16_VEC GGML_F32x4 +#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F16_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F16_VEC_FMA GGML_F32x4_FMA +#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE +// Use vec_xl, not vec_ld, in case the load address is not aligned. +#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ + vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \ + vec_extract_fp32_from_shortl(vec_xl(0, p)) +#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] +#define GGML_F16_VEC_STORE(p, r, i) \ + if (i & 0x1) \ + vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \ + r[i - GGML_ENDIAN_BYTE(0)]), \ + 0, p - GGML_F16_EPR) + +#elif defined(__wasm_simd128__) + +#define GGML_SIMD + +// F32 WASM + +#define GGML_F32_STEP 16 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 v128_t +#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x) +#define GGML_F32x4_LOAD wasm_v128_load +#define GGML_F32x4_STORE wasm_v128_store +#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) +#define GGML_F32x4_ADD wasm_f32x4_add +#define GGML_F32x4_MUL wasm_f32x4_mul +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + res = wasm_f32x4_extract_lane(x[0], 0) + \ + wasm_f32x4_extract_lane(x[0], 1) + \ + wasm_f32x4_extract_lane(x[0], 2) + \ + wasm_f32x4_extract_lane(x[0], 3); \ +} + +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 WASM + +#define GGML_F16_STEP 16 +#define GGML_F16_EPR 4 + +inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { + float tmp[4]; + + tmp[0] = GGML_FP16_TO_FP32(p[0]); + tmp[1] = GGML_FP16_TO_FP32(p[1]); + tmp[2] = GGML_FP16_TO_FP32(p[2]); + tmp[3] = GGML_FP16_TO_FP32(p[3]); + + return wasm_v128_load(tmp); +} + +inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { + float tmp[4]; + + wasm_v128_store(tmp, x); + + p[0] = GGML_FP32_TO_FP16(tmp[0]); + p[1] = GGML_FP32_TO_FP16(tmp[1]); + p[2] = GGML_FP32_TO_FP16(tmp[2]); + p[3] = GGML_FP32_TO_FP16(tmp[3]); +} + +#define GGML_F16x4 v128_t +#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x) 
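+// note: there are no native f16 lanes here either, so the F16 macros go through f32:
+// __wasm_f16x4_load()/__wasm_f16x4_store() above widen/narrow four ggml_fp16_t values per
+// vector. A minimal usage sketch (h is a hypothetical fp16 buffer):
+//
+//   ggml_fp16_t h[4];
+//   v128_t v = GGML_F16x4_LOAD(h);                  // four fp16 values -> f32 lanes
+//   v = wasm_f32x4_mul(v, wasm_f32x4_splat(2.0f));  // arithmetic happens in f32
+//   GGML_F16x4_STORE(h, v);                         // narrow back to fp16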
+#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) +#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) +#define GGML_F16x4_FMA GGML_F32x4_FMA +#define GGML_F16x4_ADD wasm_f32x4_add +#define GGML_F16x4_MUL wasm_f32x4_mul +#define GGML_F16x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ + } \ + res = wasm_f32x4_extract_lane(x[0], 0) + \ + wasm_f32x4_extract_lane(x[0], 1) + \ + wasm_f32x4_extract_lane(x[0], 2) + \ + wasm_f32x4_extract_lane(x[0], 3); \ +} + +#define GGML_F16_VEC GGML_F16x4 +#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO +#define GGML_F16_VEC_SET1 GGML_F16x4_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F16x4_FMA +#define GGML_F16_VEC_ADD GGML_F16x4_ADD +#define GGML_F16_VEC_MUL GGML_F16x4_MUL +#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE + +#elif defined(__SSE3__) + +#define GGML_SIMD + +// F32 SSE + +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 __m128 +#define GGML_F32x4_ZERO _mm_setzero_ps() +#define GGML_F32x4_SET1(x) _mm_set1_ps(x) +#define GGML_F32x4_LOAD _mm_loadu_ps +#define GGML_F32x4_STORE _mm_storeu_ps +#if defined(__FMA__) + // TODO: Does this work? + #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) +#else + #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) +#endif +#define GGML_F32x4_ADD _mm_add_ps +#define GGML_F32x4_MUL _mm_mul_ps +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ + } \ + const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ + res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ +} +// TODO: is this optimal ? 
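+// note: with GGML_F32_ARR = 32 / 4 = 8 registers per step on this SSE path, the REDUCE macro
+// above folds the accumulators pairwise (8 -> 4 -> 2 -> 1) and then sums the four lanes of the
+// surviving register with the two _mm_hadd_ps() calls. A scalar sketch of the same folding,
+// with partial[] standing in for the accumulator registers:
+//
+//   float partial[8];
+//   for (int off = 8 >> 1; off > 0; off >>= 1)   // off = 4, 2, 1
+//       for (int i = 0; i < off; ++i)
+//           partial[i] += partial[off + i];
+//   // partial[0] now holds the total (lane summation is folded into the scalars here)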
+ +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE + +// F16 SSE + +#define GGML_F16_STEP 32 +#define GGML_F16_EPR 4 + +static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) { + float tmp[4]; + + tmp[0] = GGML_FP16_TO_FP32(x[0]); + tmp[1] = GGML_FP16_TO_FP32(x[1]); + tmp[2] = GGML_FP16_TO_FP32(x[2]); + tmp[3] = GGML_FP16_TO_FP32(x[3]); + + return _mm_loadu_ps(tmp); +} + +static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) { + float arr[4]; + + _mm_storeu_ps(arr, y); + + x[0] = GGML_FP32_TO_FP16(arr[0]); + x[1] = GGML_FP32_TO_FP16(arr[1]); + x[2] = GGML_FP32_TO_FP16(arr[2]); + x[3] = GGML_FP32_TO_FP16(arr[3]); +} + +#define GGML_F32Cx4 __m128 +#define GGML_F32Cx4_ZERO _mm_setzero_ps() +#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x) +#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) +#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) +#define GGML_F32Cx4_FMA GGML_F32x4_FMA +#define GGML_F32Cx4_ADD _mm_add_ps +#define GGML_F32Cx4_MUL _mm_mul_ps +#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE + +#define GGML_F16_VEC GGML_F32Cx4 +#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE + +#endif + +// GGML_F32_ARR / GGML_F16_ARR +// number of registers to use per step +#ifdef GGML_SIMD +#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) +#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) +#endif + +// +// fundamental operations +// + +inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } +inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } +inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } +inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } +inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } +inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } 
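+// note: the quantized dot products below all share the same per-block math, shown here in the
+// scalar form used by their fallback paths. For q4_0 x q8_0, each block contributes
+//
+//   sumf += GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) * sumi;
+//
+// where sumi is the integer dot product of the unpacked 4-bit values (offset by -8) with the
+// int8 values of the q8_0 block. The *_1 variants (q4_1/q5_1 against q8_1) additionally add
+// GGML_FP16_TO_FP32(x[i].m) * y[i].s per block, which is why quantize_row_q8_1 precomputes
+// y[i].s = d * sum(quants).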
+inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } +inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } + +static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +#if defined(GGML_USE_OPENBLAS) + float sumf = cblas_sdot(n, x, 1, y, 1); +#elif defined(GGML_SIMD) + float sumf = 0.0f; + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + + sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); + } + } + + // reduce sum0..sum3 to sum0 + GGML_F32_VEC_REDUCE(sumf, sum); + + // leftovers + for (int i = np; i < n; ++i) { + sumf += x[i]*y[i]; + } +#else + // scalar + ggml_float sumf = 0.0; + for (int i = 0; i < n; ++i) { + sumf += (ggml_float)(x[i]*y[i]); + } +#endif + + *s = sumf; +} + +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { + ggml_float sumf = 0.0; + +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; + + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + + sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); + } + } + + // reduce sum0..sum3 to sum0 + GGML_F16_VEC_REDUCE(sumf, sum); + + // leftovers + for (int i = np; i < n; ++i) { + sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + } +#else + for (int i = 0; i < n; ++i) { + sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + } +#endif + + *s = sumf; +} + +static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q4_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb + for (int i = 0; i < nb; i += 2) { + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + + // load y + const 
int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
+ const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); + + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx = _mm_and_si128(lowMask, tmp); + __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx, by); + + bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx, by); + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); + + // Apply the scale, and accumulate + acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__SSSE3__) + // set constants + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + // Initialize accumulator with zeros + __m128 acc_0 = _mm_setzero_ps(); + __m128 acc_1 = _mm_setzero_ps(); + __m128 acc_2 = _mm_setzero_ps(); + __m128 acc_3 = _mm_setzero_ps(); + + // First round without accumulation + { + _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + acc_0 = _mm_mul_ps( d_0_1, p0 ); + acc_1 = _mm_mul_ps( d_0_1, p1 ); + acc_2 = _mm_mul_ps( d_2_3, p2 ); + acc_3 = _mm_mul_ps( 
d_2_3, p3 ); + } + + // Main loop + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb + for (int i = 2; i < nb; i+=2) { + _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); + __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); + __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); + __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); + + // Acummulate + acc_0 = _mm_add_ps(p0_d, acc_0); + acc_1 = _mm_add_ps(p1_d, acc_1); + acc_2 = _mm_add_ps(p2_d, acc_2); + acc_3 = _mm_add_ps(p3_d, acc_3); + } + + *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F) - 8; 
+ const int v1 = (x[i].qs[j] >> 4) - 8; + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); + } + + *s = sumf; +#endif +} + +static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q4_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; + + // TODO: add WASM SIMD +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs = 0; + + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb + for (int i = 0; i < nb; i += 2) { + const block_q4_1 * restrict x0 = &x[i + 0]; + const block_q4_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i + 0]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + // Main loop + for (int i = 0; i < nb; ++i) { + const float d0 = GGML_FP16_TO_FP32(x[i].d); + const float d1 = y[i].d; + + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + const __m256 d0v 
= _mm256_set1_ps( d0 ); + const __m256 d1v = _mm256_set1_ps( d1 ); + + // Compute combined scales + const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + const __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); + + const __m256 xy = mul_sum_us8_pairs_float(bx, by); + + // Accumulate d0*d1*x*y +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d0d1, xy, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); +#endif + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F); + const int v1 = (x[i].qs[j] >> 4); + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#endif +} + +static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_0); + + const block_q5_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb + for (int i = 0; i < nb; i += 2) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q5_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + // extract the 5th bit via lookup table ((!b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 
2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q8_0 * restrict y0 = &y[i]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); + const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = 
wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); + bx = _mm256_or_si256(bx, bxhi); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps(d, q, acc); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8((char)0xF0); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_andnot_si128(bxhil, mask); + bxhih = _mm_andnot_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = MM256_SET_M128I(bxh, bxl); + + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for masking and shift operations + uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl); + vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl); + vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl); + + // ((qh & (1u << (j 
+ 0 ))) >> (j + 0 )) << 4; + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl); + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + + // ((qh & (1u << (j + 16))) >> (j + 12)); + vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl); + vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; +#endif +} + +static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_1); + + const block_q5_1 * restrict x = vx; + const block_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs0 = 0.0f; + float summs1 = 0.0f; + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb + for (int i = 0; i < nb; i += 2) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q5_1 * restrict x1 = &x[i + 1]; + const block_q8_1 * restrict y0 = &y[i]; + const block_q8_1 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; + summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; + + // extract the 5th bit via lookup table ((b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; 
+ tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit + const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q8_1 * restrict y0 = &y[i]; + + summs += GGML_FP16_TO_FP32(x0->m) * y0->s; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh 
= wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit + const v128_t v0lf = wasm_v128_or(v0l, qhl); + const v128_t v0hf = wasm_v128_or(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); + + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); + bx = _mm256_or_si256(bx, bxhi); + + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8(0x10); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); + + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_and_si128(bxhil, mask); + bxhih = _mm_and_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = MM256_SET_M128I(bxh, bxl); + + const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for shift operations + uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // 
temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl); + + // load qh + vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl); + + // ((qh >> (j + 0)) << 4) & 0x10; + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl); + + // ((qh >> (j + 12)) ) & 0x10; + vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl); + vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#endif +} + +static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + + const block_q8_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * restrict x0 = &x[i + 0]; + const block_q8_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, 
vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + +#else + const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); + const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); + const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); + const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); + + const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); + const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); + const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); + const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); + + const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); + const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); + const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); + const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); + __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + // Multiply q with scale and accumulate +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d, q, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); +#endif + } + + *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + size_t vl = __riscv_vsetvl_e8m1(qk); + + for (int i = 0; i < nb; i++) { + // load elements + vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[i].qs[j]*y[i].qs[j]; + } + + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; +#endif +} + +// compute GGML_VEC_DOT_UNROLL dot products at once +// xs - x row stride in bytes +inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { + ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; + + ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; + + for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { + x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); + } + +#if defined(GGML_SIMD) + const int np = (n & 
~(GGML_F16_STEP - 1)); + + GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; + + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); + + sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); + } + } + } + + // reduce sum0..sum3 to sum0 + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + GGML_F16_VEC_REDUCE(sumf[k], sum[k]); + } + + // leftovers + for (int i = np; i < n; ++i) { + for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + } + } +#else + for (int i = 0; i < n; ++i) { + for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + } + } +#endif + + for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { + s[i] = sumf[i]; + } +} + +inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] += x[i]*v; + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] += x[i]*v; + } +#endif +} + +//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } +inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { +#if defined(GGML_USE_ACCELERATE) + vDSP_vsmul(y, 1, &v, y, 1, n); +#elif defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_MUL(ay[j], vx); + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] *= v; + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] *= v; + } +#endif +} + +inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } +inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } +inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } +inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } +inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } +inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? 
-1.f : 0.f); } +inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } +inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } +inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } +inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } + +static const float GELU_COEF_A = 0.044715f; +static const float GELU_QUICK_COEF = -1.702f; +static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + +inline static float ggml_gelu_f32(float x) { + return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); +} + +inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { + const uint16_t * i16 = (const uint16_t *) x; + for (int i = 0; i < n; ++i) { + y[i] = table_gelu_f16[i16[i]]; + } +} + +#ifdef GGML_GELU_FP16 +inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]); + } +} +#else +inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_gelu_f32(x[i]); + } +} +#endif + +inline static float ggml_gelu_quick_f32(float x) { + return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); +} + +//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { +// const uint16_t * i16 = (const uint16_t *) x; +// for (int i = 0; i < n; ++i) { +// y[i] = table_gelu_quick_f16[i16[i]]; +// } +//} + +#ifdef GGML_GELU_QUICK_FP16 +inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]); + } +} +#else +inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_gelu_quick_f32(x[i]); + } +} +#endif + +// Sigmoid Linear Unit (SiLU) function +inline static float ggml_silu_f32(float x) { + return x/(1.0f + expf(-x)); +} + +//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { +// const uint16_t * i16 = (const uint16_t *) x; +// for (int i = 0; i < n; ++i) { +// y[i] = table_silu_f16[i16[i]]; +// } +//} + +#ifdef GGML_SILU_FP16 +inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(table_silu_f16[t]); + } +} +#else +inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_silu_f32(x[i]); + } +} +#endif + +inline static float ggml_silu_backward_f32(float x, float dy) { + const float s = 1.0f/(1.0f + expf(-x)); + return dy*s*(1.0f + x*(1.0f - s)); +} + +#ifdef GGML_SILU_FP16 +inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { + for (int i = 0; i < n; ++i) { + // we did not use x[i] to compute forward silu 
but its f16 equivalent + // take derivative at f16 of x[i]: + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + float usedx = GGML_FP16_TO_FP32(fp16); + dx[i] = ggml_silu_backward_f32(usedx, dy[i]); + } +} +#else +inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { + for (int i = 0; i < n; ++i) { + dx[i] = ggml_silu_backward_f32(x[i], dy[i]); + } +} +#endif + +inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { +#ifndef GGML_USE_ACCELERATE + ggml_float sum = 0.0; + for (int i = 0; i < n; ++i) { + sum += (ggml_float)x[i]; + } + *s = sum; +#else + vDSP_sve(x, 1, s, n); +#endif +} + +inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) { + ggml_float sum = 0.0; + for (int i = 0; i < n; ++i) { + sum += (ggml_float)x[i]; + } + *s = sum; +} + +inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) { + float sum = 0.0f; + for (int i = 0; i < n; ++i) { + sum += GGML_FP16_TO_FP32(x[i]); + } + *s = sum; +} + +inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { +#ifndef GGML_USE_ACCELERATE + float max = -INFINITY; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + } + *s = max; +#else + vDSP_maxv(x, 1, s, n); +#endif +} + +inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { + ggml_vec_norm_f32(n, s, x); + *s = 1.f/(*s); +} + +inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { + float max = -INFINITY; + int idx = 0; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + if (max == x[i]) { idx = i; } + } + *s = idx; +} + +// +// data types +// + +static const char * GGML_OP_NAME[GGML_OP_COUNT] = { + "NONE", + + "DUP", + "ADD", + "ADD1", + "ACC", + "SUB", + "MUL", + "DIV", + "SQR", + "SQRT", + "LOG", + "SUM", + "SUM_ROWS", + "MEAN", + "ARGMAX", + "REPEAT", + "REPEAT_BACK", + "CONCAT", + "SILU_BACK", + "NORM", + "BATCH_NORM", + "RMS_NORM", + "RMS_NORM_BACK", + "GROUP_NORM", + + "MUL_MAT", + "OUT_PROD", + + "SCALE", + "SET", + "CPY", + "CONT", + "RESHAPE", + "VIEW", + "PERMUTE", + "TRANSPOSE", + "GET_ROWS", + "GET_ROWS_BACK", + "DIAG", + "DIAG_MASK_INF", + "DIAG_MASK_ZERO", + "SOFT_MAX", + "SOFT_MAX_BACK", + "ROPE", + "ROPE_BACK", + "ALIBI", + "CLAMP", + "CONV_1D", + "CONV_1D_GENERIC", + "CONV_2D", + "CONV_TRANSPOSE_2D", + "POOL_1D", + "POOL_2D", + "UPSCALE", + + "CONV_1D_STAGE_0", + "CONV_1D_STAGE_1", + "CONV_1D_STAGE_2", + + "CONV_1D_GENERIC_STAGE_0", + "CONV_1D_GENERIC_STAGE_1", + + "FLASH_ATTN", + "FLASH_FF", + "FLASH_ATTN_BACK", + "WIN_PART", + "WIN_UNPART", + "GET_REL_POS", + "ADD_REL_POS", + + "UNARY", + + "MAP_UNARY", + "MAP_BINARY", + + "MAP_CUSTOM1_F32", + "MAP_CUSTOM2_F32", + "MAP_CUSTOM3_F32", + + "MAP_CUSTOM1", + "MAP_CUSTOM2", + "MAP_CUSTOM3", + + "CROSS_ENTROPY_LOSS", + "CROSS_ENTROPY_LOSS_BACK", +}; + +static_assert(GGML_OP_COUNT == 75, "GGML_OP_COUNT != 75"); + +static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { + "none", + + "x", + "x+y", + "x+y", + "view(x,nb,offset)+=y->x", + "x-y", + "x*y", + "x/y", + "x^2", + "√x", + "log(x)", + "Σx", + "Σx_k", + "Σx/n", + "argmax(x)", + "repeat(x)", + "repeat_back(x)", + "concat(x, y)", + "silu_back(x)", + "norm(x)", + "batch_norm(x)", + "rms_norm(x)", + "rms_norm_back(x)", + "group_norm(x)", + + "X*Y", + "X*Y", + + "x*v", + "y-\\>view(x)", + "x-\\>y", + "cont(x)", + "reshape(x)", + "view(x)", + "permute(x)", + "transpose(x)", + "get_rows(x)", + "get_rows_back(x)", + "diag(x)", + "diag_mask_inf(x)", + 
"diag_mask_zero(x)", + "soft_max(x)", + "soft_max_back(x)", + "rope(x)", + "rope_back(x)", + "alibi(x)", + "clamp(x)", + "conv_1d(x)", + "conv_1d_generic(x)", + "conv_2d(x)", + "conv_transpose_2d(x)", + "pool_1d(x)", + "pool_2d(x)", + "upscale(x)", + "conv_1d_stage_0(x)", + "conv_1d_stage_1(x)", + "conv_1d_stage_2(x)", + "conv_1d_generic_stage_0(x)", + "conv_1d_generic_stage_1(x)", + + "flash_attn(x)", + "flash_ff(x)", + "flash_attn_back(x)", + "win_part(x)", + "win_unpart(x)", + "get_rel_pos(x)", + "add_rel_pos(x)", + + "unary(x)", + + "f(x)", + "f(x,y)", + + "custom_f32(x)", + "custom_f32(x,y)", + "custom_f32(x,y,z)", + + "custom(x)", + "custom(x,y)", + "custom(x,y,z)", + + "cross_entropy_loss(x,y)", + "cross_entropy_loss_back(x,y)", +}; + +static_assert(GGML_OP_COUNT == 75, "GGML_OP_COUNT != 75"); + +static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); + +static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); +static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); + +// WARN: +// Mis-confguration can lead to problem that's hard to reason about: +// * At best it crash or talks nosense. +// * At worst it talks slightly difference but hard to perceive. +// +// An op has to enable INIT or FINALIZE when any of it's branch needs that pass. +// Take care about compile options (e.g., GGML_USE_xxx). +static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 }; +static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 }; + +static void ggml_setup_op_has_task_pass(void) { + { // INIT + bool * p = GGML_OP_HAS_INIT; + + p[GGML_OP_ACC ] = true; + p[GGML_OP_MUL_MAT ] = true; + p[GGML_OP_OUT_PROD ] = true; + p[GGML_OP_SET ] = true; + p[GGML_OP_GET_ROWS_BACK ] = true; + p[GGML_OP_DIAG_MASK_INF ] = true; + p[GGML_OP_DIAG_MASK_ZERO ] = true; + p[GGML_OP_CONV_1D ] = true; + p[GGML_OP_CONV_1D_STAGE_0 ] = true; + p[GGML_OP_CONV_1D_STAGE_1 ] = true; + p[GGML_OP_CONV_1D_STAGE_2 ] = true; + p[GGML_OP_CONV_1D_GENERIC ] = true; + p[GGML_OP_CONV_1D_GENERIC_STAGE_0 ] = true; + p[GGML_OP_CONV_1D_GENERIC_STAGE_1 ] = true; + p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_CONV_TRANSPOSE_2D ] = true; + p[GGML_OP_FLASH_ATTN_BACK ] = true; + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + p[GGML_OP_ADD_REL_POS ] = true; + } + + { // FINALIZE + bool * p = GGML_OP_HAS_FINALIZE; + + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } +} + +// +// ggml context +// + +struct ggml_context { + int64_t mem_size; + void * mem_buffer; + bool mem_buffer_owned; + bool no_alloc; + bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers + + int n_objects; + + struct ggml_object * objects_begin; + struct ggml_object * objects_end; + + struct ggml_scratch scratch; + struct ggml_scratch scratch_save; +}; + +struct ggml_context_container { + bool used; + + struct ggml_context context; +}; + +// +// NUMA support +// + +#define GGML_NUMA_MAX_NODES 8 +#define GGML_NUMA_MAX_CPUS 512 + +struct ggml_numa_node { + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node + uint32_t n_cpus; +}; + +struct ggml_numa_nodes { + struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; + uint32_t n_nodes; + uint32_t total_cpus; // hardware threads on system +}; + +// +// ggml state +// + +struct ggml_state { + struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; + struct ggml_numa_nodes numa; +}; + +// global state +static struct ggml_state g_state; +static atomic_int g_state_barrier = 0; + +// barrier 
via spin lock +inline static void ggml_critical_section_start(void) { + int processing = atomic_fetch_add(&g_state_barrier, 1); + + while (processing > 0) { + // wait for other threads to finish + atomic_fetch_sub(&g_state_barrier, 1); + sched_yield(); // TODO: reconsider this + processing = atomic_fetch_add(&g_state_barrier, 1); + } +} + +// TODO: make this somehow automatically executed +// some sort of "sentry" mechanism +inline static void ggml_critical_section_end(void) { + atomic_fetch_sub(&g_state_barrier, 1); +} + +void ggml_numa_init(void) { + if (g_state.numa.n_nodes > 0) { + fprintf(stderr, "ggml_numa_init: NUMA already initialized\n"); + + return; + } + +#ifdef __linux__ + struct stat st; + char path[256]; + int rv; + + // enumerate nodes + while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state.numa.n_nodes; + } + + // enumerate CPUs + while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state.numa.total_cpus; + } + + GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); + + if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) { + g_state.numa.n_nodes = 0; + return; + } + + for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { + struct ggml_numa_node * node = &g_state.numa.nodes[n]; + GGML_PRINT_DEBUG("CPUs on node %u:", n); + node->n_cpus = 0; + for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) == 0) { + node->cpus[node->n_cpus++] = c; + GGML_PRINT_DEBUG(" %u", c); + } + } + GGML_PRINT_DEBUG("\n"); + } + + if (ggml_is_numa()) { + FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); + if (fptr != NULL) { + char buf[42]; + if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { + GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); + } + fclose(fptr); + } + } +#else + // TODO +#endif +} + +bool ggml_is_numa(void) { + return g_state.numa.n_nodes > 1; +} + +//////////////////////////////////////////////////////////////////////////////// + +void ggml_print_object(const struct ggml_object * obj) { + GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n", + obj->type, obj->offs, obj->size, (const void *) obj->next); +} + +void ggml_print_objects(const struct ggml_context * ctx) { + struct ggml_object * obj = ctx->objects_begin; + + GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); + + while (obj != NULL) { + ggml_print_object(obj); + obj = obj->next; + } + + GGML_PRINT("%s: --- end ---\n", __func__); +} + +int64_t ggml_nelements(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; +} + +int64_t ggml_nrows(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; +} + +size_t ggml_nbytes(const 
struct ggml_tensor * tensor) { + size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type); + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + return nbytes; +} + +size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { + return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); +} + +size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); +} + +int ggml_blck_size(enum ggml_type type) { + return type_traits[type].blck_size; +} + +size_t ggml_type_size(enum ggml_type type) { + return type_traits[type].type_size; +} + +float ggml_type_sizef(enum ggml_type type) { + return ((float)(type_traits[type].type_size))/type_traits[type].blck_size; +} + +const char * ggml_type_name(enum ggml_type type) { + return type_traits[type].type_name; +} + +bool ggml_is_quantized(enum ggml_type type) { + return type_traits[type].is_quantized; +} + +const char * ggml_op_name(enum ggml_op op) { + return GGML_OP_NAME[op]; +} + +const char * ggml_op_symbol(enum ggml_op op) { + return GGML_OP_SYMBOL[op]; +} + +size_t ggml_element_size(const struct ggml_tensor * tensor) { + return ggml_type_size(tensor->type); +} + +static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +static inline bool ggml_is_vector(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return (t0->ne[0] == t1->ne[0]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); +} + +static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + (t0->ne[1] == t1->ne[1]) && + (t0->ne[2] == t1->ne[2]) && + (t0->ne[3] == t1->ne[3]); +} + +enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { + enum ggml_type wtype = GGML_TYPE_COUNT; + + switch (ftype) { + case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break; + case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; + case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break; + case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break; + case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break; + case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break; + case GGML_FTYPE_MOSTLY_Q6_K: wtype = 
GGML_TYPE_Q6_K; break; + case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; + } + + GGML_ASSERT(wtype != GGML_TYPE_COUNT); + + return wtype; +} + +size_t ggml_tensor_overhead(void) { + return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE; +} + +bool ggml_is_transposed(const struct ggml_tensor * tensor) { + return tensor->nb[0] > tensor->nb[1]; +} + +bool ggml_is_contiguous(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +bool ggml_is_permuted(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; +} + +static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + (t0->ne[0] == t1->ne[0] ) && + (t0->ne[1] == t1->ne[1] ) && + (t0->ne[2] == t1->ne[2] ) && + (t0->ne[3] == t1->ne[3] ); +} + +// check if t1 can be represented as a repetition of t0 +static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + (t1->ne[0]%t0->ne[0] == 0) && + (t1->ne[1]%t0->ne[1] == 0) && + (t1->ne[2]%t0->ne[2] == 0) && + (t1->ne[3]%t0->ne[3] == 0); +} + +static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1); +} + +static inline int ggml_up32(int n) { + return (n + 31) & ~31; +} + +//static inline int ggml_up64(int n) { +// return (n + 63) & ~63; +//} + +static inline int ggml_up(int n, int m) { + // assert m is a power of 2 + GGML_ASSERT((m & (m - 1)) == 0); + return (n + m - 1) & ~(m - 1); +} + +// assert that pointer is aligned to GGML_MEM_ALIGN +#define ggml_assert_aligned(ptr) \ + GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0) + +//////////////////////////////////////////////////////////////////////////////// + +struct ggml_context * ggml_init(struct ggml_init_params params) { + // make this function thread-safe + ggml_critical_section_start(); + + static bool is_first_call = true; + + if (is_first_call) { + // initialize time system (required on Windows) + ggml_time_init(); + + // initialize GELU, Quick GELU, SILU and EXP F32 tables + { + 
const uint64_t t_start = ggml_time_us(); UNUSED(t_start); + + ggml_fp16_t ii; + for (int i = 0; i < (1 << 16); ++i) { + uint16_t ui = i; + memcpy(&ii, &ui, sizeof(ii)); + const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii); + table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); + table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); + table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); + table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); + } + + const uint64_t t_end = ggml_time_us(); UNUSED(t_end); + + GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); + } + + // initialize g_state + { + const uint64_t t_start = ggml_time_us(); UNUSED(t_start); + + g_state = (struct ggml_state) { + /*.contexts =*/ { { 0 } }, + /*.numa =*/ { + .n_nodes = 0, + .total_cpus = 0, + }, + }; + + for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) { + g_state.contexts[i].used = false; + } + + const uint64_t t_end = ggml_time_us(); UNUSED(t_end); + + GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); + } + +#if defined(GGML_USE_CUBLAS) + ggml_init_cublas(); +#elif defined(GGML_USE_CLBLAST) + ggml_cl_init(); +#endif + + ggml_setup_op_has_task_pass(); + + is_first_call = false; + } + + // find non-used context in g_state + struct ggml_context * ctx = NULL; + + for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { + if (!g_state.contexts[i].used) { + g_state.contexts[i].used = true; + ctx = &g_state.contexts[i].context; + + GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); + break; + } + } + + if (ctx == NULL) { + GGML_PRINT_DEBUG("%s: no unused context found\n", __func__); + + ggml_critical_section_end(); + + return NULL; + } + + // allow to call ggml_init with 0 size + if (params.mem_size == 0) { + params.mem_size = GGML_MEM_ALIGN; + } + + const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN); + + *ctx = (struct ggml_context) { + /*.mem_size =*/ mem_size, + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size), + /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, + /*.no_alloc =*/ params.no_alloc, + /*.no_alloc_save =*/ params.no_alloc, + /*.n_objects =*/ 0, + /*.objects_begin =*/ NULL, + /*.objects_end =*/ NULL, + /*.scratch =*/ { 0, 0, NULL, }, + /*.scratch_save =*/ { 0, 0, NULL, }, + }; + + GGML_ASSERT(ctx->mem_buffer != NULL); + + ggml_assert_aligned(ctx->mem_buffer); + + GGML_PRINT_DEBUG("%s: context initialized\n", __func__); + + ggml_critical_section_end(); + + return ctx; +} + +void ggml_free(struct ggml_context * ctx) { + // make this function thread safe + ggml_critical_section_start(); + + bool found = false; + + for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { + if (&g_state.contexts[i].context == ctx) { + g_state.contexts[i].used = false; + + GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n", + __func__, i, ggml_used_mem(ctx)); + + if (ctx->mem_buffer_owned) { + GGML_ALIGNED_FREE(ctx->mem_buffer); + } + + found = true; + break; + } + } + + if (!found) { + GGML_PRINT_DEBUG("%s: context not found\n", __func__); + } + + ggml_critical_section_end(); +} + +size_t ggml_used_mem(const struct ggml_context * ctx) { + return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size; +} + +size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { + const size_t result = ctx->scratch.data ? 
ctx->scratch.offs : 0; + + ctx->scratch = scratch; + + return result; +} + +bool ggml_get_no_alloc(struct ggml_context * ctx) { + return ctx->no_alloc; +} + +void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { + ctx->no_alloc = no_alloc; +} + +void * ggml_get_mem_buffer(const struct ggml_context * ctx) { + return ctx->mem_buffer; +} + +int64_t ggml_get_mem_size(const struct ggml_context * ctx) { + return ctx->mem_size; +} + +size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { + size_t max_size = 0; + + struct ggml_object * obj = ctx->objects_begin; + + while (obj != NULL) { + if (obj->type == GGML_OBJECT_TENSOR) { + struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); + + const size_t size = ggml_nbytes(tensor); + + if (max_size < size) { + max_size = size; + } + } + + obj = obj->next; + } + + return max_size; +} + +// IMPORTANT: +// when creating "opt" tensors, always save and load the scratch buffer +// this is an error prone process, but it is necessary to support inplace +// operators when using scratch buffers +// TODO: implement a better way +static void ggml_scratch_save(struct ggml_context * ctx) { + // this is needed to allow opt tensors to store their data + // TODO: again, need to find a better way + ctx->no_alloc_save = ctx->no_alloc; + ctx->no_alloc = false; + + ctx->scratch_save = ctx->scratch; + ctx->scratch.data = NULL; +} + +static void ggml_scratch_load(struct ggml_context * ctx) { + ctx->no_alloc = ctx->no_alloc_save; + + ctx->scratch = ctx->scratch_save; +} + +//////////////////////////////////////////////////////////////////////////////// + +static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) { + // always insert objects at the end of the context's memory pool + struct ggml_object * obj_cur = ctx->objects_end; + + const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs; + const size_t cur_size = obj_cur == NULL ? 
0 : obj_cur->size; + const size_t cur_end = cur_offs + cur_size; + + // align to GGML_MEM_ALIGN + size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN); + + char * const mem_buffer = ctx->mem_buffer; + struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); + + if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { + GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", + __func__, cur_end + size_needed, ctx->mem_size); + assert(false); + return NULL; + } + + *obj_new = (struct ggml_object) { + .offs = cur_end + GGML_OBJECT_SIZE, + .size = size_needed, + .next = NULL, + .type = type, + }; + + ggml_assert_aligned(mem_buffer + obj_new->offs); + + if (obj_cur != NULL) { + obj_cur->next = obj_new; + } else { + // this is the first object in this context + ctx->objects_begin = obj_new; + } + + ctx->objects_end = obj_new; + + //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size); + + return obj_new; +} + +static struct ggml_tensor * ggml_new_tensor_impl( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t * ne, + struct ggml_tensor * view_src, + size_t view_offs) { + + assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); + + // find the base tensor and absolute offset + if (view_src != NULL && view_src->view_src != NULL) { + view_offs += view_src->view_offs; + view_src = view_src->view_src; + } + + size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type)); + for (int i = 1; i < n_dims; i++) { + data_size *= ne[i]; + } + + GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src)); + + void * data = view_src != NULL ? view_src->data : NULL; + if (data != NULL) { + data = (char *) data + view_offs; + } + + size_t obj_alloc_size = 0; + + if (view_src == NULL && !ctx->no_alloc) { + if (ctx->scratch.data != NULL) { + // allocate tensor data in the scratch buffer + if (ctx->scratch.offs + data_size > ctx->scratch.size) { + GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", + __func__, ctx->scratch.offs + data_size, ctx->scratch.size); + assert(false); + return NULL; + } + + data = (char * const) ctx->scratch.data + ctx->scratch.offs; + + ctx->scratch.offs += data_size; + } else { + // allocate tensor data in the context's memory pool + obj_alloc_size = data_size; + } + } + + struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); + + // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here + + struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs); + + *result = (struct ggml_tensor) { + /*.type =*/ type, + /*.backend =*/ GGML_BACKEND_CPU, + /*.n_dims =*/ n_dims, + /*.ne =*/ { 1, 1, 1, 1 }, + /*.nb =*/ { 0, 0, 0, 0 }, + /*.op =*/ GGML_OP_NONE, + /*.op_params =*/ { 0 }, + /*.is_param =*/ false, + /*.grad =*/ NULL, + /*.src =*/ { NULL }, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + /*.view_src =*/ view_src, + /*.view_offs =*/ view_offs, + /*.data =*/ obj_alloc_size > 0 ? 
(void *)(result + 1) : data, + /*.name =*/ { 0 }, + /*.extra =*/ NULL, + /*.padding =*/ { 0 }, + }; + + // TODO: this should not be needed as long as we don't rely on aligned SIMD loads + //ggml_assert_aligned(result->data); + + for (int i = 0; i < n_dims; i++) { + result->ne[i] = ne[i]; + } + + result->nb[0] = ggml_type_size(type); + result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type)); + for (int i = 2; i < GGML_MAX_DIMS; i++) { + result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; + } + + ctx->n_objects++; + + return result; +} + +struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t * ne) { + return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0); +} + +struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0) { + return ggml_new_tensor(ctx, type, 1, &ne0); +} + +struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1) { + const int64_t ne[2] = { ne0, ne1 }; + return ggml_new_tensor(ctx, type, 2, ne); +} + +struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2) { + const int64_t ne[3] = { ne0, ne1, ne2 }; + return ggml_new_tensor(ctx, type, 3, ne); +} + +struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; + return ggml_new_tensor(ctx, type, 4, ne); +} + +struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { + ggml_scratch_save(ctx); + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); + + ggml_scratch_load(ctx); + + ggml_set_i32(result, value); + + return result; +} + +struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { + ggml_scratch_save(ctx); + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + + ggml_scratch_load(ctx); + + ggml_set_f32(result, value); + + return result; +} + +struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { + return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne); +} + +static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings + assert(params_size <= GGML_MAX_OP_PARAMS); + memcpy(tensor->op_params, params, params_size); +} + +static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + return ((const int32_t *)(tensor->op_params))[i]; +} + +static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + ((int32_t *)(tensor->op_params))[i] = value; +} + +struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { + memset(tensor->data, 0, ggml_nbytes(tensor)); + return tensor; +} + +struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { + const int n = ggml_nrows(tensor); + const int nc = tensor->ne[0]; + const size_t n1 = tensor->nb[1]; + + char * const data = tensor->data; + + switch (tensor->type) { + case GGML_TYPE_I8: + { + assert(tensor->nb[0] == sizeof(int8_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + } + } break; + case 
GGML_TYPE_I16: + { + assert(tensor->nb[0] == sizeof(int16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I32: + { + assert(tensor->nb[0] == sizeof(int32_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_F16: + { + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + } + } break; + case GGML_TYPE_F32: + { + assert(tensor->nb[0] == sizeof(float)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + } + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + return tensor; +} + +struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { + const int n = ggml_nrows(tensor); + const int nc = tensor->ne[0]; + const size_t n1 = tensor->nb[1]; + + char * const data = tensor->data; + + switch (tensor->type) { + case GGML_TYPE_I8: + { + assert(tensor->nb[0] == sizeof(int8_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I16: + { + assert(tensor->nb[0] == sizeof(int16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I32: + { + assert(tensor->nb[0] == sizeof(int32_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_F16: + { + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + } + } break; + case GGML_TYPE_F32: + { + assert(tensor->nb[0] == sizeof(float)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + } + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + return tensor; +} + +int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + return ((int8_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + return ((int16_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + return ((int32_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + return ((float *)(tensor->data))[i]; + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + return 0.0f; +} + +void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + ((int8_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + ((int16_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + ((int32_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + ((float 
*)(tensor->data))[i] = value; + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + return ((int8_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + return ((int16_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + return ((int32_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + return ((float *)(tensor->data))[i]; + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + return 0.0f; +} + +void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + ((int8_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + ((int16_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + ((int32_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + ((float *)(tensor->data))[i] = value; + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +void * ggml_get_data(const struct ggml_tensor * tensor) { + return tensor->data; +} + +float * ggml_get_data_f32(const struct ggml_tensor * tensor) { + assert(tensor->type == GGML_TYPE_F32); + return (float *)(tensor->data); +} + +enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { + GGML_ASSERT(tensor->op == GGML_OP_UNARY); + return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); +} + +const char * ggml_get_name(const struct ggml_tensor * tensor) { + return tensor->name; +} + +struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) { + strncpy(tensor->name, name, sizeof(tensor->name)); + tensor->name[sizeof(tensor->name) - 1] = '\0'; + return tensor; +} + +struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) 
{ + va_list args; + va_start(args, fmt); + vsnprintf(tensor->name, sizeof(tensor->name), fmt, args); + va_end(args); + return tensor; +} + +struct ggml_tensor * ggml_view_tensor( + struct ggml_context * ctx, + struct ggml_tensor * src) { + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0); + ggml_format_name(result, "%s (view)", src->name); + + for (int i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = src->nb[i]; + } + + return result; +} + +struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) { + struct ggml_object * obj = ctx->objects_begin; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + if (obj->type == GGML_OBJECT_TENSOR) { + struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); + if (strcmp(cur->name, name) == 0) { + return cur; + } + } + + obj = obj->next; + } + + return NULL; +} + +//////////////////////////////////////////////////////////////////////////////// + +// ggml_dup + +static struct ggml_tensor * ggml_dup_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_DUP; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_dup_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_dup_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_dup_impl(ctx, a, true); +} + +// ggml_add + +static struct ggml_tensor * ggml_add_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ADD; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add_impl(ctx, a, b, true); +} + +// ggml_add1 + +static struct ggml_tensor * ggml_add1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_is_scalar(b)); + GGML_ASSERT(ggml_is_padded_1d(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ADD1; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add1_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add1_impl(ctx, a, b, true); +} + +// ggml_acc + +static struct ggml_tensor * ggml_acc_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset, + bool inplace) { + GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a)); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(b->type == GGML_TYPE_F32); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_ACC; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); +} + +struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true); +} + +// ggml_sub + +static struct ggml_tensor * ggml_sub_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SUB; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_sub_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_sub_impl(ctx, a, b, true); +} + +// ggml_mul + +static struct ggml_tensor * ggml_mul_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); + is_node = true; + } + + if (inplace) { + GGML_ASSERT(!is_node); + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_MUL; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_mul_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_mul_impl(ctx, a, b, true); +} + +// ggml_div + +static struct ggml_tensor * ggml_div_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + if (inplace) { + GGML_ASSERT(!is_node); + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_DIV; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_div_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_div_impl(ctx, a, b, true); +} + +// ggml_sqr + +static struct ggml_tensor * ggml_sqr_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SQR; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqr_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqr_impl(ctx, a, true); +} + +// ggml_sqrt + +static struct ggml_tensor * ggml_sqrt_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SQRT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqrt_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqrt_impl(ctx, a, true); +} + + +// ggml_log + +static struct ggml_tensor * ggml_log_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_LOG; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_log_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_log_impl(ctx, a, true); +} + +// ggml_sum + +struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); + + result->op = GGML_OP_SUM; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + + +// ggml_sum_rows + +struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + int64_t ne[4] = {1,1,1,1}; + for (int i = 1; i < a->n_dims; ++i) { + ne[i] = a->ne[i]; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne); + + result->op = GGML_OP_SUM_ROWS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_mean + +struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement + is_node = true; + } + + int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne); + + result->op = GGML_OP_MEAN; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_argmax + +struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a) { + GGML_ASSERT(ggml_is_matrix(a)); + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); + is_node = true; + } + + int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne); + + result->op = GGML_OP_ARGMAX; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_repeat + +struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_repeat(a, b)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + + result->op = GGML_OP_REPEAT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_repeat_back + +struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_repeat(b, a)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (ggml_are_same_shape(a, b) && !is_node) { + return a; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + + result->op = GGML_OP_REPEAT_BACK; + result->grad = is_node ?
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_concat + +struct ggml_tensor * ggml_concat( + struct ggml_context* ctx, + struct ggml_tensor* a, + struct ggml_tensor* b) { + GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]); + + result->op = GGML_OP_CONCAT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_abs + +struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_ABS); +} + +struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS); +} + +// ggml_sgn + +struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_SGN); +} + +struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN); +} + +// ggml_neg + +struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_NEG); +} + +struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG); +} + +// ggml_step + +struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_STEP); +} + +struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP); +} + +// ggml_tanh + +struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_TANH); +} + +struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH); +} + +// ggml_elu + +struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_ELU); +} + +struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU); +} + +// ggml_relu + +struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_RELU); +} + +struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU); +} + +// ggml_gelu + +struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_GELU); +} + +struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU); +} + +// ggml_gelu_quick + +struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK); +} + +struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, 
GGML_UNARY_OP_GELU_QUICK); +} + +// ggml_silu + +struct ggml_tensor * ggml_silu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_SILU); +} + +struct ggml_tensor * ggml_silu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU); +} + +// ggml_silu_back + +struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + bool is_node = false; + + if (a->grad || b->grad) { + // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SILU_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_glu + +struct ggml_tensor * ggml_glu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_GLU); +} + + +// ggml_norm + +static struct ggml_tensor * ggml_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, &eps, sizeof(eps)); + + result->op = GGML_OP_NORM; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_norm_impl(ctx, a, eps, false); +} + +// ggml_batch_norm + +struct ggml_tensor * ggml_batch_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * gamma, + struct ggml_tensor * beta, + struct ggml_tensor * running_mean, + struct ggml_tensor * running_var, + float eps) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, &eps, sizeof(eps)); + + result->op = GGML_OP_BATCH_NORM; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = gamma; + result->src[2] = beta; + result->src[3] = running_mean; + result->src[4] = running_var; + + return result; +} + +struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_norm_impl(ctx, a, eps, true); +} + +// ggml_rms_norm + +static struct ggml_tensor * ggml_rms_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, &eps, sizeof(eps)); + + result->op = GGML_OP_RMS_NORM; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_rms_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_rms_norm_impl(ctx, a, eps, false); +} + +struct ggml_tensor * ggml_rms_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_rms_norm_impl(ctx, a, eps, true); +} + +// ggml_rms_norm_back + +struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + float eps) { + bool is_node = false; + + if (a->grad) { + // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, &eps, sizeof(eps)); + + result->op = GGML_OP_RMS_NORM_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_group_norm + +static struct ggml_tensor * ggml_group_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + bool inplace) { + + bool is_node = false; + if (!inplace && (a->grad)) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_GROUP_NORM; + result->op_params[0] = n_groups; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? + + return result; +} + +struct ggml_tensor * ggml_group_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups) { + return ggml_group_norm_impl(ctx, a, n_groups, false); +} + +struct ggml_tensor * ggml_group_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups) { + return ggml_group_norm_impl(ctx, a, n_groups, true); +} + +// ggml_mul_mat + +struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_mul_mat(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); + +#if defined(GGML_USE_OPENBLAS) && GGML_DEBUG + + const int64_t i = a->ne[1]; + const int64_t j = b->ne[1]; + const int64_t k = a->ne[0]; // = b->ne[0] + + bool big = (i >= 32 && j >= 32 && k >= 32); + big = big || (i >= 512 && k >= 512); + + if (!big) { + printf("Not using Openblas for small matmul (%d, %d) @ (%d, %d) \n", i, k, j, k); + } + if (!ggml_is_contiguous(a) || !ggml_is_contiguous(b)) { + printf("Not using Openblas for matmul (%d, %d) @ (%d, %d) because of non contiguous\n", i, k, j, k); + } +#endif + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + + result->op = GGML_OP_MUL_MAT; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_out_prod + +struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_out_prod(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], b->ne[0], a->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + + result->op = GGML_OP_OUT_PROD; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_scale + +static struct ggml_tensor * ggml_scale_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_is_scalar(b)); + GGML_ASSERT(ggml_is_padded_1d(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SCALE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_scale_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_scale_impl(ctx, a, b, true); +} + +// ggml_set + +static struct ggml_tensor * ggml_set_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset, + bool inplace) { + GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + // make a view of the destination + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_SET; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false); +} + +struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true); +} + +struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset) { + return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false); +} + +struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset) { + return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true); +} + +struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); +} + +struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset) { + return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true); +} + + +// ggml_cpy + +static struct ggml_tensor * ggml_cpy_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + // make a view of the destination + struct ggml_tensor * result = ggml_view_tensor(ctx, b); + if (strlen(b->name) > 0) { + ggml_format_name(result, "%s (copy of %s)", b->name, a->name); + } else { + ggml_format_name(result, "%s (copy)", a->name); + } + + result->op = GGML_OP_CPY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_cpy_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_cpy_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_cpy_impl(ctx, a, b, true); +} + +// ggml_cont + +static struct ggml_tensor * ggml_cont_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_format_name(result, "%s (cont)", a->name); + + result->op = GGML_OP_CONT; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, true); +} + +// ggml_reshape + +struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_is_contiguous(b)); + GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (b->grad) { + // gradient propagation is not supported + //GGML_ASSERT(false); + } + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[1] = { ne0 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0*ne1); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[2] = { ne0, ne1 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[3] = { ne0, ne1, ne2 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); + + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +static struct ggml_tensor * ggml_view_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_dims, + const int64_t * ne, + size_t offset) { + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset); + ggml_format_name(result, "%s (view)", a->name); + + ggml_set_op_params(result, &offset, sizeof(offset)); + + result->op = GGML_OP_VIEW; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_view_1d + +struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + size_t offset) { + + struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset); + + return result; +} + +// ggml_view_2d + +struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, + size_t offset) { + + const int64_t ne[2] = { ne0, ne1 }; + + struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset); + + result->nb[1] = nb1; + result->nb[2] = result->nb[1]*ne1; + result->nb[3] = result->nb[2]; + + return result; +} + +// ggml_view_3d + +struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, + size_t nb2, + size_t offset) { + + const int64_t ne[3] = { ne0, ne1, ne2 }; + + struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset); + + result->nb[1] = nb1; + result->nb[2] = nb2; + result->nb[3] = result->nb[2]*ne2; + + return result; +} + +// ggml_view_4d + +struct ggml_tensor * ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset) { + + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; + + struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset); + + result->nb[1] = nb1; + result->nb[2] = nb2; + result->nb[3] = nb3; + + return result; +} + +// ggml_permute + +struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3) { + GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS); + GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS); + GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS); + GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS); + + GGML_ASSERT(axis0 != axis1); + GGML_ASSERT(axis0 != axis2); + GGML_ASSERT(axis0 != axis3); + GGML_ASSERT(axis1 != axis2); + GGML_ASSERT(axis1 != axis3); + GGML_ASSERT(axis2 != axis3); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (permuted)", a->name); + + int ne[GGML_MAX_DIMS]; + int nb[GGML_MAX_DIMS]; + + ne[axis0] = a->ne[0]; + ne[axis1] = a->ne[1]; + ne[axis2] = a->ne[2]; + ne[axis3] = a->ne[3]; + + nb[axis0] = a->nb[0]; + nb[axis1] = a->nb[1]; + nb[axis2] = a->nb[2]; + nb[axis3] = a->nb[3]; + + result->ne[0] = ne[0]; + result->ne[1] = ne[1]; + result->ne[2] = ne[2]; + result->ne[3] = ne[3]; + + result->nb[0] = nb[0]; + result->nb[1] = nb[1]; + result->nb[2] = nb[2]; + result->nb[3] = nb[3]; + + result->op = GGML_OP_PERMUTE; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + int32_t params[] = { axis0, axis1, axis2, axis3 }; + ggml_set_op_params(result, params, sizeof(params)); + + return result; +} + +// ggml_transpose + +struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (transposed)", a->name); + + result->ne[0] = a->ne[1]; + result->ne[1] = a->ne[0]; + + result->nb[0] = a->nb[1]; + result->nb[1] = a->nb[0]; + + result->op = GGML_OP_TRANSPOSE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_get_rows + +struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); + + result->op = GGML_OP_GET_ROWS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_get_rows_back + +struct ggml_tensor * ggml_get_rows_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0])); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + // TODO: implement non F32 return + //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); + + result->op = GGML_OP_GET_ROWS_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +// ggml_diag + +struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a) { + GGML_ASSERT(a->ne[1] == 1); + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne); + + result->op = GGML_OP_DIAG; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + + +// ggml_diag_mask_inf + +static struct ggml_tensor * ggml_diag_mask_inf_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + bool inplace) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + int32_t params[] = { n_past }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_DIAG_MASK_INF; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_inf_impl(ctx, a, n_past, false); +} + +struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_inf_impl(ctx, a, n_past, true); +} + +// ggml_diag_mask_zero + +static struct ggml_tensor * ggml_diag_mask_zero_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + bool inplace) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + int32_t params[] = { n_past }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_DIAG_MASK_ZERO; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_zero_impl(ctx, a, n_past, false); +} + +struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + return ggml_diag_mask_zero_impl(ctx, a, n_past, true); +} + +// ggml_soft_max + +static struct ggml_tensor * ggml_soft_max_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SOFT_MAX; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_max_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_max_impl(ctx, a, true); +} + + +// ggml_soft_max_back + +static struct ggml_tensor * ggml_soft_max_back_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; // TODO : implement backward pass + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SOFT_MAX_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, true); +} + +// ggml_rope + +static struct ggml_tensor * ggml_rope_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down, + bool inplace) { + GGML_ASSERT(n_past >= 0); + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + int32_t params[8] = { n_past, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &xpos_base, sizeof(float)); + memcpy(params + 7, &xpos_down, sizeof(bool)); + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_ROPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); +} + +struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); +} + +struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); +} + +struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); +} + +struct ggml_tensor * ggml_rope_xpos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + float base, + bool down) { + return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); +} + +// ggml_rope_back + +struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down) { + GGML_ASSERT(n_past >= 0); + GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); + + bool is_node = false; + + if (a->grad) { + is_node = false; // TODO: implement backward + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + int32_t params[8] = { n_past, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &xpos_base, sizeof(float)); + memcpy(params + 7, &xpos_down, sizeof(bool)); + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_ROPE_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_alibi + +struct ggml_tensor * ggml_alibi( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_head, + float bias_max) { + GGML_ASSERT(n_past >= 0); + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + int32_t op_params[3] = { n_past, n_head }; + memcpy(op_params + 2, &bias_max, sizeof(float)); + ggml_set_op_params(result, op_params, sizeof(op_params)); + + result->op = GGML_OP_ALIBI; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_clamp + +struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, + float min, + float max) { + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + float params[] = { min, max }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CLAMP; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_conv_1d + +static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; +} + +static struct ggml_tensor * ggml_conv_1d_stage_0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + GGML_ASSERT(a->ne[1] == b->ne[1]); + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, b->ne[0]+30, b->ne[1]); + + int32_t params[] = { s0, p0, d0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_1D_STAGE_0; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_1d_stage_1 + +static struct ggml_tensor * ggml_conv_1d_stage_1( + struct ggml_context * ctx, + struct ggml_tensor * a) { + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); + is_node = true; + } + // TODO: remove hardcoding + struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 31, a->ne[0]-30, a->ne[1]); // K, S, C + + result->op = GGML_OP_CONV_1D_STAGE_1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_conv_1d_stage_2 + +static struct ggml_tensor * ggml_conv_1d_stage_2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, b->ne[1], b->ne[2]); + + result->op = GGML_OP_CONV_1D_STAGE_2; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_1d - THIS IS FOR DEPTHWISE CONV ONLY. 
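+// (editorial sketch, not part of the original commit) the depthwise path is exposed
+// through the ggml_conv_1d wrapper defined below, so a caller with a depthwise kernel
+// `w` and an input `x` (laid out as in the Python reference that follows) just writes:
+//
+//   struct ggml_tensor * y = ggml_conv_1d(ctx, w, x, /*s0*/ 1, /*p0*/ 0, /*d0*/ 1);
+//
+// internally this chains the three stages (pad -> unfold -> per-channel multiply).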
+// TODO: merge with generic conv1d + +// 3 stages: (1) pad (2) unfold (3) mul +// Python equivalent: +// def pad(x, K): +// C, S = x.shape +// padding_offset = K // 2 +// padded = torch.zeros([C, S + K - 1]) +// for i in range(C): +// for j in range(S): +// padded[i][j+padding_offset] = x[i][j] +// return padded + +// def unfold(x, K): +// C, S = x.shape +// unfolded_tensor = torch.zeros((C, S-K+1, K)) +// for c in range(C): +// for s in range(S-K+1): +// for k in range(K): +// unfolded_tensor[c, s, k] = x[c, s+k] +// return unfolded_tensor.permute(2, 0, 1) + +// def mul(x, kernel): +// K, C, S = x.shape +// res = torch.zeros([C, S]) +// for s in range(S): +// for c in range(C): +// res[c, s] = torch.sum(x[:, c, s].cpu() * kernel[c, :].cpu()) +// return res + +GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0); + result = ggml_conv_1d_stage_1(ctx, result); + result = ggml_conv_1d_stage_2(ctx, a, result); + return result; +} + + +// im2col: [N, IC, IL] => [N, OL, IC*K] +// a: [OC,IC, K] +// b: [N, IC, IL] +// result: [N, OL, IC*K] +static struct ggml_tensor * ggml_conv_1d_generic_stage_0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + GGML_ASSERT(a->ne[1] == b->ne[1]); + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + + const int64_t ne[4] = { + a->ne[1] * a->ne[0], + OL, + b->ne[2], + 1, + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + int32_t params[] = { s0, p0, d0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_1D_GENERIC_STAGE_0; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_1d_stage_1 + +// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] +// a: [OC, IC, K] +// b: [N, OL, IC * K] +// result: [N, OC, OL] +static struct ggml_tensor * ggml_conv_1d_generic_stage_1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + a->ne[2], + b->ne[2], + 1, + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_CONV_1D_GENERIC_STAGE_1; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + + +GGML_API struct ggml_tensor * ggml_conv_1d_generic( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + struct ggml_tensor * result = ggml_conv_1d_generic_stage_0(ctx, a, b, s0, p0, d0); + result = ggml_conv_1d_generic_stage_1(ctx, a, result); + return result; +} + +// ggml_conv_1d_ph + +struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d) { + return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); +} + +// ggml_conv_2d + +struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + + GGML_ASSERT(a->ne[2] == b->ne[2]); + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), + ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), + a->ne[3], b->ne[3], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + int32_t params[] = { s0, s1, p0, p1, d0, d1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_2D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; + +} + +// ggml_conv_2d_sk_p0 + +struct ggml_tensor * ggml_conv_2d_sk_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1); +} + +// ggml_conv_2d_s1_ph + +struct ggml_tensor * ggml_conv_2d_s1_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1); +} + +// ggml_conv_transpose_2d_p0 + +static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { + return (ins - 1) * s - 2 * p + ks; +} + +struct ggml_tensor * ggml_conv_transpose_2d_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride) { + GGML_ASSERT(a->ne[3] == b->ne[2]); + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/), + ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/), + a->ne[2], b->ne[3], + }; + + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + ggml_set_op_params_i32(result, 0, stride); + + result->op = GGML_OP_CONV_TRANSPOSE_2D; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_pool_* + +static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { + return (ins + 2 * p - ks) / s + 1; +} + +// ggml_pool_1d + +struct ggml_tensor * ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int s0, + int p0) { + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + a->ne[1], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + + int32_t params[] = { op, k0, s0, p0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_POOL_1D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_pool_2d + +struct ggml_tensor * ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1) { + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), + a->ne[2], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + + int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_POOL_2D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_upscale + +static struct ggml_tensor * ggml_upscale_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor) { + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, + a->ne[0] * scale_factor, + a->ne[1] * scale_factor, + a->ne[2], a->ne[3]); + + result->op = GGML_OP_UPSCALE; + result->op_params[0] = scale_factor; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +struct ggml_tensor * ggml_upscale( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor) { + return ggml_upscale_impl(ctx, a, scale_factor); +} + +// ggml_flash_attn + +struct ggml_tensor * ggml_flash_attn( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + bool masked) { + GGML_ASSERT(ggml_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + is_node = true; + } + + //struct ggml_tensor * result = ggml_dup_tensor(ctx, q); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne); + + int32_t t = masked ? 1 : 0; + ggml_set_op_params(result, &t, sizeof(t)); + + result->op = GGML_OP_FLASH_ATTN; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + + return result; +} + +// ggml_flash_ff + +struct ggml_tensor * ggml_flash_ff( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b0, + struct ggml_tensor * b1, + struct ggml_tensor * c0, + struct ggml_tensor * c1) { + GGML_ASSERT(ggml_can_mul_mat(b0, a)); + // TODO: more checks + + bool is_node = false; + + if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { + is_node = true; + } + + //struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne); + + result->op = GGML_OP_FLASH_FF; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b0; + result->src[2] = b1; + result->src[3] = c0; + result->src[4] = c1; + + return result; +} + +// ggml_flash_attn_back + +struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked) { + GGML_ASSERT(ggml_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + // d shape [D,N,ne2,ne3] + // q shape [D,N,ne2,ne3] + // k shape [D,M,ne2,ne3] + // v shape [M,D,ne2,ne3] + + const int64_t D = q->ne[0]; + const int64_t N = q->ne[1]; + const int64_t M = k->ne[1]; + const int64_t ne2 = q->ne[2]; + const int64_t ne3 = q->ne[3]; + + GGML_ASSERT(k->ne[0] == D); + GGML_ASSERT(v->ne[0] == M); + GGML_ASSERT(v->ne[1] == D); + GGML_ASSERT(d->ne[0] == D); + GGML_ASSERT(d->ne[1] == N); + GGML_ASSERT(k->ne[2] == ne2); + GGML_ASSERT(k->ne[3] == ne3); + GGML_ASSERT(v->ne[2] == ne2); + GGML_ASSERT(v->ne[3] == ne3); + GGML_ASSERT(d->ne[2] == ne2); + GGML_ASSERT(d->ne[3] == ne3); + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + // when using this operation (in backwards pass) these grads are set. + // we don't want to create (big) grad of our result, so is_node is false. + is_node = false; + } + + // store gradients of q, k and v as continuous tensors concatenated in result. + // q shape[D,N,ne2,ne3] ; k shape [D,M,ne2,ne3] ; v shape [M,D,ne2,ne3] + // gradq->data = result->data + // gradk->data = result->data + nb0*D*N*ne2*ne3 + // gradv->data = result->data + nb0*D*N*ne2*ne3 + nb0*D*M*ne2*ne3 + // note: v and gradv are actually transposed, i.e. v->ne[0] != D. + int64_t ne[4] = {D,M+N+M,ne2,ne3}; + + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + int32_t masked_i = masked ? 1 : 0; + ggml_set_op_params(result, &masked_i, sizeof(masked_i)); + + result->op = GGML_OP_FLASH_ATTN_BACK; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = d; + + return result; +} + +// ggml_win_part + +struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w) { + GGML_ASSERT(a->ne[3] == 1); + GGML_ASSERT(a->type == GGML_TYPE_F32); + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // padding + const int px = (w - a->ne[1]%w)%w; + const int py = (w - a->ne[2]%w)%w; + + const int npx = (px + a->ne[1])/w; + const int npy = (py + a->ne[2])/w; + const int np = npx*npy; + + const int64_t ne[4] = { a->ne[0], w, w, np, }; + + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + int32_t params[] = { npx, npy, w }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_WIN_PART; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_win_unpart + +struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w0, + int h0, + int w) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], w0, h0, 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + + int32_t params[] = { w }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_WIN_UNPART; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_get_rel_pos + +struct ggml_tensor * ggml_get_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + int qh, + int kh) { + GGML_ASSERT(qh == kh); + GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]); + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], kh, qh, 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne); + + result->op = GGML_OP_GET_REL_POS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +// ggml_add_rel_pos + +static struct ggml_tensor * ggml_add_rel_pos_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph, + bool inplace) { + GGML_ASSERT(ggml_are_same_shape(pw, ph)); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_is_contiguous(pw)); + GGML_ASSERT(ggml_is_contiguous(ph)); + GGML_ASSERT(ph->type == GGML_TYPE_F32); + GGML_ASSERT(pw->type == GGML_TYPE_F32); + GGML_ASSERT(pw->ne[3] == a->ne[2]); + GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]); + GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]); + + bool is_node = false; + + if (!inplace && (a->grad || pw->grad || ph->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_set_op_params_i32(result, 0, inplace ? 1 : 0); + + result->op = GGML_OP_ADD_REL_POS; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = pw; + result->src[2] = ph; + + return result; +} + + +struct ggml_tensor * ggml_add_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph) { + return ggml_add_rel_pos_impl(ctx, a, pw, ph, false); +} + +struct ggml_tensor * ggml_add_rel_pos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph) { + return ggml_add_rel_pos_impl(ctx, a, pw, ph, true); +} + +// gmml_unary + +static struct ggml_tensor * ggml_unary_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + if (op == GGML_UNARY_OP_GLU) { + result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0] / 2, a->ne[1]); + } + + ggml_set_op_params_i32(result, 0, (int32_t) op); + + result->op = GGML_OP_UNARY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_unary( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op) { + return ggml_unary_impl(ctx, a, op, false); +} + +struct ggml_tensor * ggml_unary_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op) { + return ggml_unary_impl(ctx, a, op, true); +} + +// ggml_map_unary + +static struct ggml_tensor * ggml_map_unary_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_UNARY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_map_unary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun) { + return ggml_map_unary_impl_f32(ctx, a, fun, false); +} + +struct ggml_tensor * ggml_map_unary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun) { + return ggml_map_unary_impl_f32(ctx, a, fun, true); +} + +// ggml_map_binary + +static struct ggml_tensor * ggml_map_binary_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun, + bool inplace) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_BINARY; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_map_binary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun) { + return ggml_map_binary_impl_f32(ctx, a, b, fun, false); +} + +struct ggml_tensor * ggml_map_binary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun) { + return ggml_map_binary_impl_f32(ctx, a, b, fun, true); +} + +// ggml_map_custom1_f32 + +static struct ggml_tensor * ggml_map_custom1_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_CUSTOM1_F32; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_map_custom1_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun) { + return ggml_map_custom1_impl_f32(ctx, a, fun, false); +} + +struct ggml_tensor * ggml_map_custom1_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun) { + return ggml_map_custom1_impl_f32(ctx, a, fun, true); +} + +// ggml_map_custom2_f32 + +static struct ggml_tensor * ggml_map_custom2_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_CUSTOM2_F32; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_map_custom2_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun) { + return ggml_map_custom2_impl_f32(ctx, a, b, fun, false); +} + +struct ggml_tensor * ggml_map_custom2_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun) { + return ggml_map_custom2_impl_f32(ctx, a, b, fun, true); +} + +// ggml_map_custom3_f32 + +static struct ggml_tensor * ggml_map_custom3_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_CUSTOM3_F32; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +struct ggml_tensor * ggml_map_custom3_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun) { + return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false); +} + +struct ggml_tensor * ggml_map_custom3_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun) { + return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true); +} + +// ggml_map_custom1 +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom1_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true); +} + +// ggml_map_custom2 + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom2_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom2_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM2; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true); +} + +// ggml_map_custom3 + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom3_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom3_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM3; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); +} + + + +// ggml_cross_entropy_loss + +struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); + + result->op = GGML_OP_CROSS_ENTROPY_LOSS; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_cross_entropy_loss_back + +struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_scalar(c)); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; + result->grad = NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +void ggml_set_param( + struct ggml_context * ctx, + struct ggml_tensor * tensor) { + tensor->is_param = true; + + GGML_ASSERT(tensor->grad == NULL); + tensor->grad = ggml_dup_tensor(ctx, tensor); +} + +// ggml_compute_forward_dup + +static void ggml_compute_forward_dup_same_cont( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + GGML_ASSERT(src0->type == dst->type); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const size_t nb00 = src0->nb[0]; + const size_t nb0 = dst->nb[0]; + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + // parallelize by elements + const int ne = ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = MIN(ie0 + dr, ne); + + if (ie0 < ie1) { + memcpy( + ((char *) dst->data + ie0*nb0), + ((char *) src0->data + ie0*nb00), + (ie1 - ie0) * ggml_type_size(src0->type)); + } + +} +static void ggml_compute_forward_dup_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS; + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { + ggml_compute_forward_dup_same_cont(params, src0, dst); + return; + } + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy + + if (ggml_is_contiguous(dst)) { + if (nb00 == sizeof(ggml_fp16_t)) { + if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < 
ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + for (int i00 = 0; i00 < ne00; i00++) { + dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; + float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + size_t id = 0; + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + for (int i00 = 0; i00 < ne00; i00++) { + src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); + } + + quantize_row_q(src0_f32, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } + return; + } + + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - 
ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } +} + +static void ggml_compute_forward_dup_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS; + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { + ggml_compute_forward_dup_same_cont(params, src0, dst); + return; + } + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (ggml_is_contiguous(dst)) { + // TODO: simplify + if (nb00 == sizeof(float)) { + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; + + size_t id = 0; + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + quantize_row_q(src0_ptr, dst_ptr + id, 
ne00); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } + + return; + } + + // dst counters + + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(float)); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } +} + +static void ggml_compute_forward_dup( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (ggml_is_contiguous(src0) && 
ggml_is_contiguous(dst) && src0->type == dst->type) { + ggml_compute_forward_dup_same_cont(params, src0, dst); + return; + } + switch (src0->type) { + case GGML_TYPE_F16: + case GGML_TYPE_I16: + { + ggml_compute_forward_dup_f16(params, src0, dst); + } break; + case GGML_TYPE_F32: + case GGML_TYPE_I32: + { + ggml_compute_forward_dup_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_add + +static void ggml_compute_forward_add_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + // GGML_ASSERT( nb0 == sizeof(float)); + // GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + +#ifdef GGML_USE_ACCELERATE + vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); +#elif GGML_USE_OPENBLAS + // In saxpy adds a*x to y. + if (dst_ptr == src0_ptr) { + cblas_saxpy(ne00, 1.0f, src1_ptr, 1, dst_ptr, 1); + } else if (dst_ptr == src1_ptr) { + cblas_saxpy(ne00, 1.0f, src0_ptr, 1, dst_ptr, 1); + } else { + // Fallback to manual loop. + ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); + } +# else + ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); +#endif + // } + // } + } + } else { + // src1 is not contiguous + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + +#if GGML_USE_OPENBLAS + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + if (dst_ptr == src0_ptr) { + cblas_saxpy(ne0, 1.0f, src1_ptr, nb10 / sizeof(float), dst_ptr, 1); + return; + } else if (dst_ptr == src1_ptr) { + cblas_saxpy(ne0, 1.0f, src0_ptr, 1, dst_ptr, nb10 / sizeof(float)); + return; + } else { + // Fallback to manual loop. 
+ abort(); + } +#else + for (int i0 = 0; i0 < ne0; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); + dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; + } +#endif + } + } +} + +static void ggml_compute_forward_add_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + +static void ggml_compute_forward_add_f16_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(ggml_fp16_t)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i])); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + +static void ggml_compute_forward_add_q_f32( + const struct ggml_compute_params * params, + const struct 
ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[type].from_float; + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + // src1 and dst are same shape as src0 => same indices + const int i13 = i03; + const int i12 = i02; + const int i11 = i01; + + const int i3 = i03; + const int i2 = i02; + const int i1 = i01; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); + void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + assert(ne00 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne00); + // add src1 + ggml_vec_acc_f32(ne00, wdata, src1_row); + // quantize row to dst + quantize_row_q(wdata, dst_row, ne00); + } +} + +static void ggml_compute_forward_add( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add_f16_f16(params, src0, src1, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f16_f32(params, src0, src1, dst); + } + else { + GGML_ASSERT(false); + } + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + { + ggml_compute_forward_add_q_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_add1 + +static void ggml_compute_forward_add1_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + 
GGML_TENSOR_UNARY_OP_LOCALS; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_add1_f32); + + vDSP_vadd( + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) src1->data), 0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_vec_add1_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + *(float *) src1->data); +#endif + } +} + +static void ggml_compute_forward_add1_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1_f16_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); + + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - 
i2*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS; + + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[type].from_float; + + // we don't support permuted src0 + GGML_ASSERT(nb00 == ggml_type_size(type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03)); + void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb0 )); + + assert(ne0 % 32 == 0); + + // unquantize row from src0 to temp buffer + dequantize_row_q(src0_row, wdata, ne0); + // add src1 + ggml_vec_acc1_f32(ne0, wdata, v); + // quantize row to dst + quantize_row_q(wdata, dst_row, ne0); + } +} + +static void ggml_compute_forward_add1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add1_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add1_f16_f16(params, src0, src1, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add1_f16_f32(params, src0, src1, dst); + } + else { + GGML_ASSERT(false); + } + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + { + ggml_compute_forward_add1_q_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + +// ggml_compute_forward_acc + +static void ggml_compute_forward_acc_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && 
ggml_is_contiguous(src0)); + + // view src0 and dst with these strides and data offset inbytes during acc + // nb0 is implicitely element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace && (params->type == GGML_TASK_INIT)) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src1); + const int nc = src1->ne[0]; + + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); + + // src0 and dst as viewed during acc + const size_t nb0 = ggml_element_size(src0); + + const size_t nb00 = nb0; + const size_t nb01 = nb1; + const size_t nb02 = nb2; + const size_t nb03 = nb3; + + GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_nbytes(dst)); + GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0)); + + GGML_ASSERT(nb10 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + +#ifdef GGML_USE_ACCELERATE + vDSP_vadd( + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1, + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc); +#else + ggml_vec_add_f32(nc, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + } +} + +static void ggml_compute_forward_acc( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_acc_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_sub + +static void ggml_compute_forward_sub_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + 
const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + +#ifdef GGML_USE_ACCELERATE + vDSP_vsub( + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_vec_sub_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + // } + // } + } + } else { + // src1 is not contiguous + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i0 = 0; i0 < ne0; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] - *src1_ptr; + } + } + } +} + +static void ggml_compute_forward_sub( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sub_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_mul + +static void ggml_compute_forward_mul_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + const int ith = params->ith; + const int nth = params->nth; + +#ifdef GGML_USE_CLBLAST + if (src1->backend == GGML_BACKEND_GPU) { + if (ith == 0) { + ggml_cl_mul(src0, src1, dst); + } + return; + } +#endif + + const int64_t nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(ne00 == ne10); + + if (nb10 == sizeof(float)) { + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_mul_f32); + + vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); +#else + ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr); +#endif + // } + // } + } + } else { + // src1 is not contiguous + for (int64_t ir = ith; ir < nr; ir 
+= nth) { + // src0 and dst are same shape => same indices + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne00; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); + } + } + } +} + +static void ggml_compute_forward_mul( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_mul_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_div + +static void ggml_compute_forward_div_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_div_f32); + + vDSP_vdiv( + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, + ne0); +#else + ggml_vec_div_f32(ne0, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), + (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); +#endif + // } + // } + } + } else { + // src1 is not contiguous + for (int ir = 0; ir < nr; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i0 = 0; i0 < ne0; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + + dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr); + } + } + } +} + +static void ggml_compute_forward_div( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_div_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_sqr + +static void 
ggml_compute_forward_sqr_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sqr_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_sqr( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sqr_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_sqrt + +static void ggml_compute_forward_sqrt_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sqrt_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_sqrt( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sqrt_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + +// ggml_compute_forward_log + +static void ggml_compute_forward_log_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_log_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_log( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_log_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_sum + +static void ggml_compute_forward_sum_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_is_scalar(dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(ggml_is_scalar(dst)); + assert(src0->nb[0] == sizeof(float)); + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); + + ggml_float sum = 0; + ggml_float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for 
(int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f32_ggf(ne00, + &row_sum, + (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + sum += row_sum; + } + } + } + ((float *) dst->data)[0] = sum; +} + +static void ggml_compute_forward_sum_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_is_scalar(dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(ggml_fp16_t)); + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); + + float sum = 0; + float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f16_ggf(ne00, + &row_sum, + (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + sum += row_sum; + } + } + } + ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); +} + +static void ggml_compute_forward_sum( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sum_f32(params, src0, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_sum_f16(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_sum_rows + +static void ggml_compute_forward_sum_rows_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(dst->nb[0] == sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS; + + GGML_ASSERT(ne0 == 1); + GGML_ASSERT(ne1 == ne01); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + for (int64_t i3 = 0; i3 < ne03; i3++) { + for (int64_t i2 = 0; i2 < ne02; i2++) { + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + float row_sum = 0; + ggml_vec_sum_f32(ne00, &row_sum, src_row); + dst_row[0] = row_sum; + } + } + } +} + +static void ggml_compute_forward_sum_rows( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sum_rows_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_mean + +static void ggml_compute_forward_mean_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS; + + assert(ne0 == 1); + assert(ne1 == ne01); + assert(ne2 == ne02); + assert(ne3 == ne03); + + UNUSED(ne0); + UNUSED(ne1); + UNUSED(ne2); + UNUSED(ne3); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f32(ne00, + (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + 
(float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + + *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; + } + } + } +} + +static void ggml_compute_forward_mean( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_mean_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_argmax + +static void ggml_compute_forward_argmax_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(float)); + assert(dst->nb[0] == sizeof(float)); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + + const size_t nb01 = src0->nb[1]; + const size_t nb0 = dst->nb[0]; + + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src = (float *) ((char *) src0->data + i1*nb01); + int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); + int v = 0; + ggml_vec_argmax_f32(ne00, &v, src); + dst_[0] = v; + } +} + +static void ggml_compute_forward_argmax( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_argmax_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_repeat + +static void ggml_compute_forward_repeat_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS; + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // TODO: maybe this is not optimal? 
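    // The loops below tile src0 across dst: for each dimension d, dst->ne[d] is an
    // exact multiple of src0->ne[d] (checked by ggml_can_repeat), and nr_d counts how
    // many copies are laid out along that dimension. The innermost call copies one
    // contiguous row of ne00 floats from src0 row (k1, k2, k3) into the
    // (i0, i1, i2, i3)-th tile of dst.
    //
    // Minimal worked example (illustrative values, not taken from a real graph):
    // src0 with ne = [2,3,1,1] repeated into dst with ne = [4,6,1,1] gives
    // nr0 = 2, nr1 = 2, nr2 = nr3 = 1. Row k1 of src0 is written twice along dim 0
    // (i0 = 0, 1) into dst rows i1*3 + k1 for i1 = 0, 1, so dst ends up as a 2x2
    // grid of copies of src0.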
+ for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_cpy_f32(ne00, + (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), + (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + case GGML_TYPE_I32: + { + ggml_compute_forward_repeat_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_repeat_back + +static void ggml_compute_forward_repeat_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(dst, src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS; + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne00/ne0); + const int nr1 = (int)(ne01/ne1); + const int nr2 = (int)(ne02/ne2); + const int nr3 = (int)(ne03/ne3); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (ggml_is_contiguous(dst)) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + } else { + for (int k3 = 0; k3 < ne3; k3++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int k1 = 0; k1 < ne1; k1++) { + ggml_vec_set_f32(ne0, + (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), + 0); + } + } + } + } + + // TODO: maybe this is not optimal? 
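    // The loops below are the adjoint of ggml_compute_forward_repeat_f32 above:
    // src0 holds the incoming gradient with the repeated (larger) shape, dst holds
    // the gradient for the original (smaller) tensor, and every element of dst
    // accumulates the sum of all src0 elements that were copied from it in the
    // forward pass. The structure mirrors the forward loops, with ggml_vec_acc_f32
    // (accumulate) in place of ggml_vec_cpy_f32, which is why dst is zeroed first
    // above.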
+ for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne3; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne1; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_acc_f32(ne0, + (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), + (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_repeat_back_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_concat + +static void ggml_compute_forward_concat_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + + GGML_TENSOR_BINARY_OP_LOCALS; + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ith; i2 < ne2; i2++) { + if (i2 < ne02) { // src0 + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03); + + float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3); + *y = *x; + } + } + } // src1 + else { + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13); + + float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3); + *y = *x; + } + } + } + } + } +} + +static void ggml_compute_forward_concat( + const struct ggml_compute_params* params, + const struct ggml_tensor* src0, + const struct ggml_tensor* src1, + struct ggml_tensor* dst) { + switch (src0->type) { + case GGML_TYPE_F32: + case GGML_TYPE_I32: + { + ggml_compute_forward_concat_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_abs + +static void ggml_compute_forward_abs_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_abs_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_abs( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_abs_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_sgn + 
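// Note on the element-wise unary kernels that follow (sgn, neg, step, tanh, elu,
// relu): they all share the structure of ggml_compute_forward_abs_f32 above -
// run single-threaded (params->ith == 0), walk ggml_nrows(src0) rows, address row
// i at data + i*nb[1], and delegate the per-element math to a ggml_vec_*_f32
// helper defined elsewhere in this source. As a rough sketch (an assumption about
// a helper not shown here, not a quote of it), ggml_vec_sgn_f32 behaves like:
//
//     inline static void ggml_vec_sgn_f32(const int n, float * y, const float * x) {
//         for (int i = 0; i < n; ++i) {
//             y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f);
//         }
//     }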
+static void ggml_compute_forward_sgn_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sgn_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_sgn( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sgn_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_neg + +static void ggml_compute_forward_neg_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_neg_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_neg( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_neg_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_step + +static void ggml_compute_forward_step_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_step_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_step( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_step_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_tanh + +static void ggml_compute_forward_tanh_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_tanh_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char 
*) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_tanh( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_tanh_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_elu + +static void ggml_compute_forward_elu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_elu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_elu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_elu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_relu + +static void ggml_compute_forward_relu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_relu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_relu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_relu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_gelu + +static void ggml_compute_forward_gelu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_gelu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, 
+ struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_gelu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_gelu_quick + +static void ggml_compute_forward_gelu_quick_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_quick_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_gelu_quick( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_gelu_quick_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_silu + +static void ggml_compute_forward_silu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_silu_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_silu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_silu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_silu_back + +static void ggml_compute_forward_silu_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * grad, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src0, grad)); + + if 
(params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_silu_backward_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1])), + (float *) ((char *) grad->data + i1*(grad->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_silu_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * grad, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_silu_back_f32(params, src0, grad, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_glu + +static void ggml_compute_forward_glu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0] / 2; + const int nr = src0->ne[1]; + for (int i1 = 0; i1 < nr; i1++) { + for (int i0 = 0; i0 < nc; i0++) { + float *linear_part = (float *)((char *)src0->data + i0 * src0->nb[0] + i1 * src0->nb[1]); + float *gate = (float *) ((char *) src0->data + (i0+nc) * (src0->nb[0]) + i1 * src0->nb[1]); + + *gate = 1.0f / (1.0f + expf(-*gate)); + float *output = (float *) ((char *) dst->data + i0*(dst->nb[0]) + i1 * dst->nb[1]); + *output = (*linear_part) * (*gate); + + } + } +} + +static void ggml_compute_forward_glu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_glu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_norm + +static void ggml_compute_forward_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS; + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + ggml_float sum = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)x[i00]; + } + + float mean = sum/ne00; + + float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_float sum2 = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sum2 += (ggml_float)(v*v); + } + + float variance = sum2/ne00; + const float scale = 1.0f/sqrtf(variance + eps); + + ggml_vec_scale_f32(ne00, y, scale); + } + } + } 
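    // In formula form, each row x[0..ne00-1] above has been normalized as
    //     mean     = (1/ne00) * sum_i x[i]
    //     variance = (1/ne00) * sum_i (x[i] - mean)^2
    //     y[i]     = (x[i] - mean) / sqrt(variance + eps)
    // i.e. a plain layer-norm style standardization with eps read from
    // dst->op_params; any learned scale/offset is expected to be applied by
    // separate mul/add nodes in the graph rather than inside this op.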
+} + +static void ggml_compute_forward_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_norm_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_batch_norm + +static void ggml_compute_forward_batch_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + const struct ggml_tensor * src3, + const struct ggml_tensor * src4, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + const float * test_val_0 = (float *) ((char *) src0->data); + const float * test_val_1 = (float *) ((char *) src0->data + 4); + const float * gamma = (float *) ((char *) src1->data); + const float * beta = (float *) ((char *) src2->data); + const float * mean = (float *) ((char *) src3->data); + const float * variance = (float *) ((char *) src4->data); + + // TODO: optimize & generalize + for (int64_t i01 = 0; i01 < src0->ne[1]; i01++) { + for (int64_t i00 = 0; i00 < src0->ne[0]; i00++) { + const float * x = (float *) ((char *) src0->data + i00*src0->nb[0] + i01*src0->nb[1]); + float * y = (float *) ((char *) dst->data + i00*src0->nb[0] + i01*src0->nb[1]); + *y = gamma[i01] * (*x - mean[i01]) / sqrt(variance[i01] + eps) + beta[i01]; + } + } +} + +static void ggml_compute_forward_batch_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + const struct ggml_tensor * src3, + const struct ggml_tensor * src4, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_batch_norm_f32(params, src0, src1, src2, src3, src4, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_group_rms_norm + +static void ggml_compute_forward_rms_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS; + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + ggml_float sum = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)(x[i00] * x[i00]); + } + + const float mean = sum/ne00; + + float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + memcpy(y, x, ne00 * sizeof(float)); + // for (int i00 = 0; i00 < ne00; i00++) { + // y[i00] = x[i00]; + // } + + const float scale = 1.0f/sqrtf(mean + eps); + + ggml_vec_scale_f32(ne00, y, scale); + } + } + } +} + +static void ggml_compute_forward_rms_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * 
dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rms_norm_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_rms_norm_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_BINARY_OP_LOCALS; + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < ne01; i01 += nth) { + // src1 is same shape as src0 => same indices + const int64_t i11 = i01; + const int64_t i12 = i02; + const int64_t i13 = i03; + + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); + + ggml_float sum_xx = 0.0; + ggml_float sum_xdz = 0.0; + + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum_xx += (ggml_float)(x[i00] * x[i00]); + sum_xdz += (ggml_float)(x[i00] * dz[i00]); + } + + //const float mean = (float)(sum_xx)/ne00; + const float mean_eps = (float)(sum_xx)/ne00 + eps; + const float sum_eps = (float)(sum_xx) + eps*ne00; + //const float mean_xdz = (float)(sum_xdz)/ne00; + // we could cache rms from forward pass to improve performance. + // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms. + //const float rms = sqrtf(mean_eps); + const float rrms = 1.0f / sqrtf(mean_eps); + //const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3) + + { + // z = rms_norm(x) + // + // rms_norm(src0) = + // scale( + // src0, + // div( + // 1, + // sqrt( + // add( + // scale( + // sum( + // sqr( + // src0)), + // (1.0/N)), + // eps)))); + + // postorder: + // ## op args grad + // 00 param src0 grad[#00] + // 01 const 1 + // 02 sqr (#00) grad[#02] + // 03 sum (#02) grad[#03] + // 04 const 1/N + // 05 scale (#03, #04) grad[#05] + // 06 const eps + // 07 add (#05, #06) grad[#07] + // 08 sqrt (#07) grad[#08] + // 09 div (#01,#08) grad[#09] + // 10 scale (#00,#09) grad[#10] + // + // backward pass, given grad[#10] + // #10: scale + // grad[#00] += scale(grad[#10],#09) + // grad[#09] += sum(mul(grad[#10],#00)) + // #09: div + // grad[#08] += neg(mul(grad[#09], div(#09,#08))) + // #08: sqrt + // grad[#07] += mul(grad[#08], div(0.5, #08)) + // #07: add + // grad[#05] += grad[#07] + // #05: scale + // grad[#03] += scale(grad[#05],#04) + // #03: sum + // grad[#02] += repeat(grad[#03], #02) + // #02: + // grad[#00] += scale(mul(#00, grad[#02]), 2.0) + // + // substitute and simplify: + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) + // grad[#02] = repeat(grad[#03], #02) + // grad[#02] = repeat(scale(grad[#05],#04), #02) + // grad[#02] = repeat(scale(grad[#07],#04), #02) + // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02) + // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02) + // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02) + // 
grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02) + // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02) + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0) + // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N))) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps)) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps))) + // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps)) + // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps)) + // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps)) + // a = b*c + d*e + // a = b*c*f/f + d*e*f/f + // a = (b*c*f + d*e*f)*(1/f) + // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c)) + // a = (b + d*e/c)*c + // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps) + // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms + // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms + // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms + // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms + // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms + // a = (dz + x*div(-mean_xdz,mean_eps))*rrms + // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms) + // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + } + // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) + // post-order: + // dx := x + // dx := scale(dx,-mean_xdz/mean_eps) + // dx := add(dx, dz) + // dx := scale(dx, rrms) + float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_vec_cpy_f32 (ne00, dx, x); + // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); + ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); + ggml_vec_acc_f32 (ne00, dx, dz); + ggml_vec_scale_f32(ne00, dx, rrms); + } + } + } +} + +static void ggml_compute_forward_rms_norm_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_group_norm + +static void ggml_compute_forward_group_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = 
params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS; + + const float eps = 1e-6f; // TODO: make this a parameter + + // TODO: optimize + + int n_channels = src0->ne[2]; + int n_groups = dst->op_params[0]; + int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; + for (int i = ith; i < n_groups; i+=nth) { + int start = i * n_channels_per_group; + int end = start + n_channels_per_group; + if (end > n_channels) { + end = n_channels; + } + int step = end - start; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + ggml_float sum = 0.0; + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)x[i00]; + } + } + } + float mean = sum / (ne00 * ne01 * step); + ggml_float sum2 = 0.0; + + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sum2 += (ggml_float)(v * v); + } + } + } + float variance = sum2 / (ne00 * ne01 * step); + const float scale = 1.0f / sqrtf(variance + eps); + + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + ggml_vec_scale_f32(ne00, y, scale); + } + } + } + } +} + +static void ggml_compute_forward_group_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_group_norm_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_mul_mat + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +static bool ggml_compute_forward_mul_mat_use_blas( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + //const int64_t ne00 = src0->ne[0]; + //const int64_t ne01 = src0->ne[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1)) { + + bool big = (ne0 >= 32 && ne1 >= 32 && ne10 >= 32); + big = big || (ne0 >= 512 && ne10 >= 512); + + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return big; + } + + return false; +} +#endif + +static void ggml_compute_forward_mul_mat( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == 
ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + // GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + +#if defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension + // ref: https://github.com/ggerganov/ggml/pull/224 + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + if (params->ith != 0) { + return; + } + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + // broadcast src0 into src1 across 2nd,3rd dimension + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_TYPE_F32) { + float * const wdata = params->wdata; + ggml_to_float_t const to_float = type_traits[type].to_float; + + size_t id = 0; + for (int64_t i01 = 0; i01 < ne01; ++i01) { + to_float((const char *) x + i01*nb01, wdata + id, ne00); + id += ne00; + } + + assert(id*sizeof(float) <= params->wsize); + x = wdata; + } + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + + //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + + return; + } +#endif + + if (params->type == GGML_TASK_INIT) { + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne11*ne12*ne13; // src1 rows + + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + + // distribute the thread work across the inner or outer loop based on which one is larger + + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 
1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + return; + } + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne11)); + const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; + const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? 
(i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } +} + +// ggml_compute_forward_out_prod + +static void ggml_compute_forward_out_prod_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod + // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + + if (params->type == GGML_TASK_INIT) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by last three dimensions + + // total rows in dst + const int64_t nr = ne1*ne2*ne3; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + // for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + for (int64_t ir = ir0; ir < ir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; + + for (int64_t i01 = 0; i01 < ne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32(ne0, d, s0, *s1); + // for (int64_t i0 = 0; i0 < ne0; ++i0) { + // d[i0] += s0[i0] * s1[i1]; + // } + } + } + + //int64_t t1 = ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // 
printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +static void ggml_compute_forward_out_prod( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + { + GGML_ASSERT(false); // todo + // ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(false); // todo + // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_out_prod_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_scale + +static void ggml_compute_forward_scale_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scale factor + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const size_t nb01 = src0->nb[1]; + + const size_t nb1 = dst->nb[1]; + + + for (int i1 = ir0; i1 < ir1; i1++) { + if (dst->data != src0->data) { + // src0 is same shape as dst => same indices + memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); + } + ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); + } +} + +static void ggml_compute_forward_scale( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_scale_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_set + +static void ggml_compute_forward_set_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + // view src0 and dst with these strides and data offset inbytes during set + // nb0 is implicitely element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace && (params->type == GGML_TASK_INIT)) { + // memcpy needs to be synchronized across threads to avoid race conditions. 
+ // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src1); + const int nc = src1->ne[0]; + + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); + + // src0 and dst as viewed during set + const size_t nb0 = ggml_element_size(src0); + + const int im0 = (ne10 == 0 ? 0 : ne10-1); + const int im1 = (ne11 == 0 ? 0 : ne11-1); + const int im2 = (ne12 == 0 ? 0 : ne12-1); + const int im3 = (ne13 == 0 ? 0 : ne13-1); + + GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst)); + + GGML_ASSERT(nb10 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + + ggml_vec_cpy_f32(nc, + (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + } +} + +static void ggml_compute_forward_set( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_set_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_cpy + +static void ggml_compute_forward_cpy( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + ggml_compute_forward_dup(params, src0, dst); +} + +// ggml_compute_forward_cont + +static void ggml_compute_forward_cont( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + ggml_compute_forward_dup(params, src0, dst); +} + +// ggml_compute_forward_reshape + +static void ggml_compute_forward_reshape( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + // NOP + UNUSED(params); + UNUSED(src0); + UNUSED(dst); +} + +// ggml_compute_forward_view + +static void ggml_compute_forward_view( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0) { + // NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_compute_forward_permute + +static void ggml_compute_forward_permute( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0) { + // NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_compute_forward_transpose + +static void ggml_compute_forward_transpose( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0) { + // NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_compute_forward_get_rows + +static void ggml_compute_forward_get_rows_q( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor 
* src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + + assert( dst->ne[0] == nc); + assert( dst->ne[1] == nr); + assert(src0->nb[0] == ggml_type_size(type)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + dequantize_row_q( + (const void *) ((char *) src0->data + r*src0->nb[1]), + (float *) ((char *) dst->data + i*dst->nb[1]), nc); + } +} + +static void ggml_compute_forward_get_rows_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + assert( dst->ne[0] == nc); + assert( dst->ne[1] == nr); + assert(src0->nb[0] == sizeof(ggml_fp16_t)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + for (int j = 0; j < nc; ++j) { + ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v); + } + } +} + +static void ggml_compute_forward_get_rows_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + assert( dst->ne[0] == nc); + assert( dst->ne[1] == nr); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + ggml_vec_cpy_f32(nc, + (float *) ((char *) dst->data + i*dst->nb[1]), + (float *) ((char *) src0->data + r*src0->nb[1])); + } +} + +static void ggml_compute_forward_get_rows( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + { + ggml_compute_forward_get_rows_q(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_I16: + { + ggml_compute_forward_get_rows_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + case GGML_TYPE_I32: + { + ggml_compute_forward_get_rows_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + //static bool first = true; + //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); + //if (first) { + // first = false; + //} else { + // for (int k = 0; k < dst->ne[1]; ++k) { + // for (int j = 0; j < dst->ne[0]/16; ++j) { + // for (int i = 0; i < 16; ++i) { + // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // exit(0); + //} +} + +// ggml_compute_forward_get_rows_back + +static void ggml_compute_forward_get_rows_back_f32_f16( + const 
struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(opt0, dst)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + ggml_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + GGML_ASSERT( dst->ne[0] == nc); + GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + for (int j = 0; j < nc; ++j) { + ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); + } + } +} + +static void ggml_compute_forward_get_rows_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(opt0, dst)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + // ggml_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + GGML_ASSERT( dst->ne[0] == nc); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + ggml_vec_add_f32(nc, + (float *) ((char *) dst->data + r*dst->nb[1]), + (float *) ((char *) dst->data + r*dst->nb[1]), + (float *) ((char *) src0->data + i*src0->nb[1])); + } +} + + +static void ggml_compute_forward_get_rows_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, opt0, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_get_rows_back_f32(params, src0, src1, opt0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + //static bool first = true; + //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); + //if (first) { + // first = false; + //} else { + // for (int k = 0; k < dst->ne[1]; ++k) { + // for (int j = 0; j < dst->ne[0]/16; ++j) { + // for (int i = 0; i < 16; ++i) { + // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // printf("\n"); + // exit(0); + //} +} + +// ggml_compute_forward_diag + +static void ggml_compute_forward_diag_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + GGML_TENSOR_UNARY_OP_LOCALS; + + GGML_ASSERT(ne00 == ne0); + GGML_ASSERT(ne00 == ne1); + GGML_ASSERT(ne01 == 1); + GGML_ASSERT(ne02 == ne2); + GGML_ASSERT(ne03 == ne3); + + GGML_ASSERT(nb00 == sizeof(float)); + 
GGML_ASSERT(nb0 == sizeof(float)); + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = 0; i1 < ne1; i1++) { + float * d = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02); + for (int i0 = 0; i0 < i1; i0++) { + d[i0] = 0; + } + d[i1] = s[i1]; + for (int i0 = i1+1; i0 < ne0; i0++) { + d[i0] = 0; + } + } + } + } +} + +static void ggml_compute_forward_diag( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_diag_mask_inf + +static void ggml_compute_forward_diag_mask_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const float value) { + + const int ith = params->ith; + const int nth = params->nth; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const bool inplace = src0->data == dst->data; + + GGML_ASSERT(n_past >= 0); + + if (!inplace && (params->type == GGML_TASK_INIT)) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + const int nr = src0->ne[1]; + const int nz = n/nr; + + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int k = 0; k < nz; k++) { + for (int j = ith; j < nr; j += nth) { + for (int i = n_past; i < nc; i++) { + if (i > n_past + j) { + *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; + } + } + } + } +} + +static void ggml_compute_forward_diag_mask_inf( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_diag_mask_zero( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_mask_f32(params, src0, dst, 0); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_soft_max + +static void ggml_compute_forward_soft_max_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int 
ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float *sp = (float *)((char *) src0->data + i1*src0->nb[1]); + float *dp = (float *)((char *) dst->data + i1*dst->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(sp[i])); + } +#endif + + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, sp); + + ggml_float sum = 0.0; + + uint16_t scvt; + for (int i = 0; i < nc; i++) { + if (sp[i] == -INFINITY) { + dp[i] = 0.0f; + } else { + // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + sum += (ggml_float)val; + dp[i] = val; + } + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(nc, dp, sum); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dp[i])); + assert(!isinf(dp[i])); + } +#endif + } +} + +static void ggml_compute_forward_soft_max( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_max_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_soft_max_back + +static void ggml_compute_forward_soft_max_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src1, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); + float *y = (float *)((char *) src1->data + i1*src1->nb[1]); + float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(dy[i])); + assert(!isnan(y[i])); + } +#endif + // Jii = yi - yi*yi + // Jij = -yi*yj + // J = diag(y)-y.T*y + // dx = J * dy + // dxk = sum_i(Jki * dyi) + // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*dyk + // dxk = -yk * sum_i(yi * dyi) + yk*dyk + // dxk = -yk * dot(y, dy) + yk*dyk + // dxk = yk * (- dot(y, dy) + dyk) + // dxk = yk * (dyk - dot(y, dy)) + // + // post-order: + // dot_y_dy := dot(y, dy) + // dx := dy + // dx := dx - dot_y_dy + // dx := dx * y + + // linear runtime, no additional memory + float dot_y_dy = 0; + ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy); + ggml_vec_cpy_f32 (nc, dx, dy); + ggml_vec_acc1_f32(nc, dx, -dot_y_dy); + ggml_vec_mul_f32 (nc, dx, dx, y); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dx[i])); + assert(!isinf(dx[i])); + } +#endif + } +} + +static void ggml_compute_forward_soft_max_back( + const struct ggml_compute_params * params, + const struct 
ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_alibi + +static void ggml_compute_forward_alibi_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + assert(n_past >= 0); + + const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 + const int ne1 = src0->ne[1]; // seq_len_without_past + const int ne2 = src0->ne[2]; // n_head -> this is k + //const int ne3 = src0->ne[3]; // 1 -> bsz + + const int n = ggml_nrows(src0); + const int ne2_ne3 = n/ne1; // ne2*ne3 + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + //const int nb3 = src0->nb[3]; + + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(ne1 + n_past == ne0); + GGML_ASSERT(n_head == ne2); + + // add alibi to src0 (KQ_scaled) + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + for (int i = 0; i < ne0; i++) { + for (int j = 0; j < ne1; j++) { + for (int k = 0; k < ne2_ne3; k++) { + float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); + + // TODO: k*nb2 or k*nb3 + + float m_k; + + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + pdst[0] = i * m_k + src[0]; + + } + } + } +} + +static void ggml_compute_forward_alibi_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + assert(n_past >= 0); + + const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 + const int ne1 = src0->ne[1]; // seq_len_without_past + const int ne2 = src0->ne[2]; // n_head -> this is k + //const int ne3 = src0->ne[3]; // 1 -> bsz + + const int n = ggml_nrows(src0); + const int ne2_ne3 = n/ne1; // ne2*ne3 + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + //const int nb3 = src0->nb[3]; + + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(n_head == ne2); + + // add alibi to src0 (KQ_scaled) + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + for (int i = 0; i < ne0; i++) { + for (int j = 0; j < ne1; j++) { + for (int k = 0; k < ne2_ne3; k++) { + ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + float * pdst = (float 
*)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); + + // TODO: k*nb2 or k*nb3 + + float m_k; + + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + // we return F32 + pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); + } + } + } +} + +static void ggml_compute_forward_alibi( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_alibi_f16(params, src0, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_alibi_f32(params, src0, dst); + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_clamp + +static void ggml_compute_forward_clamp_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + float min; + float max; + memcpy(&min, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + for (int j = ith; j < n; j += nth) { + float * dst_ptr = (float *) ((char *) dst->data + j*nb1); + float * src0_ptr = (float *) ((char *) src0->data + j*nb01); + + for (int i = 0; i < nc; i++) { + dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min); + } + } +} + +static void ggml_compute_forward_clamp( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_clamp_f32(params, src0, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_rope + +static void ggml_compute_forward_rope_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + float freq_base; + float freq_scale; + + // these two only relevant for xPos RoPE: + float xpos_base; + bool xpos_down; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t 
*) dst->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); + + assert(n_past >= 0); + + GGML_TENSOR_UNARY_OP_LOCALS; + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + GGML_ASSERT(nb00 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(dst); + + GGML_ASSERT(n_dims <= ne0); + GGML_ASSERT(n_dims % 2 == 0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta = freq_scale * (float)p; + + if (is_glm) { + theta = MIN(p, n_ctx - 2); + float block_theta = MAX(p - (n_ctx - 2), 0); + for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + const float cos_block_theta = cosf(block_theta); + const float sin_block_theta = sinf(block_theta); + + theta *= theta_scale; + block_theta *= theta_scale; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + const float x2 = src[n_dims]; + const float x3 = src[n_dims/2*3]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta; + dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta; + } + } else if (!is_neox) { + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + // zeta scaling for xPos only: + float zeta = xpos_base != 0.0f ? 
powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; + + theta *= theta_scale; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[1]; + + dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta; + dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; + } + } else { + // TODO: this might be wrong for ne0 != n_dims - need double check + // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 0; ic < n_dims; ic += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const int64_t i0 = ib*n_dims + ic/2; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } + } + } + } + } + } +} + +static void ggml_compute_forward_rope_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + float freq_base; + float freq_scale; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + + assert(n_past >= 0); + + GGML_TENSOR_UNARY_OP_LOCALS; + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(dst); + + GGML_ASSERT(n_dims <= ne0); + GGML_ASSERT(n_dims % 2 == 0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int64_t p = ((mode & 1) == 0 ? 
n_past + i2 : i2); + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta = freq_scale * (float)p; + + if (is_glm) { + theta = MIN(p, n_ctx - 2); + float block_theta = MAX(p - (n_ctx - 2), 0); + for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + const float cos_block_theta = cosf(block_theta); + const float sin_block_theta = sinf(block_theta); + + theta *= theta_scale; + block_theta *= theta_scale; + + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + const float x2 = GGML_FP16_TO_FP32(src[n_dims]); + const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta); + dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); + } + } else if (!is_neox) { // note: "else" added to match the f32 path above; without it the non-neox rotation would also run after the GLM branch + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[1]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } + } else { + // TODO: this might be wrong for ne0 != n_dims - need double check + // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 0; ic < n_dims; ic += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const int64_t i0 = ib*n_dims + ic/2; + + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } + } + } + } + } + } +} + +static void ggml_compute_forward_rope( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_rope_f16(params, src0, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_rope_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_rope_back + +static void ggml_compute_forward_rope_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // y = rope(x, src1) + // dx = rope_back(dy, src1) 
+ // src0 is dy, src1 contains options + + float freq_base; + float freq_scale; + + // these two only relevant for xPos RoPE: + float xpos_base; + bool xpos_down; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); + + assert(n_past >= 0); + + GGML_TENSOR_UNARY_OP_LOCALS; + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + assert(nb0 == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(dst); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + const bool is_neox = mode & 2; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta = freq_scale * (float)p; + + if (!is_neox) { + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + // zeta scaling for xPos only: + float zeta = xpos_base != 0.0f ? 
powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; + + theta *= theta_scale; + + const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float dy0 = dy[0]; + const float dy1 = dy[1]; + + dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta; + dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta; + } + } else { + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 0; ic < n_dims; ic += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const int64_t i0 = ib*n_dims + ic/2; + + const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float dy0 = dy[0]; + const float dy1 = dy[n_dims/2]; + + dx[0] = dy0*cos_theta + dy1*sin_theta; + dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta; + } + } + } + } + } + } +} + +static void ggml_compute_forward_rope_back_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // y = rope(x, src1) + // dx = rope_back(dy, src1) + // src0 is dy, src1 contains options + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + + assert(n_past >= 0); + + GGML_TENSOR_UNARY_OP_LOCALS; + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + assert(nb0 == sizeof(ggml_fp16_t)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(dst); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + + const float theta_scale = powf(10000.0, -2.0f/n_dims); + + const bool is_neox = mode & 2; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int64_t p = ((mode & 1) == 0 ? 
n_past + i2 : i2); + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float theta = (float)p; + + if (!is_neox) { + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float dy0 = GGML_FP16_TO_FP32(dy[0]); + const float dy1 = GGML_FP16_TO_FP32(dy[1]); + + dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); + dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); + } + } else { + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 0; ic < n_dims; ic += 2) { + const float cos_theta = cosf(theta); + const float sin_theta = sinf(theta); + + theta *= theta_scale; + + const int64_t i0 = ib*n_dims + ic/2; + + const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float dy0 = GGML_FP16_TO_FP32(dy[0]); + const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]); + + dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); + dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); + } + } + } + } + } + } +} + +static void ggml_compute_forward_rope_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_rope_back_f16(params, src0, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_rope_back_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_conv_1d + +static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) + memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } + + // prepare source data (src1) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = 
GGML_FP32_TO_FP16(src[i10]); + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne02; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + for (int64_t i0 = 0; i0 < ne10; ++i0) { + dst_data[i0] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f16(ew0, &v, + (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0] += v; + } + } + } +} + + +static void ggml_compute_forward_conv_1d_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + + const int ew0 = nk*ne01; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + float * const wdata = (float *) params->wdata + 0; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; + + for (int64_t i0 = 0; i0 < ne0; i0++) { + for (int64_t ik = 0; ik < nk; ik++) { + const int idx0 = i0*s0 + ik*d0 - p0; + if(!(idx0 < 0 || idx0 >= ne10)) { + dst_data[i0*ew0 + i11*nk + ik] = src[idx0]; + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne02; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * const wdata = (float *) params->wdata + 0; + + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + + for (int i0 = 0; i0 < ne0; i0++) { + ggml_vec_dot_f32(ew0, dst_data + i0, + (float *) ((char *) src0->data + i1*nb02), + (float *) wdata + i2*nb2 + i0*ew0); + } + } + } +} + +static void ggml_compute_forward_conv_1d_stage_0_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + // Padding + for (int i0 = 0; i0 < ne10; i0++) { + for (int i1 = 0; i1 < ne11; i1++) { + float *output = (float *) ((char *) dst->data + (i0+15)*(dst->nb[0]) + i1 * dst->nb[1]); + float * src = (float 
*)((char *) src1->data + i0*nb10 + i1*nb11); + *output = *src; + } + } +} + +static void ggml_compute_forward_conv_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_1d_stage_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_1d_stage_1_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS; + GGML_ASSERT(nb0 == sizeof(float)); + // K, S, C + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + float *output = (float *) ((char *) dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2); + float * src = (float *)((char *) src0->data + (i0+i1)*nb00 + i2*nb01); + *output = *src; + } + } + } +} + +static void ggml_compute_forward_conv_1d_stage_2_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + + for (int i2 = 0; i2 < ne12; i2++) { // c + for (int i1 = 0; i1 < ne11; i1++) { // s + float sum = 0.0f; + for (int i0 = 0; i0 < ne10; i0++) { // k + float *src0_data_offset = (float *)((char *)src0->data + i0*nb00 + i2*nb01); + float *src1_data_offset = (float *)((char *)src1->data + i0*nb10 + i1*nb11 + i2*nb12); + sum += (*src1_data_offset) * (*src0_data_offset); + } + float *output = (float *) ((char *) dst->data + i1*(dst->nb[0]) + i2 * (dst->nb[1])); + *output = sum; + + } + } +} + +static void ggml_compute_forward_conv_1d_stage_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_stage_1_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_1d_stage_2( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_stage_2_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_conv_1d_generic + +static void 
ggml_compute_forward_conv_1d_generic_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + + // size of the convolution row - the kernel size unrolled across all input channels + const int ew0 = nk*ne01; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; + + for (int64_t i0 = 0; i0 < ne0; i0++) { + for (int64_t ik = 0; ik < nk; ik++) { + const int idx0 = i0*s0 + ik*d0 - p0; + + if(!(idx0 < 0 || idx0 >= ne10)) { + dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]); + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne2; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + + for (int i0 = 0; i0 < ne0; i0++) { + ggml_vec_dot_f16(ew0, dst_data + i0, + (ggml_fp16_t *) ((char *) src0->data + i1*nb02), + (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0); + } + } + } +} + +static void ggml_compute_forward_conv_1d_generic_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + + const int ew0 = nk*ne01; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + float * const wdata = (float *) params->wdata + 0; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; + + for (int64_t i0 = 0; i0 < ne0; i0++) { + for (int64_t ik = 0; ik < nk; ik++) { + const int idx0 = i0*s0 + ik*d0 - p0; + + if(!(idx0 < 0 || idx0 >= ne10)) { + dst_data[i0*ew0 + i11*nk + ik] = src[idx0]; + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne02; + + // rows per 
thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float * const wdata = (float *) params->wdata + 0; + + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + + for (int i0 = 0; i0 < ne0; i0++) { + ggml_vec_dot_f32(ew0, dst_data + i0, + (float *) ((char *) src0->data + i1*nb02), + (float *) wdata + i2*nb2 + i0*ew0); + } + } + } +} + +// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1 +static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, + float * A, + float * B, + float * C, + const int ith, const int nth) { + // does not seem to make a difference + int64_t m0, m1, n0, n1; + // patches per thread + if (m > n) { + n0 = 0; + n1 = n; + + // total patches in dst + const int np = m; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + m0 = dp*ith; + m1 = MIN(m0 + dp, np); + } else { + m0 = 0; + m1 = m; + + // total patches in dst + const int np = n; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + n0 = dp*ith; + n1 = MIN(n0 + dp, np); + } + + // block-tiling attempt + int64_t blck_n = 16; + int64_t blck_m = 16; + + // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB + // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K); + // if (blck_size > 0) { + // blck_0 = 4; + // blck_1 = blck_size / blck_0; + // if (blck_1 < 0) { + // blck_1 = 1; + // } + // // blck_0 = (int64_t)sqrt(blck_size); + // // blck_1 = blck_0; + // } + // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1); + + for (int j = n0; j < n1; j+=blck_n) { + for (int i = m0; i < m1; i+=blck_m) { + // printf("i j k => %d %d %d\n", i, j, K); + for (int ii = i; ii < i + blck_m && ii < m1; ii++) { + for (int jj = j; jj < j + blck_n && jj < n1; jj++) { + ggml_vec_dot_f32(k, + C + ii*n + jj, + A + ii * k, + B + jj * k); + } + } + } + } +} + +// src0: kernel [OC, IC, K] +// src1: signal [N, IC, IL] +// dst: result [N, OL, IC*K] +static void ggml_compute_forward_conv_1d_generic_stage_0_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne12; + const int64_t IC = ne11; + const int64_t IL = ne10; + + const int64_t K = ne00; + + const int64_t OL = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IL] => [N, OL, IC*K] + { + float * const wdata = (float *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t iol = 0; iol < OL; iol++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + float * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] + const float * const src_data = 
(float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] + + for (int64_t ik = 0; ik < K; ik++) { + const int64_t iil = iol*s0 + ik*d0 - p0; + + if (!(iil < 0 || iil >= IL)) { + dst_data[iic*K + ik] = src_data[iil]; + } + } + } + } + } + } +} + +// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] +// src0: [OC, IC, K] +// src1: [N, OL, IC * K] +// result: [N, OC, OL] +static void ggml_compute_forward_conv_1d_generic_stage_1_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne12; + const int OL = ne11; + + const int OC = ne02; + const int IC = ne01; + const int K = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OL; + int64_t k = IC * K; + + // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] + for (int i = 0; i < N; i++) { + float * A = (float *)src0->data; // [m, k] + float * B = (float *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + +static void ggml_compute_forward_conv_1d_generic( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_generic_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_1d_generic_stage_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_generic_stage_0_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_1d_generic_stage_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_generic_stage_1_f16(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_1d_s1_ph_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) + 
memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) + { + float * const wdata = (float *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i02*ew0*ne00; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } + + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + ne02*ew0*ne00; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = src[i10]; + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne02; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + for (int64_t i0 = 0; i0 < ne10; ++i0) { + dst_data[i0] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f32(ew0, &v, + (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0] += v; + } + } + } +} + +static void ggml_compute_forward_conv_1d_s1_ph( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) + memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } + + // prepare source data (src1) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; + for (int64_t i10 = 0; 
i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne02; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + for (int64_t i0 = 0; i0 < ne10; i0 += 2) { + dst_data[i0/2] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f16(ew0, &v, + (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0/2] += v; + } + } + } +} + +static void ggml_compute_forward_conv_1d_s2_ph_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) + memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) + { + float * const wdata = (float *) params->wdata + 0; + + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i02*ew0*ne00; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } + + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + ne02*ew0*ne00; + + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = src[i10]; + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne02; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + for (int64_t i0 = 0; i0 < ne10; i0 += 2) { + dst_data[i0/2] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f32(ew0, &v, + (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0/2] += v; + } + } + } +} + +static void ggml_compute_forward_conv_1d_s2_ph( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst); + } break; + default: + { + 
GGML_ASSERT(false); + } break; + } +} + + +// ggml_compute_forward_conv_2d + +static void ggml_compute_forward_conv_2d_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk0 = ne00; + const int nk1 = ne01; + + // size of the convolution row - the kernel size unrolled across all channels + const int ew0 = nk0*nk1*ne02; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + // prepare source data (src1) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int i12 = 0; i12 < ne12; i12++) { + const float * const src = (float *)((char *) src1->data + i12*nb12); + ggml_fp16_t * dst_data = wdata; + + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + for (int ik1 = 0; ik1 < nk1; ik1++) { + for (int ik0 = 0; ik0 < nk0; ik0++) { + const int idx0 = i0*s0 + ik0*d0 - p0; + const int idx1 = i1*s1 + ik1*d1 - p1; + + if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { + dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = + GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + } + } + } + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ip0; i2 < ip1; i2++) { + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2); + + for (int i1 = 0; i1 < ne1; ++i1) { + for (int i0 = 0; i0 < ne0; ++i0) { + ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, + (ggml_fp16_t *) ((char *) src0->data + i2*nb03), + (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0); + } + } + } + } +} + +static void ggml_compute_forward_conv_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_conv_transpose_2d + +static void ggml_compute_forward_conv_transpose_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == 
GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02*ne03; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); + ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03; + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00]; + } + } + } + } + } + + // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; + for (int i12 = 0; i12 < ne12; i12++) { + for (int i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); + ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; + for (int i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]); + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + const int32_t stride = ggml_get_op_params_i32(dst, 0); + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata_src = wdata + nk; + + for (int i2 = ip0; i2 < ip1; i2++) { // Cout + float * dst_data = (float *)((char *) dst->data + i2*nb2); + ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; + for (int i11 = 0; i11 < ne11; i11++) { + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i11*ne10*ne12 + i10*ne12; + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f16(ne03, &v, + wdata_src + i1n, + wdata_kernel + i01*ne00*ne03 + i00*ne03); + dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; + } + } + } + } + } +} + +// ggml_compute_forward_pool_1d_sk_p0 + +static void ggml_compute_forward_pool_1d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const char * cdata = (const char *)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + float * drow = (float *)dst->data; + + const int64_t rs = dst->ne[0]; + + while (cdata < data_end) { + const float * const srow = (const float *)cdata; + + int j = 0; + + for (int64_t i = 0; i < rs; ++i) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] = 0; break; + case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + for (int ki = 0; ki < k; ++ki) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; + case GGML_OP_POOL_COUNT: 
GGML_ASSERT(false); break; + } + ++j; + } + switch (op) { + case GGML_OP_POOL_AVG: drow[i] /= k; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + + cdata += src->nb[1]; + drow += rs; + } +} + +// ggml_compute_forward_pool_1d + +static void ggml_compute_forward_pool_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int s0 = opts[2]; + const int p0 = opts[3]; + GGML_ASSERT(p0 == 0); // padding not supported + GGML_ASSERT(k0 == s0); // only s = k supported + + ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); +} + +// ggml_compute_forward_pool_2d_sk_p0 + +static void ggml_compute_forward_pool_2d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k0, + const int k1, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const char * cdata = (const char*)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + + const int64_t px = dst->ne[0]; + const int64_t py = dst->ne[1]; + const int64_t pa = px * py; + + float * dplane = (float *)dst->data; + + const int ka = k0 * k1; + + while (cdata < data_end) { + for (int oy = 0; oy < py; ++oy) { + float * const drow = dplane + oy * px; + for (int ox = 0; ox < px; ++ox) { + float * const out = drow + ox; + switch (op) { + case GGML_OP_POOL_AVG: *out = 0; break; + case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + + const int ix = ox * k0; + const int iy = oy * k1; + + for (int ky = 0; ky < k1; ++ky) { + const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + for (int kx = 0; kx < k0; ++kx) { + int j = ix + kx; + switch (op) { + case GGML_OP_POOL_AVG: *out += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + } + switch (op) { + case GGML_OP_POOL_AVG: *out /= ka; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + } + + cdata += src->nb[2]; + dplane += pa; + } +} + +// ggml_compute_forward_pool_2d + +static void ggml_compute_forward_pool_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + GGML_ASSERT(p0 == 0); + GGML_ASSERT(p1 == 0); // padding not supported + GGML_ASSERT(k0 == s0); + GGML_ASSERT(k1 == s1); // only s = k supported + + ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst); +} + +// ggml_compute_forward_upscale + +static void ggml_compute_forward_upscale_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + + GGML_TENSOR_UNARY_OP_LOCALS; + + const int scale_factor = dst->op_params[0]; + 
+ // TODO: optimize + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = ith; i02 < ne02; i02++) { + for (int m = 0; m < dst->ne[1]; m++) { + int i01 = m / scale_factor; + for (int n = 0; n < dst->ne[0]; n++) { + int i00 = n / scale_factor; + + const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03); + + float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]); + + *y = *x; + } + } + } + } +} + +static void ggml_compute_forward_upscale( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_upscale_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_flash_attn + +static void ggml_compute_forward_flash_attn_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(float)); + GGML_ASSERT(nbv0 == sizeof(float)); + printf("%d %d %d\n", neq0, nek0, nev1); + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int64_t i = P; i < 
M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + { + float max = -INFINITY; + ggml_vec_max_f32(M, &max, S); + + ggml_float sum = 0.0; + { +#ifdef GGML_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(S, 1, &max, S, 1, Mup); + vvexpf(S, S, &Mup); + ggml_vec_sum_f32(Mup, &sum, S); +#else + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + float * SS = S + i; + + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (SS[j] == -INFINITY) { + SS[j] = 0.0f; + } else { +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SS[j] - max); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif + sump[j] += (ggml_float)val; + SS[j] = val; + } + } + } + + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(M, S, sum); + +#ifndef NDEBUG + for (int i = 0; i < M; ++i) { + assert(!isnan(S[i])); + assert(!isinf(S[i])); + } +#endif + } + + for (int64_t ic = 0; ic < nev1; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_dot_f32(nek1, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + S); + } + } +} + +static void ggml_compute_forward_flash_attn_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir 
- iq3*neq2*neq1 - iq2*neq1); + + float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) { + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f16(neq0, + S + i1, + (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + } else { + for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f16_unroll(neq0, nbk1, + S + i1, + ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + } + + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + { + float max = -INFINITY; + ggml_vec_max_f32(M, &max, S); + + ggml_float sum = 0.0; + { +#ifdef GGML_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(S, 1, &max, S, 1, Mup); + vvexpf(S, S, &Mup); + ggml_vec_sum_f32(Mup, &sum, S); +#else + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + float * SS = S + i; + + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (SS[j] == -INFINITY) { + SS[j] = 0.0f; + } else { + ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + sump[j] += (ggml_float)val; + SS[j] = val; + } + } + } + + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(M, S, sum); + +#ifndef NDEBUG + for (int i = 0; i < M; ++i) { + assert(!isnan(S[i])); + assert(!isinf(S[i])); + } +#endif + } + + ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup); + + for (int64_t i = 0; i < M; i++) { + S16[i] = GGML_FP32_TO_FP16(S[i]); + } + + if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) { + for (int64_t ic = 0; ic < nev1; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_dot_f16(nek1, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + S16); + } + } else { + for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_dot_f16_unroll(nek1, nbv1, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + S16); + } + } + } +} + +static void ggml_compute_forward_flash_attn( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const bool masked, + struct ggml_tensor * dst) { + switch (q->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); + } break; + default: + { + 
GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_flash_ff + +static void ggml_compute_forward_flash_ff_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, // F16 + const struct ggml_tensor * b0, // F16 fc_w + const struct ggml_tensor * b1, // F32 fc_b + const struct ggml_tensor * c0, // F16 proj_w + const struct ggml_tensor * c1, // F32 proj_b + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_LOCALS(int64_t, nea, a, ne); + GGML_TENSOR_LOCALS(size_t, nba, a, nb); + GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne); + GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb); + GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne); + GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb); + GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne); + GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb); + GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne); + GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = nea0; + //const int64_t N = nea1; + const int64_t M = neb01; + + GGML_ASSERT(ne0 == nea0); + GGML_ASSERT(ne1 == nea1); + GGML_ASSERT(ne2 == nea2); + + GGML_ASSERT(nba0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbb10 == sizeof(float)); + GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbc10 == sizeof(float)); + + GGML_ASSERT(neb00 == D); + GGML_ASSERT(neb01 == M); + GGML_ASSERT(neb10 == M); + GGML_ASSERT(neb11 == 1); + + GGML_ASSERT(nec00 == M); + GGML_ASSERT(nec01 == D); + GGML_ASSERT(nec10 == D); + GGML_ASSERT(nec11 == 1); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by a rows using ggml_vec_dot_f32 + + // total rows in a + const int nr = nea1*nea2*nea3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // a indices + const int ia3 = ir/(nea2*nea1); + const int ia2 = (ir - ia3*nea2*nea1)/nea1; + const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1); + + float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32); + + for (int64_t ic = 0; ic < neb01; ++ic) { + // b0 indices + const int ib03 = ia3; + const int ib02 = ia2; + const int ib01 = ic; + + // S indices + const int i1 = ib01; + + ggml_vec_dot_f16(nea0, + S + i1, + (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), + (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); + } + + ggml_vec_add_f32(neb01, S, S, (float *) b1->data); + //ggml_vec_gelu_f32(neb01, S, S); + + ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); + + for (int64_t i = 0; i < M; i++) { + S16[i] = GGML_FP32_TO_FP16(S[i]); + } + + ggml_vec_gelu_f16(neb01, S16, S16); + + { + // dst indices + const int i1 = ia1; + const int i2 = ia2; + const int i3 = ia3; + + for (int64_t ic = 0; ic < nec01; ++ic) { + + ggml_vec_dot_f16(neb01, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), + S16); + } + + ggml_vec_add_f32(nec01, + (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 
+ i3*nb3)), + (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), + (float *) c1->data); + } + } +} + +static void ggml_compute_forward_flash_ff( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b0, + const struct ggml_tensor * b1, + const struct ggml_tensor * c0, + const struct ggml_tensor * c1, + struct ggml_tensor * dst) { + switch (b0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); // TODO + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_flash_attn_back + +static void ggml_compute_forward_flash_attn_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * d, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ned, d, ne); + GGML_TENSOR_LOCALS(size_t, nbd, d, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + const int mxDM = MAX(D, Mup); + + // GGML_ASSERT(ne0 == D); + // GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(float)); + GGML_ASSERT(nbv0 == sizeof(float)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned0 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned1 == N); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + if (ith == 0) { + memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); + } + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2); + const int iq2 = ir - iq3*neq2; + for ( int iq1 = 0; iq1 < neq1; ++iq1) { + + + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? 
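+            // per-thread scratch in wdata: two float buffers of
+            // (mxDM + CACHE_LINE_SIZE_F32) elements each - S holds the scores /
+            // gradients, SM holds the softmax result; the extra CACHE_LINE_SIZE_F32
+            // is presumably padding to keep adjacent threads off the same cache line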
+ float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + { + float max = -INFINITY; + ggml_vec_max_f32(M, &max, S); + + ggml_float sum = 0.0; + { +#ifdef GGML_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + ggml_vec_sum_f32(Mup, &sum, SM); +#else + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + float * SR = S + i; + float * SW = SM + i; + + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (SR[j] == -INFINITY) { + SW[j] = 0.0f; + } else { +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SR[j] - max); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif + sump[j] += (ggml_float)val; + SW[j] = val; + } + } + } + + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(M, SM, sum); + + } + + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for iq2,iq3: + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,iq2,iq3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iq2,iq3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,iq2,iq3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i], S4) + // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] + // ~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 + // + // in post-order: + // + // S1 = qcur @ kcur.T + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = 
diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = grad[S1].T @ qcur + // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur @ kcur.T * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + } + + // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur + // S = d[:D,iq1,iq2,iq3] @ vcur + // S[:M] += vcur[:M,ic] * d[ic,iq1,iq2,iq3] + ggml_vec_set_f32(M, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_mad_f32(M, + S, + (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); + } + + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); + ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_vec_mul_f32 (M, S, S, SM); + + // S = diag_mask_zero(S, P) * scale + if (masked) { + // for (int64_t i = P + iq1 + 1; i < M; i++) { + // S[i] = 0; + // } + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = 0; + } + } + } + ggml_vec_scale_f32(M, S, scale); + + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; + void * grad_v = (char *) dst->data + nb0*D*N*neq2*neq3 + nb0*D*M*neq2*neq3; + + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; + + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; + + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; + + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic] + // + //// grad[q][ic,iq1,iq2,iq3] += dot(kcur[:,ic],S.T) + //// grad[q][ic,iq1,iq2,iq3] += dot(k[:D,ic,iq2,iq3],S.T) + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_mad_f32(D, + (float *) ((char *) grad_q + (i1*nbgq1 + i2*nbgq2 + i3*nbgq3)), + (float *) ((char *) k->data + (ic*nbk1 + i2*nbk2 + i3*nbk3)), + S[ic]); + } + + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // ggml_vec_set_f32(D, + // (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + // 0); + ggml_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + (float *) ((char *) q->data + (i1*nbq1 + i2*nbq2 + i3*nbq3)), + S[ic]); + } + + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + // grad[v][:M,ic,iq2,iq3] += d[:D,iq1,iq2,iq3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iq2,iq3] += d[ic,iq1,iq2,iq3] * SM[:M] + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // ggml_vec_set_f32(M, + // (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + 
i3*nbgv3)), + // 0); + ggml_vec_mad_f32(M, + (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); + } + } + } +} + +static void ggml_compute_forward_flash_attn_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * d, + const bool masked, + struct ggml_tensor * dst) { + switch (q->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_win_part + +static void ggml_compute_forward_win_part_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + + const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t w = ((const int32_t *)(dst->op_params))[2]; + + assert(ne00 == ne0); + assert(ne3 == nep0*nep1); + + // TODO: optimize / multi-thread + for (int py = 0; py < nep1; ++py) { + for (int px = 0; px < nep0; ++px) { + const int64_t i3 = py*nep0 + px; + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i02 = py*w + i2; + const int64_t i01 = px*w + i1; + const int64_t i00 = i0; + + const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0; + const int64_t j = i02*ne01*ne00 + i01*ne00 + i00; + + if (py*w + i2 >= ne02 || px*w + i1 >= ne01) { + ((float *) dst->data)[i] = 0.0f; + } else { + ((float *) dst->data)[i] = ((float *) src0->data)[j]; + } + } + } + } + } + } +} + +static void ggml_compute_forward_win_part( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_win_part_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_win_unpart + +static void ggml_compute_forward_win_unpart_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + + const int32_t w = ((const int32_t *)(dst->op_params))[0]; + + // padding + const int px = (w - ne1%w)%w; + //const int py = (w - ne2%w)%w; + + const int npx = (px + ne1)/w; + //const int npy = (py + ne2)/w; + + assert(ne0 == ne00); + + // TODO: optimize / multi-thread + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int ip2 = i2/w; + const int ip1 = i1/w; + + const int64_t i02 = i2%w; + const int64_t i01 = i1%w; + const int64_t i00 = i0; + + const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00; + const int64_t j = i2*ne1*ne0 + i1*ne0 + i0; + + ((float *) dst->data)[j] = ((float *) src0->data)[i]; + } + } + } +} + +static void ggml_compute_forward_win_unpart( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * 
dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_win_unpart_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +//gmml_compute_forward_unary + +static void ggml_compute_forward_unary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + const enum ggml_unary_op op = ggml_get_unary_op(dst); + + switch (op) { + case GGML_UNARY_OP_ABS: + { + ggml_compute_forward_abs(params, src0, dst); + } break; + case GGML_UNARY_OP_SGN: + { + ggml_compute_forward_sgn(params, src0, dst); + } break; + case GGML_UNARY_OP_NEG: + { + ggml_compute_forward_neg(params, src0, dst); + } break; + case GGML_UNARY_OP_STEP: + { + ggml_compute_forward_step(params, src0, dst); + } break; + case GGML_UNARY_OP_TANH: + { + ggml_compute_forward_tanh(params, src0, dst); + } break; + case GGML_UNARY_OP_ELU: + { + ggml_compute_forward_elu(params, src0, dst); + } break; + case GGML_UNARY_OP_RELU: + { + ggml_compute_forward_relu(params, src0, dst); + } break; + case GGML_UNARY_OP_GELU: + { + ggml_compute_forward_gelu(params, src0, dst); + } break; + case GGML_UNARY_OP_GELU_QUICK: + { + ggml_compute_forward_gelu_quick(params, src0, dst); + } break; + case GGML_UNARY_OP_SILU: + { + ggml_compute_forward_silu(params, src0, dst); + } break; + case GGML_UNARY_OP_GLU: + { + ggml_compute_forward_glu(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_get_rel_pos + +static void ggml_compute_forward_get_rel_pos_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 + + GGML_TENSOR_UNARY_OP_LOCALS; + + const int64_t w = ne1; + + ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data; + ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data; + + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + const int64_t pos = (w - i1 - 1) + i2; + for (int64_t i0 = 0; i0 < ne0; ++i0) { + dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0]; + } + } + } +} + +static void ggml_compute_forward_get_rel_pos( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rel_pos_f16(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_add_rel_pos + +static void ggml_compute_forward_add_rel_pos_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + + const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; + if (!inplace && params->type == GGML_TASK_INIT) { + memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); + return; + } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 + + float * src1_data = (float *) src1->data; + float * src2_data = (float *) src2->data; + float * dst_data = (float *) dst->data; + + const int64_t 
ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int ith = params->ith; + const int nth = params->nth; + + // total patches in dst + const int np = ne13; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + + for (int64_t i13 = ip0; i13 < ip1; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10; + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t jp0 = jp1 + i10; + const float src1_e = src1_data[jp0]; + const float src2_e = src2_data[jp0]; + + const int64_t jdh = jp0 * ne10; + const int64_t jdw = jdh - (ne10 - 1) * i10; + + for (int64_t j = 0; j < ne10; ++j) { + dst_data[jdh + j ] += src2_e; + dst_data[jdw + j*ne10] += src1_e; + } + } + } + } + } +} + +static void ggml_compute_forward_add_rel_pos( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_map_unary + +static void ggml_compute_forward_map_unary_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + + +static void ggml_compute_forward_map_unary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_map_unary_f32(params, src0, dst, fun); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_map_binary + +static void ggml_compute_forward_map_binary_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + const ggml_binary_op_f32_t fun) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + assert(src1->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), + (float *) ((char *) src1->data + i*(src1->nb[1]))); + } +} + + +static void ggml_compute_forward_map_binary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + const ggml_binary_op_f32_t fun) { + switch 
(src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_map_custom1 + +static void ggml_compute_forward_map_custom1_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst, + const ggml_custom1_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + fun(dst, a); +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + struct ggml_tensor * dst, + const ggml_custom2_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + fun(dst, a, b); +} + + +// ggml_compute_forward_map_custom3 + +static void ggml_compute_forward_map_custom3_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + const struct ggml_tensor * c, + struct ggml_tensor * dst, + const ggml_custom3_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + fun(dst, a, b, c); +} + +// ggml_compute_forward_map_custom1 + +static void ggml_compute_forward_map_custom1( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params; + + p->fun(dst, a, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params; + + p->fun(dst, a, b, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom3 + +static void ggml_compute_forward_map_custom3( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + const struct ggml_tensor * c, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params; + + p->fun(dst, a, b, c, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_cross_entropy_loss + +static void ggml_compute_forward_cross_entropy_loss_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + + const int ith = params->ith; + const int nth = params->nth; + + float * sums = (float *) params->wdata; + + // TODO: handle transposed/permuted matrices + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(params->wsize >= sizeof(float) * (nth + 
nth * nc)); + + if (params->type == GGML_TASK_INIT) { + if (ith == 0) { + memset(sums, 0, sizeof(float) * (nth + nth * nc)); + } + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + if (ith == 0) { + float * dp = (float *) dst->data; + ggml_vec_sum_f32(nth, dp, sums); + dp[0] *= -1.0f / (float) nr; + } + return; + } + + const double eps = 1e-9; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + float * st = ((float *) params->wdata) + nth + ith*nc; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + // soft_max + ggml_float sum = 0.0; + { + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + + uint16_t scvt; UNUSED(scvt); + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + st[i] = 0.0f; + } else { +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif + sum += (ggml_float)val; + st[i] = val; + } + } + + assert(sum > 0.0); + // sum = 1.0/sum; + } + // avoid log(0) by rescaling from [0..1] to [eps..1] + sum = (1.0 - eps) / sum; + ggml_vec_scale_f32(nc, st, sum); + ggml_vec_add1_f32(nc, st, st, eps); + ggml_vec_log_f32(nc, st, st); + ggml_vec_mul_f32(nc, st, st, s1); + + float st_sum = 0; + ggml_vec_sum_f32(nc, &st_sum, st); + sums[ith] += st_sum; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(st[i])); + assert(!isinf(st[i])); + } +#endif + } + +} + +static void ggml_compute_forward_cross_entropy_loss( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_cross_entropy_loss_back + +static void ggml_compute_forward_cross_entropy_loss_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int64_t ith = params->ith; + const int64_t nth = params->nth; + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const double eps = 1e-9; + + // TODO: handle transposed/permuted matrices + const int64_t nc = src0->ne[0]; + const int64_t nr = ggml_nrows(src0); + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + float * d = (float *) opt0->data; + + for (int64_t i1 = ir0; i1 < ir1; i1++) { + float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) 
src1->data + i1*src1->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + + // soft_max + ggml_float sum = 0.0; + { + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + + uint16_t scvt; UNUSED(scvt); + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + ds0[i] = 0.0f; + } else { +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif + sum += (ggml_float)val; + ds0[i] = val; + } + } + + assert(sum > 0.0); + sum = (1.0 - eps)/sum; + } + + // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr + ggml_vec_scale_f32(nc, ds0, sum); + ggml_vec_add1_f32(nc, ds0, ds0, eps); + ggml_vec_sub_f32(nc, ds0, ds0, s1); + ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); + + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(ds0[i])); + assert(!isinf(ds0[i])); + } +#endif + } +} + +static void ggml_compute_forward_cross_entropy_loss_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + +///////////////////////////////// + +static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + GGML_ASSERT(params); + +#ifdef GGML_USE_CUBLAS + bool skip_cpu = ggml_cuda_compute_forward(params, tensor); + if (skip_cpu) { + return; + } + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); +#endif // GGML_USE_CUBLAS + + switch (tensor->op) { + case GGML_OP_DUP: + { + ggml_compute_forward_dup(params, tensor->src[0], tensor); + } break; + case GGML_OP_ADD: + { + ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_ADD1: + { + ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_ACC: + { + ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_SUB: + { + ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_MUL: + { + ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_DIV: + { + ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_SQR: + { + ggml_compute_forward_sqr(params, tensor->src[0], tensor); + } break; + case GGML_OP_SQRT: + { + ggml_compute_forward_sqrt(params, tensor->src[0], tensor); + } break; + case GGML_OP_LOG: + { + ggml_compute_forward_log(params, tensor->src[0], tensor); + } break; + case GGML_OP_SUM: + { + ggml_compute_forward_sum(params, tensor->src[0], tensor); + } break; + case GGML_OP_SUM_ROWS: + { + ggml_compute_forward_sum_rows(params, tensor->src[0], tensor); + } break; + case GGML_OP_MEAN: + { + ggml_compute_forward_mean(params, tensor->src[0], tensor); + } break; + case GGML_OP_ARGMAX: + { + ggml_compute_forward_argmax(params, tensor->src[0], tensor); + } break; + case 
GGML_OP_REPEAT: + { + ggml_compute_forward_repeat(params, tensor->src[0], tensor); + } break; + case GGML_OP_REPEAT_BACK: + { + ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); + } break; + case GGML_OP_CONCAT: + { + ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_SILU_BACK: + { + ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_NORM: + { + ggml_compute_forward_norm(params, tensor->src[0], tensor); + } break; + case GGML_OP_BATCH_NORM: + { + ggml_compute_forward_batch_norm(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); + } break; + case GGML_OP_RMS_NORM: + { + ggml_compute_forward_rms_norm(params, tensor->src[0], tensor); + } break; + case GGML_OP_RMS_NORM_BACK: + { + ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_GROUP_NORM: + { + ggml_compute_forward_group_norm(params, tensor->src[0], tensor); + } break; + case GGML_OP_MUL_MAT: + { + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_OUT_PROD: + { + ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_SCALE: + { + ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_SET: + { + ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CPY: + { + ggml_compute_forward_cpy(params, tensor->src[0], tensor); + } break; + case GGML_OP_CONT: + { + ggml_compute_forward_cont(params, tensor->src[0], tensor); + } break; + case GGML_OP_RESHAPE: + { + ggml_compute_forward_reshape(params, tensor->src[0], tensor); + } break; + case GGML_OP_VIEW: + { + ggml_compute_forward_view(params, tensor->src[0]); + } break; + case GGML_OP_PERMUTE: + { + ggml_compute_forward_permute(params, tensor->src[0]); + } break; + case GGML_OP_TRANSPOSE: + { + ggml_compute_forward_transpose(params, tensor->src[0]); + } break; + case GGML_OP_GET_ROWS: + { + ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_GET_ROWS_BACK: + { + ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } break; + case GGML_OP_DIAG: + { + ggml_compute_forward_diag(params, tensor->src[0], tensor); + } break; + case GGML_OP_DIAG_MASK_INF: + { + ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor); + } break; + case GGML_OP_DIAG_MASK_ZERO: + { + ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor); + } break; + case GGML_OP_SOFT_MAX: + { + ggml_compute_forward_soft_max(params, tensor->src[0], tensor); + } break; + case GGML_OP_SOFT_MAX_BACK: + { + ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_ROPE: + { + ggml_compute_forward_rope(params, tensor->src[0], tensor); + } break; + case GGML_OP_ROPE_BACK: + { + ggml_compute_forward_rope_back(params, tensor->src[0], tensor); + } break; + case GGML_OP_ALIBI: + { + ggml_compute_forward_alibi(params, tensor->src[0], tensor); + } break; + case GGML_OP_CLAMP: + { + ggml_compute_forward_clamp(params, tensor->src[0], tensor); + } break; + case GGML_OP_CONV_1D: + { + ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_1D_STAGE_0: + { + ggml_compute_forward_conv_1d_stage_0(params, 
tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor); + } break; + case GGML_OP_CONV_1D_STAGE_2: + { + ggml_compute_forward_conv_1d_stage_2(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_1D_GENERIC: + { + ggml_compute_forward_conv_1d_generic(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_1D_GENERIC_STAGE_0: + { + ggml_compute_forward_conv_1d_generic_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_1D_GENERIC_STAGE_1: + { + ggml_compute_forward_conv_1d_generic_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_2D: + { + ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_POOL_1D: + { + ggml_compute_forward_pool_1d(params, tensor->src[0], tensor); + } break; + case GGML_OP_POOL_2D: + { + ggml_compute_forward_pool_2d(params, tensor->src[0], tensor); + } break; + case GGML_OP_UPSCALE: + { + ggml_compute_forward_upscale(params, tensor->src[0], tensor); + } break; + case GGML_OP_FLASH_ATTN: + { + const int32_t t = ggml_get_op_params_i32(tensor, 0); + GGML_ASSERT(t == 0 || t == 1); + const bool masked = t != 0; + ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); + } break; + case GGML_OP_FLASH_FF: + { + ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + int32_t t = ggml_get_op_params_i32(tensor, 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); + } break; + case GGML_OP_WIN_PART: + { + ggml_compute_forward_win_part(params, tensor->src[0], tensor); + } break; + case GGML_OP_WIN_UNPART: + { + ggml_compute_forward_win_unpart(params, tensor->src[0], tensor); + } break; + case GGML_OP_UNARY: + { + ggml_compute_forward_unary(params, tensor->src[0], tensor); + } break; + case GGML_OP_GET_REL_POS: + { + ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor); + } break; + case GGML_OP_ADD_REL_POS: + { + ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } break; + case GGML_OP_MAP_UNARY: + { + ggml_unary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); + } + break; + case GGML_OP_MAP_BINARY: + { + ggml_binary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM1_F32: + { + ggml_custom1_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM2_F32: + { + ggml_custom2_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM3_F32: + { + ggml_custom3_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + 
ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM1: + { + ggml_compute_forward_map_custom1(params, tensor->src[0], tensor); + } + break; + case GGML_OP_MAP_CUSTOM2: + { + ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); + } + break; + case GGML_OP_MAP_CUSTOM3: + { + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } + break; + case GGML_OP_NONE: + { + // nop + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { + struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; + + switch (tensor->op) { + case GGML_OP_DUP: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + } break; + case GGML_OP_ADD: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + if (src1->grad) { + src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace); + } + } break; + case GGML_OP_ADD1: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + if (src1->grad) { + src1->grad = ggml_add_impl(ctx, + src1->grad, + ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean + inplace); + } + } break; + case GGML_OP_ACC: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + if (src1->grad) { + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; + + struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx, + tensor->grad, + src1->grad->ne[0], + src1->grad->ne[1], + src1->grad->ne[2], + src1->grad->ne[3], + nb1, nb2, nb3, offset); + + src1->grad = + ggml_add_impl(ctx, + src1->grad, + ggml_reshape(ctx, + ggml_cont(ctx, tensor_grad_view), + src1->grad), + inplace); + } + } break; + case GGML_OP_SUB: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + if (src1->grad) { + src1->grad = ggml_sub_impl(ctx, src1->grad, tensor->grad, inplace); + } + } break; + case GGML_OP_MUL: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_mul(ctx, src1, tensor->grad), + inplace); + } + if (src1->grad) { + src1->grad = + ggml_add_impl(ctx, + src1->grad, + ggml_mul(ctx, src0, tensor->grad), + inplace); + } + } break; + case GGML_OP_DIV: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_div(ctx, tensor->grad, src1), + inplace); + } + if (src1->grad) { + src1->grad = + ggml_sub_impl(ctx, + src1->grad, + ggml_mul(ctx, + tensor->grad, + ggml_div(ctx, tensor, src1)), + inplace); + } + } break; + case GGML_OP_SQR: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_scale(ctx, + ggml_mul(ctx, src0, tensor->grad), 
+ ggml_new_f32(ctx, 2.0f)), + inplace); + } + } break; + case GGML_OP_SQRT: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_scale(ctx, + ggml_div(ctx, + tensor->grad, + tensor), + ggml_new_f32(ctx, 0.5f)), + inplace); + } + } break; + case GGML_OP_LOG: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_div(ctx, + tensor->grad, + src0), + inplace); + } + } break; + case GGML_OP_SUM: + { + if (src0->grad) { + src0->grad = + ggml_add1_impl(ctx, + src0->grad, + tensor->grad, + inplace); + } + } break; + case GGML_OP_SUM_ROWS: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_repeat(ctx, + tensor->grad, + src0->grad), + inplace); + } + } break; + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + { + GGML_ASSERT(false); // TODO: implement + } break; + case GGML_OP_REPEAT: + { + // necessary for llama + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_repeat_back(ctx, tensor->grad, src0->grad), + inplace); + } + } break; + case GGML_OP_REPEAT_BACK: + { + if (src0->grad) { + // TODO: test this + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_repeat(ctx, tensor->grad, src0->grad), + inplace); + } + } break; + case GGML_OP_CONCAT: + { + GGML_ASSERT(false); // TODO: implement + } break; + case GGML_OP_SILU_BACK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_NORM: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_BATCH_NORM: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_RMS_NORM: + { + // necessary for llama + if (src0->grad) { + float eps; + memcpy(&eps, tensor->op_params, sizeof(float)); + + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_rms_norm_back(ctx, src0, tensor->grad, eps), + inplace); + } + } break; + case GGML_OP_RMS_NORM_BACK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_GROUP_NORM: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_MUL_MAT: + { + // https://cs231n.github.io/optimization-2/#staged + // # forward pass + // s0 = np.random.randn(5, 10) + // s1 = np.random.randn(10, 3) + // t = s0.dot(s1) + + // # now suppose we had the gradient on t from above in the circuit + // dt = np.random.randn(*t.shape) # same shape as t + // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix + // ds1 = t.T.dot(dt) + + // tensor.shape [m,p] + // src0.shape [n,m] + // src1.shape [n,p] + + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_out_prod(ctx, // [n,m] + src1, // [n,p] + tensor->grad), // [m,p] + inplace); + } + if (src1->grad) { + src1->grad = + ggml_add_impl(ctx, + src1->grad, + // ggml_mul_mat(ctx, // [n,p] + // ggml_cont(ctx, // [m,n] + // ggml_transpose(ctx, src0)), // [m,n] + // tensor->grad), // [m,p] + + // // when src0 is bigger than tensor->grad (this is mostly the case in llama), + // // avoid transpose of src0, rather transpose smaller tensor->grad + // // and then use ggml_out_prod + ggml_out_prod(ctx, // [n,p] + src0, // [n,m] + ggml_transpose(ctx, // [p,m] + tensor->grad)), // [m,p] + inplace); + } + } break; + case GGML_OP_OUT_PROD: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_SCALE: + { + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_scale_impl(ctx, tensor->grad, src1, false), + inplace); + } + if (src1->grad) { + src1->grad = + ggml_add_impl(ctx, + src1->grad, + 
ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)), + inplace); + } + } break; + case GGML_OP_SET: + { + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; + + struct ggml_tensor * tensor_grad_view = NULL; + + if (src0->grad || src1->grad) { + GGML_ASSERT(src0->type == tensor->type); + GGML_ASSERT(tensor->grad->type == tensor->type); + GGML_ASSERT(tensor->grad->type == src1->grad->type); + + tensor_grad_view = ggml_view_4d(ctx, + tensor->grad, + src1->grad->ne[0], + src1->grad->ne[1], + src1->grad->ne[2], + src1->grad->ne[3], + nb1, nb2, nb3, offset); + } + + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_acc_impl(ctx, + tensor->grad, + ggml_neg(ctx, tensor_grad_view), + nb1, nb2, nb3, offset, false), + inplace); + } + + if (src1->grad) { + src1->grad = + ggml_add_impl(ctx, + src1->grad, + ggml_reshape(ctx, + ggml_cont(ctx, tensor_grad_view), + src1->grad), + inplace); + } + } break; + case GGML_OP_CPY: + { + // necessary for llama + // cpy overwrites value of src1 by src0 and returns view(src1) + // the overwriting is mathematically equivalent to: + // tensor = src0 * 1 + src1 * 0 + if (src0->grad) { + // dsrc0 = dtensor * 1 + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + if (src1->grad) { + // dsrc1 = dtensor * 0 -> noop + } + } break; + case GGML_OP_CONT: + { + // same as cpy + if (src0->grad) { + GGML_ASSERT(ggml_is_contiguous(src0->grad)); + GGML_ASSERT(ggml_is_contiguous(tensor->grad)); + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + } break; + case GGML_OP_RESHAPE: + { + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_reshape(ctx, tensor->grad, src0->grad), + inplace); + } + } break; + case GGML_OP_VIEW: + { + // necessary for llama + if (src0->grad) { + size_t offset; + + memcpy(&offset, tensor->op_params, sizeof(offset)); + + size_t nb1 = tensor->nb[1]; + size_t nb2 = tensor->nb[2]; + size_t nb3 = tensor->nb[3]; + + if (src0->type != src0->grad->type) { + // gradient is typically F32, but src0 could be other type + size_t ng = ggml_element_size(src0->grad); + size_t n0 = ggml_element_size(src0); + GGML_ASSERT(offset % n0 == 0); + GGML_ASSERT(nb1 % n0 == 0); + GGML_ASSERT(nb2 % n0 == 0); + GGML_ASSERT(nb3 % n0 == 0); + offset = (offset / n0) * ng; + nb1 = (nb1 / n0) * ng; + nb2 = (nb2 / n0) * ng; + nb3 = (nb3 / n0) * ng; + } + + src0->grad = ggml_acc_impl(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, inplace); + } + } break; + case GGML_OP_PERMUTE: + { + // necessary for llama + if (src0->grad) { + int32_t * axes = (int32_t *) tensor->op_params; + int axis0 = axes[0] & 0x3; + int axis1 = axes[1] & 0x3; + int axis2 = axes[2] & 0x3; + int axis3 = axes[3] & 0x3; + int axes_backward[4] = {0,0,0,0}; + axes_backward[axis0] = 0; + axes_backward[axis1] = 1; + axes_backward[axis2] = 2; + axes_backward[axis3] = 3; + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_permute(ctx, + tensor->grad, + axes_backward[0], + axes_backward[1], + axes_backward[2], + axes_backward[3]), + inplace); + } + } break; + case GGML_OP_TRANSPOSE: + { + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_transpose(ctx, tensor->grad), + inplace); + } + } break; + case GGML_OP_GET_ROWS: + { + // necessary for llama (only for tokenizer) + if 
(src0->grad) { + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), + inplace); + } + if (src1->grad) { + // noop + } + } break; + case GGML_OP_GET_ROWS_BACK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_DIAG: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_DIAG_MASK_INF: + { + // necessary for llama + if (src0->grad) { + const int n_past = ((int32_t *) tensor->op_params)[0]; + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + inplace); + } + } break; + case GGML_OP_DIAG_MASK_ZERO: + { + // necessary for llama + if (src0->grad) { + const int n_past = ((int32_t *) tensor->op_params)[0]; + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + inplace); + } + } break; + case GGML_OP_SOFT_MAX: + { + // necessary for llama + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, src0->grad, + ggml_soft_max_back(ctx, tensor->grad, tensor), + inplace); + } + + } break; + case GGML_OP_SOFT_MAX_BACK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_ROPE: + { + // necessary for llama + if (src0->grad) { + const int n_past = ((int32_t *) tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + float freq_base; + float freq_scale; + float xpos_base; + bool xpos_down; + memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_rope_back(ctx, + tensor->grad, + n_past, + n_dims, + mode, + n_ctx, + freq_base, + freq_scale, + xpos_base, + xpos_down), + inplace); + } + } break; + case GGML_OP_ROPE_BACK: + { + if (src0->grad) { + const int n_past = ((int32_t *) tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + float freq_base; + float freq_scale; + float xpos_base; + bool xpos_down; + memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_rope_impl(ctx, + tensor->grad, + n_past, + n_dims, + mode, + n_ctx, + freq_base, + freq_scale, + xpos_base, + xpos_down, + false), + inplace); + } + } break; + case GGML_OP_ALIBI: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CLAMP: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D_STAGE_0: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D_STAGE_2: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case 
GGML_OP_CONV_TRANSPOSE_2D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_POOL_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_POOL_2D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_UPSCALE: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_FLASH_ATTN: + { + struct ggml_tensor * flash_grad = NULL; + if (src0->grad || src1->grad || tensor->src[2]->grad) { + int32_t t = ggml_get_op_params_i32(tensor, 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + flash_grad = + ggml_flash_attn_back(ctx, + src0, + src1, + tensor->src[2], + tensor->grad, + masked); + } + + if (src0->grad) { + struct ggml_tensor * grad_q = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = 0; + switch(src0->n_dims) { + case 2: + { + grad_q = ggml_view_2d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + nb0*src0->ne[0], + offset); + } break; + case 3: + { + grad_q = ggml_view_3d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + src0->ne[2], + nb0*src0->ne[0], + nb0*src0->ne[0]*src0->ne[1], + offset); + } break; + case 4: + { + grad_q = ggml_view_4d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + src0->ne[2], + src0->ne[3], + nb0*src0->ne[0], + nb0*src0->ne[0]*src0->ne[1], + nb0*src0->ne[0]*src0->ne[1]*src0->ne[2], + offset); + } break; + } + + src0->grad = ggml_add_impl(ctx, + src0->grad, + grad_q, + inplace); + } + + if (src1->grad) { + struct ggml_tensor * grad_k = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3]; + switch(src1->n_dims) { + case 2: + { + grad_k = ggml_view_2d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + nb0*src1->ne[0], + offset); + } break; + case 3: + { + grad_k = ggml_view_3d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + src1->ne[2], + nb0*src1->ne[0], + nb0*src1->ne[0]*src1->ne[1], + offset); + } break; + case 4: + { + grad_k = ggml_view_4d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + src1->ne[2], + src1->ne[3], + nb0*src1->ne[0], + nb0*src1->ne[0]*src1->ne[1], + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2], + offset); + } break; + } + + src1->grad = ggml_add_impl(ctx, + src1->grad, + grad_k, + inplace); + } + + struct ggml_tensor * opt0 = tensor->src[2]; + + if (opt0->grad) { + struct ggml_tensor * grad_v = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3] + + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2]*src1->ne[3]; + switch(opt0->n_dims) { + case 2: + { + grad_v = ggml_view_2d(ctx, + flash_grad, + opt0->ne[0], + opt0->ne[1], + nb0*opt0->ne[0], + offset); + } break; + case 3: + { + grad_v = ggml_view_3d(ctx, + flash_grad, + opt0->ne[0], + opt0->ne[1], + opt0->ne[2], + nb0*opt0->ne[0], + nb0*opt0->ne[0]*opt0->ne[1], + offset); + } break; + case 4: + { + grad_v = ggml_view_4d(ctx, + flash_grad, + opt0->ne[0], + opt0->ne[1], + opt0->ne[2], + opt0->ne[3], + nb0*opt0->ne[0], + nb0*opt0->ne[0]*opt0->ne[1], + nb0*opt0->ne[0]*opt0->ne[1]*opt0->ne[2], + offset); + } break; + } + + opt0->grad = ggml_add_impl(ctx, + opt0->grad, + grad_v, + inplace); + } + } break; + case GGML_OP_FLASH_FF: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_UNARY: + { + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_ABS: + { + if (src0->grad) { + src0->grad = + 
ggml_add_impl(ctx, + src0->grad, + ggml_mul(ctx, + ggml_sgn(ctx, src0), + tensor->grad), + inplace); + } + } break; + case GGML_UNARY_OP_SGN: + { + if (src0->grad) { + // noop + } + } break; + case GGML_UNARY_OP_NEG: + { + if (src0->grad) { + src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace); + } + } break; + case GGML_UNARY_OP_STEP: + { + if (src0->grad) { + // noop + } + } break; + case GGML_UNARY_OP_TANH: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_ELU: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_RELU: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_mul(ctx, + ggml_step(ctx, src0), + tensor->grad), + inplace); + } + } break; + case GGML_UNARY_OP_GELU: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_GELU_QUICK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_SILU: + { + // necessary for llama + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_silu_back(ctx, src0, tensor->grad), + inplace); + } + } break; + case GGML_UNARY_OP_GLU: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + default: + GGML_ASSERT(false); + } + } break; + case GGML_OP_GET_REL_POS: + case GGML_OP_ADD_REL_POS: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_cross_entropy_loss_back(ctx, + src0, + src1, + tensor->grad), + inplace); + } + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_NONE: + { + // nop + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small"); + +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static bool hash_insert(void * hash_table[], void * p) { + size_t h = hash(p); + + // linear probing + size_t i = h; + while (hash_table[i] != NULL && hash_table[i] != p) { + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // hash table is full + GGML_ASSERT(false); + } + } + + if (hash_table[i] == p) { + return true; + } + + // insert + hash_table[i] = p; + return false; +} + +static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { + if (node->grad == NULL) { + // this usually happens when we generate intermediate nodes from constants in the backward pass + // it can also happen during forward pass, if the user performs computations with constants + if (node->op != GGML_OP_NONE) { + //GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op); + } + } + + // check if already visited + if (hash_insert(cgraph->visited_hash_table, node)) { + return; + } + + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (node->src[i]) { + ggml_visit_parents(cgraph, node->src[i]); + } + } + + if (node->op == GGML_OP_NONE && node->grad == NULL) { + // reached a leaf node, not part of the gradient graph (e.g. 
a constant) + GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); + + if (strlen(node->name) == 0) { + ggml_format_name(node, "leaf_%d", cgraph->n_leafs); + } + + cgraph->leafs[cgraph->n_leafs] = node; + cgraph->n_leafs++; + } else { + GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES); + + if (strlen(node->name) == 0) { + ggml_format_name(node, "node_%d", cgraph->n_nodes); + } + + cgraph->nodes[cgraph->n_nodes] = node; + cgraph->grads[cgraph->n_nodes] = node->grad; + cgraph->n_nodes++; + } +} + +static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) { + if (!expand) { + cgraph->n_nodes = 0; + cgraph->n_leafs = 0; + } + + const int n0 = cgraph->n_nodes; + UNUSED(n0); + + ggml_visit_parents(cgraph, tensor); + + const int n_new = cgraph->n_nodes - n0; + GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new); + + if (n_new > 0) { + // the last added node should always be starting point + GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor); + } +} + +void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { + ggml_build_forward_impl(cgraph, tensor, true); +} + +struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { + struct ggml_cgraph result = { + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.nodes =*/ { NULL }, + /*.grads =*/ { NULL }, + /*.leafs =*/ { NULL }, + /*.hash_table =*/ { NULL }, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + }; + + ggml_build_forward_impl(&result, tensor, false); + + return result; +} + +void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) { + GGML_ASSERT(gf->n_nodes > 0); + + // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph + if (keep) { + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + if (node->grad) { + node->grad = ggml_dup_tensor(ctx, node); + gf->grads[i] = node->grad; + } + } + } + + for (int i = gf->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = gf->nodes[i]; + + // because we detached the grad nodes from the original graph, we can afford inplace operations + if (node->grad) { + ggml_compute_backward(ctx, node, keep); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + if (node->is_param) { + GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); + ggml_build_forward_expand(gb, node->grad); + } + } +} + +struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { + struct ggml_cgraph result = *gf; + ggml_build_backward_expand(ctx, gf, &result, keep); + return result; +} + +struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) { + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE); + struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); + + *cgraph = (struct ggml_cgraph) { + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.nodes =*/ { NULL }, + /*.grads =*/ { NULL }, + /*.leafs =*/ { NULL }, + /*.hash_table =*/ { NULL }, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + }; + + return cgraph; +} + +struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) { + struct ggml_cgraph * cgraph = ggml_new_graph(ctx); + ggml_build_forward_impl(cgraph, tensor, false); + return cgraph; +} + +size_t ggml_graph_overhead(void) { + 
return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN); +} + +// +// thread data +// +// synchronization is done via busy loops +// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops +// + +#ifdef __APPLE__ + +//#include <os/lock.h> +// +//typedef os_unfair_lock ggml_lock_t; +// +//#define ggml_lock_init(x) UNUSED(x) +//#define ggml_lock_destroy(x) UNUSED(x) +//#define ggml_lock_lock os_unfair_lock_lock +//#define ggml_lock_unlock os_unfair_lock_unlock +// +//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT + +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#define ggml_lock_lock(x) UNUSED(x) +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +typedef pthread_t ggml_thread_t; + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#else + +//typedef pthread_spinlock_t ggml_lock_t; + +//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) +//#define ggml_lock_destroy pthread_spin_destroy +//#define ggml_lock_lock pthread_spin_lock +//#define ggml_lock_unlock pthread_spin_unlock + +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#define ggml_lock_lock(x) _mm_pause() +#else +#define ggml_lock_lock(x) UNUSED(x) +#endif +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +typedef pthread_t ggml_thread_t; + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#endif + +// Android's libc implementation "bionic" does not support setting affinity +#if defined(__linux__) && !defined(__BIONIC__) +static void set_numa_thread_affinity(int thread_n, int n_threads) { + if (!ggml_is_numa()) { + return; + } + + // run thread on node_num thread_n / (threads per node) + const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes); + struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; + size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (size_t i = 0; i < node->n_cpus; ++i) { + CPU_SET_S(node->cpus[i], setsize, cpus); + } + + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", + strerror(rv)); + } + + CPU_FREE(cpus); +} + +static void clear_numa_thread_affinity(void) { + if (!ggml_is_numa()) { + return; + } + + size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) { + CPU_SET_S(i, setsize, cpus); + } + + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", + strerror(rv)); + } + + CPU_FREE(cpus); +} +#else +// TODO: Windows etc.
+// (the linux implementation may also work on BSD, someone should test) +static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +static void clear_numa_thread_affinity(void) {} +#endif + +struct ggml_compute_state_shared { + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + void * abort_callback_data; +}; + +struct ggml_compute_state { + ggml_thread_t thrd; + int ith; + struct ggml_compute_state_shared * shared; +}; + +static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { + int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us; + + node->perf_runs++; + node->perf_cycles += cycles_cur; + node->perf_time_us += time_us_cur; +} + +static thread_ret_t ggml_graph_compute_thread(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int * n_tasks_arr = cplan->n_tasks; + const int n_threads = state->shared->n_threads; + + set_numa_thread_affinity(state->ith, n_threads); + + int node_n = -1; + + while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->shared->node_n += 1; + return (thread_ret_t) GGML_EXIT_ABORTED; + } + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + // all other threads are finished and spinning + // do finalize and init here so we don't have synchronize again + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_FINALIZE, + /*.ith =*/ 0, + /*.nth =*/ 0, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (node_n != -1) { + /* FINALIZE */ + struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = n_tasks_arr[node_n]; + ggml_compute_forward(&params, node); + } + ggml_graph_compute_perf_stats_node(node, state->shared); + } + + // distribute new work or execute it direct if 1T + while (++node_n < cgraph->n_nodes) { + GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); + + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; + + state->shared->perf_node_start_cycles = ggml_perf_cycles(); + state->shared->perf_node_start_time_us = ggml_perf_time_us(); + + params.nth = n_tasks; + + /* INIT */ + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_INIT; + ggml_compute_forward(&params, node); + } + + if (n_tasks == 1) { + // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, + // they do something more efficient than spinning (?)
+ params.type = GGML_TASK_COMPUTE; + ggml_compute_forward(&params, node); + + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_FINALIZE; + ggml_compute_forward(&params, node); + } + + ggml_graph_compute_perf_stats_node(node, state->shared); + } else { + break; + } + + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + break; + } + } + + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_n, node_n); + } else { + // wait for other threads to finish + const int last = node_n; + do { + //sched_yield(); + node_n = atomic_load(&state->shared->node_n); + } while (node_n == last); + } + + // check if we should stop + if (node_n >= cgraph->n_nodes) break; + + /* COMPUTE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; + + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_COMPUTE, + /*.ith =*/ state->ith, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (state->ith < n_tasks) { + ggml_compute_forward(&params, node); + } + } + + return GGML_EXIT_SUCCESS; +} + +struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_DEFAULT_N_THREADS; + } + + size_t work_size = 0; + + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); + + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + int n_tasks = 1; + + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + n_tasks = n_threads; + + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + { + n_tasks = 1; + } break; + + case GGML_OP_UNARY: + { + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SGN: + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_GLU: + { + n_tasks = 1; + } break; + + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_SILU: + { + n_tasks = n_threads; + } break; + } + } break; + case GGML_OP_SILU_BACK: + case GGML_OP_MUL: + case GGML_OP_NORM: + case GGML_OP_BATCH_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + case GGML_OP_GROUP_NORM: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONCAT: + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = ggml_nrows(node->src[0]); + //const int nr1 = ggml_nrows(node->src[1]); + + 
//n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + + size_t cur = 0; + const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; + +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } else +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); + } else +#endif +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src[0]->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); + } + } else +#endif + if (node->src[1]->type != vec_dot_type) { + cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type); + } else { + cur = 0; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + n_tasks = 1; + } break; + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + case GGML_OP_ADD_REL_POS: + { + n_tasks = n_threads; + } break; + case GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CONV_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); + + size_t cur = 0; + const int nk = node->src[0]->ne[0]; + + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_1D_STAGE_0: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_1D_STAGE_2: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_2D: + { + n_tasks = n_threads; + + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // C + const int64_t ne03 = node->src[0]->ne[3]; // N + + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // C + + const int64_t ne0 = node->ne[0]; + const int64_t ne1 = node->ne[1]; + const int64_t ne2 = node->ne[2]; + const int64_t nk = ne00*ne01; + 
const int64_t ew0 = nk * ne02; + + UNUSED(ne03); + UNUSED(ne2); + + size_t cur = 0; + + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + n_tasks = n_threads; + + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // Channels Out + const int64_t ne03 = node->src[0]->ne[3]; // Channels In + + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // Channels In + + size_t cur = 0; + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; + cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_POOL_1D: + case GGML_OP_POOL_2D: + { + n_tasks = 1; + } break; + case GGML_OP_UPSCALE: + { + n_tasks = n_threads; + } break; + case GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; + + size_t cur = 0; + + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } + + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } + + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; + + size_t cur = 0; + + const int64_t D = node->src[0]->ne[0]; + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_GET_REL_POS: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: + { + n_tasks = 1; + } break; + case GGML_OP_MAP_CUSTOM1: + { + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, 
n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM2: + { + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM3: + { + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + } + + cplan.n_tasks[i] = n_tasks; + } + + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } + + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; + + return cplan; +} + +int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { + { + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); + + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + GGML_ASSERT(cplan->n_tasks[i] > 0); + } + } + } + + const int n_threads = cplan->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + /*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, + }; + struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + + // create thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + }; + + const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + GGML_ASSERT(rc == 0); + UNUSED(rc); + } + } + + workers[0].ith = 0; + workers[0].shared = &state_shared; + + const int64_t perf_start_cycles = ggml_perf_cycles(); + const int64_t perf_start_time_us = ggml_perf_time_us(); + + // this is a work thread too + int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); + + // don't leave affinity set on the main thread + clear_numa_thread_affinity(); + + // join or kill thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; j++) { + const int rc = ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == 0); + } + } + + // performance stats (graph) + { + int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; + int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us; + + cgraph->perf_runs++; + cgraph->perf_cycles += perf_cycles_cur; + cgraph->perf_time_us += perf_time_us_cur; + + GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", + __func__, cgraph->perf_runs, + (double) perf_cycles_cur / (double) ggml_cycles_per_ms(), + (double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs, + (double) perf_time_us_cur / 1000.0, + (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs); + } + + return 
compute_status; +} + +void ggml_graph_reset(struct ggml_cgraph * cgraph) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * grad = cgraph->grads[i]; + + if (grad) { + ggml_set_zero(grad); + } + } +} + +void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + + ggml_graph_compute(cgraph, &cplan); +} + +struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { + for (int i = 0; i < cgraph->n_leafs; i++) { + struct ggml_tensor * leaf = cgraph->leafs[i]; + + if (strcmp(leaf->name, name) == 0) { + return leaf; + } + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + if (strcmp(node->name, name) == 0) { + return node; + } + } + + return NULL; +} + +static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data, + tensor->name); +} + +static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", + arg, + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data, + tensor->name); +} + +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { + uint64_t size_eval = 0; + + // compute size of intermediate results + // TODO: does not take into account scratch buffers !!!! 
+ for (int i = 0; i < cgraph->n_nodes; ++i) { + size_eval += ggml_nbytes_pad(cgraph->nodes[i]); + } + + // print + { + FILE * fout = stdout; + + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval); + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + ggml_graph_export_leaf(cgraph->leafs[i], fout); + + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL); + GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL); + } + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (cgraph->nodes[i]->src[j]) { + ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout); + } + } + + fprintf(fout, "\n"); + } + + fprintf(fout, "\n"); + } + + // write binary data + { + FILE * fout = fopen(fname, "wb"); + + if (!fout) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return; + } + + // header + { + const uint32_t magic = GGML_FILE_MAGIC; + const uint32_t version = GGML_FILE_VERSION; + const uint32_t n_leafs = cgraph->n_leafs; + const uint32_t nodes = cgraph->n_nodes; + + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&n_leafs, sizeof(uint32_t), 1, fout); + fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&size_eval, sizeof(uint64_t), 1, fout); + } + + // leafs + { + for (int i = 0; i < cgraph->n_leafs; ++i) { + const struct ggml_tensor * tensor = cgraph->leafs[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout); + + // dump the data + // TODO: pad this to 32 byte boundary + { + const size_t size = ggml_nbytes(tensor); + + fwrite(tensor->data, sizeof(char), size, fout); + } + } + } + + // nodes + { + for (int i = 0; i < cgraph->n_nodes; ++i) { + const struct ggml_tensor * tensor = cgraph->nodes[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + 
fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout); + + // output the op arguments + { + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; + + for (int j = 0; j < GGML_MAX_SRC; ++j) { + args[j] = tensor->src[j]; + } + + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (args[j]) { + int32_t idx = -1; + + // check if leaf + { + for (int k = 0; k < cgraph->n_leafs; ++k) { + if (args[j] == cgraph->leafs[k]) { + idx = k; + break; + } + } + } + + // check if node + if (idx == -1) { + for (int k = 0; k < cgraph->n_nodes; ++k) { + if (args[j] == cgraph->nodes[k]) { + idx = GGML_MAX_NODES + k; + break; + } + } + } + + if (idx == -1) { + fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + return; + } + + fwrite(&idx, sizeof(int32_t), 1, fout); + } else { + const int32_t nul = -1; + + fwrite(&nul, sizeof(int32_t), 1, fout); + } + } + } + } + } + + fclose(fout); + } +} + +struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { + assert(*ctx_data == NULL); + assert(*ctx_eval == NULL); + + struct ggml_cgraph result = { 0 }; + + struct ggml_tensor * data = NULL; + + // read file into data + { + FILE * fin = fopen(fname, "rb"); + if (!fin) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return result; + } + + size_t fsize = 0; + + fseek(fin, 0, SEEK_END); + fsize = ftell(fin); + fseek(fin, 0, SEEK_SET); + + // create the data context + { + const size_t overhead = 1*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = fsize + overhead, + .mem_buffer = NULL, + .no_alloc = false, + }; + + *ctx_data = ggml_init(params); + + if (!*ctx_data) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + fclose(fin); + return result; + } + } + + data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); + + { + const size_t ret = fread(data->data, sizeof(char), fsize, fin); + if (ret != fsize) { + fprintf(stderr, "%s: failed to read %s\n", __func__, fname); + fclose(fin); + return result; + } + } + + fclose(fin); + } + + // populate result + { + char * ptr = (char *) data->data; + + const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); + + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); + return result; + } + + const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); + + if (version != GGML_FILE_VERSION) { + fprintf(stderr, "%s: invalid version number\n", __func__); + return result; + } + + const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs); + const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes); + const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); + + result.n_leafs = n_leafs; + result.n_nodes = n_nodes; + + // create the data context + { + const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = size_eval + overhead, + .mem_buffer = NULL, + .no_alloc = true, + }; + + *ctx_eval = ggml_init(params); + + if (!*ctx_eval) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + // leafs + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_leafs; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr 
+= sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS; + + tensor->data = (void *) ptr; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + result.leafs[i] = tensor; + + ptr += ggml_nbytes(tensor); + + fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + + ggml_set_no_alloc(*ctx_eval, false); + + // nodes + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_nodes; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + enum ggml_op eop = (enum ggml_op) op; + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + const char * ptr_name = ptr; ptr += GGML_MAX_NAME; + const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS; + + const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t); + + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; + + // parse args + for (int j = 0; j < GGML_MAX_SRC; ++j) { + const int32_t arg_idx = ptr_arg_idx[j]; + + if (arg_idx == -1) { + continue; + } + + if (arg_idx < GGML_MAX_NODES) { + args[j] = result.leafs[arg_idx]; + } else { + args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; + } + } + + // create the tensor + // "view" operations are handled differently + // TODO: handle inplace ops - currently a copy is always made + + struct ggml_tensor * tensor = NULL; + + switch (eop) { + // TODO: implement other view ops + case GGML_OP_RESHAPE: + { + tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]); + } break; + case GGML_OP_VIEW: + { + tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + + size_t offs; + memcpy(&offs, ptr_op_params, sizeof(offs)); + + tensor->data = ((char *) tensor->data) + offs; + } break; + case GGML_OP_TRANSPOSE: + { + tensor = ggml_transpose(*ctx_eval, args[0]); + } break; + case GGML_OP_PERMUTE: + { + tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + } break; + default: + { + tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = eop; + } break; + } + + memcpy(tensor->name, ptr_name, GGML_MAX_NAME); + memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + for (int j = 0; j < GGML_MAX_SRC; ++j) { + tensor->src[j] = args[j]; + } + + result.nodes[i] = tensor; + + fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + } + + return result; +} + +void ggml_graph_print(const struct 
ggml_cgraph * cgraph) { + int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0}; + + GGML_PRINT("=== GRAPH ===\n"); + + GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us); + + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", + i, + node->ne[0], node->ne[1], node->ne[2], + ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + (double) node->perf_cycles / (double) ggml_cycles_per_ms(), + (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, + (double) node->perf_time_us / 1000.0, + (double) node->perf_time_us / 1000.0 / node->perf_runs); + } + + GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); + for (int i = 0; i < cgraph->n_leafs; i++) { + struct ggml_tensor * node = cgraph->leafs[i]; + + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", + i, + node->ne[0], node->ne[1], + ggml_op_name(node->op)); + } + + for (int i = 0; i < GGML_OP_COUNT; i++) { + if (perf_total_per_op_us[i] == 0) { + continue; + } + + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0); + } + + GGML_PRINT("========================================\n"); +} + +// check if node is part of the graph +static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { + if (cgraph == NULL) { + return true; + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + if (cgraph->nodes[i] == node) { + return true; + } + } + + return false; +} + +static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * parent = cgraph->nodes[i]; + + if (parent->grad == node) { + return parent; + } + } + + return NULL; +} + +static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { + struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node); + struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent); + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n", + gparent0 ? (void *) gparent0 : (void *) parent, + gparent0 ? "g" : "x", + gparent ? (void *) gparent : (void *) node, + gparent ? "g" : "x", + gparent ? "empty" : "vee", + gparent ? 
"dashed" : "solid", + label); +} + +static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n", + (void *) parent, "x", + (void *) node, "x", + label); +} + +void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { + char color[16]; + + FILE * fp = fopen(filename, "w"); + GGML_ASSERT(fp); + + fprintf(fp, "digraph G {\n"); + fprintf(fp, " newrank = true;\n"); + fprintf(fp, " rankdir = LR;\n"); + + for (int i = 0; i < gb->n_nodes; i++) { + struct ggml_tensor * node = gb->nodes[i]; + + if (ggml_graph_get_parent(gb, node) != NULL) { + continue; + } + + if (node->is_param) { + snprintf(color, sizeof(color), "yellow"); + } else if (node->grad) { + if (ggml_graph_find(gf, node)) { + snprintf(color, sizeof(color), "green"); + } else { + snprintf(color, sizeof(color), "lightblue"); + } + } else { + snprintf(color, sizeof(color), "white"); + } + + fprintf(fp, " \"%p\" [ " + "style = filled; fillcolor = %s; shape = record; " + "label=\"", + (void *) node, color); + + if (strlen(node->name) > 0) { + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_type_name(node->type)); + } + + if (node->n_dims == 2) { + fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op)); + } else { + fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op)); + } + + if (node->grad) { + fprintf(fp, " | %s\"; ]\n", ggml_op_symbol(node->grad->op)); + } else { + fprintf(fp, "\"; ]\n"); + } + } + + for (int i = 0; i < gb->n_leafs; i++) { + struct ggml_tensor * node = gb->leafs[i]; + + snprintf(color, sizeof(color), "pink"); + + fprintf(fp, " \"%p\" [ " + "style = filled; fillcolor = %s; shape = record; " + "label=\"", + (void *) node, color); + + if (strlen(node->name) > 0) { + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_type_name(node->type)); + } + + fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); + if (ggml_nelements(node) < 5) { + fprintf(fp, " | ("); + for (int j = 0; j < ggml_nelements(node); j++) { + if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { + fprintf(fp, "%d", ggml_get_i32_1d(node, j)); + } + else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) { + fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j)); + } + else { + fprintf(fp, "#"); + } + if (j < ggml_nelements(node) - 1) { + fprintf(fp, ", "); + } + } + fprintf(fp, ")"); + } + fprintf(fp, "\"; ]\n"); + } + + for (int i = 0; i < gb->n_nodes; i++) { + struct ggml_tensor * node = gb->nodes[i]; + + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { + char label[16]; + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label); + } + } + } + + for (int i = 0; i < gb->n_leafs; i++) { + struct ggml_tensor * node = gb->leafs[i]; + + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { + char label[16]; + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label); + } + } + } + + fprintf(fp, "}\n"); + + fclose(fp); + + GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) { + int i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]) ; + // TODO: add function to set tensor from array + for (int64_t j = 0; j < ne; ++j) { + ggml_set_f32_1d(ps[p], j, x[i++]); + } + } +} + +static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) { + int i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int64_t j = 0; j < ne; ++j) { + x[i++] = ggml_get_f32_1d(ps[p], j); + } + } +} + +static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) { + int i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int64_t j = 0; j < ne; ++j) { + g[i++] = ggml_get_f32_1d(ps[p]->grad, j); + } + } +} + +// +// ADAM +// +// ref: https://arxiv.org/pdf/1412.6980.pdf +// + +static enum ggml_opt_result ggml_opt_adam( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { + GGML_ASSERT(ggml_is_scalar(f)); + + // these will store the parameters we want to optimize + struct ggml_tensor * ps[GGML_MAX_PARAMS]; + + int np = 0; + int64_t nx = 0; + for (int i = 0; i < gf->n_nodes; ++i) { + if (gf->nodes[i]->is_param) { + GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + + GGML_ASSERT(np < GGML_MAX_PARAMS); + + ps[np++] = gf->nodes[i]; + nx += ggml_nelements(gf->nodes[i]); + } + } + + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) { + int iter = opt->iter; + ggml_opt_init(opt->ctx, opt, params, nx); + opt->iter = iter; + } + + // constants + float sched = params.adam.sched; + const float alpha = params.adam.alpha; + const float decay = params.adam.decay * alpha; + const float beta1 = params.adam.beta1; + const float beta2 = params.adam.beta2; + const float eps = params.adam.eps; + const float gclip = params.adam.gclip; + const int decay_min_ndim = params.adam.decay_min_ndim; + + float * m = opt->adam.m->data; // first moment + float * v = opt->adam.v->data; // second moment + + float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values + + if (callback) { + callback(callback_data, &sched); + } + + // compute the function value + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + ggml_graph_compute(gb, &cplan); + + opt->adam.fx_prev = ggml_get_f32_1d(f, 0); + opt->adam.fx_best = opt->adam.fx_prev; + if (pf) { + pf[opt->iter % params.past] = opt->adam.fx_prev; + } + + opt->loss_before = opt->adam.fx_prev; + opt->loss_after = opt->adam.fx_prev; + + // initialize + if (opt->just_initialized) { + opt->adam.n_no_improvement = 0; + opt->just_initialized = false; + } + + float * fx_best = &opt->adam.fx_best; + float * fx_prev = &opt->adam.fx_prev; + int * n_no_improvement = &opt->adam.n_no_improvement; + + int iter0 = opt->iter; + + // run the optimizer + for (int t = 0; t < params.adam.n_iter; ++t) { + opt->iter = iter0 + t + 1; + GGML_PRINT_DEBUG ("=== iter %d ===\n", t); + + GGML_PRINT_DEBUG ("f = %10.6f\n", ggml_get_f32_1d(f, 0)); + GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0)); + GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0)); + + for (int i = 0; i < np; ++i) { + GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i, + ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0)); + } + + const int64_t t_start_wall = ggml_time_us(); + const int64_t t_start_cpu = ggml_cycles(); + UNUSED(t_start_wall); + UNUSED(t_start_cpu); + + { + float gnorm = 1.0f; + if (gclip > 0.0f) { + // gradient clipping + ggml_float sum = 0.0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]); + for (int64_t j = 0; j < ne; ++j) { + float g = ggml_get_f32_1d(ps[p]->grad, j); + sum += (ggml_float)(g*g); + } + } + ggml_float norm = sqrt(sum); + if (norm > (ggml_float) gclip) { + gnorm = (float) ((ggml_float) gclip / norm); + } + } + const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]); + const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? 
decay : 0.0f) * sched; + for (int64_t j = 0; j < ne; ++j) { + float x = ggml_get_f32_1d(ps[p], j); + float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; + m[i] = m[i]*beta1 + g*(1.0f - beta1); + v[i] = v[i]*beta2 + g*g*(1.0f - beta2); + float mh = m[i]*beta1h; + float vh = v[i]*beta2h; + vh = sqrtf(vh) + eps; + x = x*(1.0f - p_decay) - mh/vh; + ggml_set_f32_1d(ps[p], j, x); + ++i; + } + } + } + + if (callback) { + callback(callback_data, &sched); + } + + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + + ggml_graph_compute(gb, &cplan); + + const float fx = ggml_get_f32_1d(f, 0); + opt->loss_after = fx; + + + // check convergence + if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { + GGML_PRINT_DEBUG("converged\n"); + + return GGML_OPT_OK; + } + + // delta-based convergence test + if (pf != NULL) { + // need at least params.past iterations to start checking for convergence + if (params.past <= iter0 + t) { + const float rate = (pf[(iter0 + t)%params.past] - fx)/fx; + + if (fabsf(rate) < params.delta) { + return GGML_OPT_OK; + } + } + + pf[(iter0 + t)%params.past] = fx; + } + + // check for improvement + if (params.max_no_improvement > 0) { + if (fx_best[0] > fx) { + fx_best[0] = fx; + n_no_improvement[0] = 0; + } else { + ++n_no_improvement[0]; + + if (n_no_improvement[0] >= params.max_no_improvement) { + return GGML_OPT_OK; + } + } + } + + fx_prev[0] = fx; + + { + const int64_t t_end_cpu = ggml_cycles(); + GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC); + UNUSED(t_end_cpu); + + const int64_t t_end_wall = ggml_time_us(); + GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6); + UNUSED(t_end_wall); + } + } + + return GGML_OPT_DID_NOT_CONVERGE; +} + +// +// L-BFGS +// +// the L-BFGS implementation below is based on the following implementation: +// +// https://github.com/chokkan/liblbfgs +// + +struct ggml_lbfgs_iteration_data { + float alpha; + float ys; + float * s; + float * y; +}; + +static enum ggml_opt_result linesearch_backtracking( + const struct ggml_opt_params * params, + int nx, + float * x, + float * fx, + float * g, + float * d, + float * step, + const float * xp, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cplan * cplan, + const int np, + struct ggml_tensor * ps[], + ggml_opt_callback callback, + void * callback_data) { + int count = 0; + + float width = 0.0f; + float dg = 0.0f; + float finit = 0.0f; + float dginit = 0.0f; + float dgtest = 0.0f; + + const float dec = 0.5f; + const float inc = 2.1f; + + if (*step <= 0.f) { + return GGML_LINESEARCH_INVALID_PARAMETERS; + } + + // compute the initial gradient in the search direction + ggml_vec_dot_f32(nx, &dginit, g, d); + + // make sure that d points to a descent direction + if (0 < dginit) { + return GGML_LINESEARCH_FAIL; + } + + // initialize local variables + finit = *fx; + dgtest = params->lbfgs.ftol*dginit; + + while (true) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + + ggml_vec_cpy_f32(nx, x, xp); + ggml_vec_mad_f32(nx, x, d, *step); + + // evaluate the function and gradient values + { + ggml_opt_set_params(np, ps, x); + + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + + ggml_graph_compute(gb, cplan); + + ggml_opt_get_grad(np, ps, g); + + *fx = ggml_get_f32_1d(f, 0); + } + + ++count; + + if (*fx > finit + (*step)*dgtest) { + width = dec; + } else { + // Armijo condition is satisfied + 
if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) { + return count; + } + + ggml_vec_dot_f32(nx, &dg, g, d); + + // check the Wolfe condition + if (dg < params->lbfgs.wolfe * dginit) { + width = inc; + } else { + if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) { + // regular Wolfe conditions + return count; + } + + if(dg > -params->lbfgs.wolfe*dginit) { + width = dec; + } else { + // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) + return count; + } + } + } + + if (*step < params->lbfgs.min_step) { + return GGML_LINESEARCH_MINIMUM_STEP; + } + if (*step > params->lbfgs.max_step) { + return GGML_LINESEARCH_MAXIMUM_STEP; + } + if (params->lbfgs.max_linesearch <= count) { + return GGML_LINESEARCH_MAXIMUM_ITERATIONS; + } + + (*step) *= width; + } + + return GGML_LINESEARCH_FAIL; +} + +static enum ggml_opt_result ggml_opt_lbfgs( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { + if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || + params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { + if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { + return GGML_OPT_INVALID_WOLFE; + } + } + + const int m = params.lbfgs.m; + + // these will store the parameters we want to optimize + struct ggml_tensor * ps[GGML_MAX_PARAMS]; + + int np = 0; + int nx = 0; + for (int i = 0; i < gf->n_nodes; ++i) { + if (gf->nodes[i]->is_param) { + GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + + GGML_ASSERT(np < GGML_MAX_PARAMS); + + ps[np++] = gf->nodes[i]; + nx += ggml_nelements(gf->nodes[i]); + } + } + + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m != params.lbfgs.m)) { + int iter = opt->iter; + ggml_opt_init(ctx, opt, params, nx); + opt->iter = iter; + } + + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + + float * x = opt->lbfgs.x->data; // current parameters + float * xp = opt->lbfgs.xp->data; // previous parameters + float * g = opt->lbfgs.g->data; // current gradient + float * gp = opt->lbfgs.gp->data; // previous gradient + float * d = opt->lbfgs.d->data; // search direction + + float * pf = params.past > 0 ? 
opt->lbfgs.pf->data : NULL; // past function values + + float fx = 0.0f; // cost function value + float xnorm = 0.0f; // ||x|| + float gnorm = 0.0f; // ||g|| + + // initialize x from the graph nodes + ggml_opt_get_params(np, ps, x); + + // the L-BFGS memory + float * lm_alpha = opt->lbfgs.lmal->data; + float * lm_ys = opt->lbfgs.lmys->data; + float * lm_s = opt->lbfgs.lms->data; + float * lm_y = opt->lbfgs.lmy->data; + + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + + // evaluate the function value and its gradient + { + ggml_opt_set_params(np, ps, x); + + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + + ggml_graph_compute(gb, &cplan); + + ggml_opt_get_grad(np, ps, g); + + fx = ggml_get_f32_1d(f, 0); + + opt->loss_before = fx; + opt->loss_after = fx; + } + + // search direction = -gradient + ggml_vec_neg_f32(nx, d, g); + + // ||x||, ||g|| + ggml_vec_norm_f32(nx, &xnorm, x); + ggml_vec_norm_f32(nx, &gnorm, g); + + if (xnorm < 1.0f) { + xnorm = 1.0f; + } + + // already optimized + if (gnorm/xnorm <= params.lbfgs.eps) { + return GGML_OPT_OK; + } + + if (opt->just_initialized) { + if (pf) { + pf[0] = fx; + } + opt->lbfgs.fx_best = fx; + + // initial step + ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d); + opt->lbfgs.j = 0; + opt->lbfgs.k = 1; + opt->lbfgs.end = 0; + opt->lbfgs.n_no_improvement = 0; + opt->just_initialized = false; + } + + float * fx_best = &opt->lbfgs.fx_best; + float * step = &opt->lbfgs.step; + int * j = &opt->lbfgs.j; + int * k = &opt->lbfgs.k; + int * end = &opt->lbfgs.end; + int * n_no_improvement = &opt->lbfgs.n_no_improvement; + + int ls = 0; + int bound = 0; + + float ys = 0.0f; + float yy = 0.0f; + float beta = 0.0f; + + int it = 0; + + while (true) { + // store the current position and gradient vectors + ggml_vec_cpy_f32(nx, xp, x); + ggml_vec_cpy_f32(nx, gp, g); + + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data); + + if (ls < 0) { + // linesearch failed - go back to the previous point and return + ggml_vec_cpy_f32(nx, x, xp); + ggml_vec_cpy_f32(nx, g, gp); + + return ls; + } + + opt->loss_after = fx; + + ggml_vec_norm_f32(nx, &xnorm, x); + ggml_vec_norm_f32(nx, &gnorm, g); + + GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0)); + + if (xnorm < 1.0f) { + xnorm = 1.0f; + } + if (gnorm/xnorm <= params.lbfgs.eps) { + // converged + return GGML_OPT_OK; + } + + // delta-based convergence test + if (pf != NULL) { + // need at least params.past iterations to start checking for convergence + if (params.past <= k[0]) { + const float rate = (pf[k[0]%params.past] - fx)/fx; + + if (fabsf(rate) < params.delta) { + return GGML_OPT_OK; + } + } + + pf[k[0]%params.past] = fx; + } + + // check for improvement + if (params.max_no_improvement > 0) { + if (fx < fx_best[0]) { + fx_best[0] = fx; + n_no_improvement[0] = 0; + } else { + n_no_improvement[0]++; + + if (n_no_improvement[0] >= params.max_no_improvement) { + return GGML_OPT_OK; + } + } + } + + if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) { + // reached the maximum number of iterations + return GGML_OPT_DID_NOT_CONVERGE; + } + + // update vectors s and y: + // s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. + // y_{k+1} = g_{k+1} - g_{k}. + // + ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp); + ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp); + + // compute scalars ys and yy: + // ys = y^t \cdot s -> 1 / \rho. + // yy = y^t \cdot y. 
+ // + ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); + ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); + + lm_ys[end[0]] = ys; + + // find new search direction + // ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS + + bound = (m <= k[0]) ? m : k[0]; + k[0]++; + it++; + end[0] = (end[0] + 1)%m; + + // initialize search direction with -g + ggml_vec_neg_f32(nx, d, g); + + j[0] = end[0]; + for (int i = 0; i < bound; ++i) { + j[0] = (j[0] + m - 1) % m; + // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} + ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); + lm_alpha[j[0]] /= lm_ys[j[0]]; + // q_{i} = q_{i+1} - \alpha_{i} y_{i} + ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); + } + + ggml_vec_scale_f32(nx, d, ys/yy); + + for (int i = 0; i < bound; ++i) { + // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} + ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); + beta /= lm_ys[j[0]]; + // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} + ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); + j[0] = (j[0] + 1)%m; + } + + step[0] = 1.0; + } + + return GGML_OPT_DID_NOT_CONVERGE; +} + +struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { + struct ggml_opt_params result; + + switch (type) { + case GGML_OPT_ADAM: + { + result = (struct ggml_opt_params) { + .type = GGML_OPT_ADAM, + .n_threads = 1, + .past = 0, + .delta = 1e-5f, + + .max_no_improvement = 100, + + .print_forward_graph = true, + .print_backward_graph = true, + + .adam = { + .n_iter = 10000, + .sched = 1.000f, + .decay = 0.0f, + .decay_min_ndim = 2, + .alpha = 0.001f, + .beta1 = 0.9f, + .beta2 = 0.999f, + .eps = 1e-8f, + .eps_f = 1e-5f, + .eps_g = 1e-3f, + .gclip = 0.0f, + }, + }; + } break; + case GGML_OPT_LBFGS: + { + result = (struct ggml_opt_params) { + .type = GGML_OPT_LBFGS, + .n_threads = 1, + .past = 0, + .delta = 1e-5f, + + .max_no_improvement = 0, + + .print_forward_graph = true, + .print_backward_graph = true, + + .lbfgs = { + .m = 6, + .n_iter = 100, + .max_linesearch = 20, + + .eps = 1e-5f, + .ftol = 1e-4f, + .wolfe = 0.9f, + .min_step = 1e-20f, + .max_step = 1e+20f, + + .linesearch = GGML_LINESEARCH_DEFAULT, + }, + }; + } break; + } + + return result; +} + +GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx) { + opt->ctx = ctx; + opt->params = params; + opt->iter = 0; + opt->nx = nx; + opt->just_initialized = true; + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.pf = params.past > 0 + ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + : NULL; + ggml_set_zero(opt->adam.m); + ggml_set_zero(opt->adam.v); + if (opt->adam.pf) { + ggml_set_zero(opt->adam.pf); + } + } break; + case GGML_OPT_LBFGS: + { + opt->lbfgs.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.pf = params.past > 0 + ? 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + : NULL; + opt->lbfgs.lmal = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lmys = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lms = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmy = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + ggml_set_zero(opt->lbfgs.x); + ggml_set_zero(opt->lbfgs.xp); + ggml_set_zero(opt->lbfgs.g); + ggml_set_zero(opt->lbfgs.gp); + ggml_set_zero(opt->lbfgs.d); + if (opt->lbfgs.pf) { + ggml_set_zero(opt->lbfgs.pf); + } + ggml_set_zero(opt->lbfgs.lmal); + ggml_set_zero(opt->lbfgs.lmys); + ggml_set_zero(opt->lbfgs.lms); + ggml_set_zero(opt->lbfgs.lmy); + } break; + } +} + +enum ggml_opt_result ggml_opt( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f) { + bool free_ctx = false; + if (ctx == NULL) { + struct ggml_init_params params_ctx = { + .mem_size = 16*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + ctx = ggml_init(params_ctx); + if (ctx == NULL) { + return GGML_OPT_NO_CONTEXT; + } + + free_ctx = true; + } + + enum ggml_opt_result result = GGML_OPT_OK; + + struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); + + ggml_opt_init(ctx, opt, params, 0); + result = ggml_opt_resume(ctx, opt, f); + + if (free_ctx) { + ggml_free(ctx); + } + + return result; +} + +enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f) { + + // build forward + backward compute graphs + struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 
1 : 0)); + + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; + struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; + + *gf = ggml_build_forward (f); + *gb = ggml_build_backward(ctx, gf, true); + + return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); +} + +enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { + + // build forward + backward compute graphs + enum ggml_opt_result result = GGML_OPT_OK; + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); + } break; + case GGML_OPT_LBFGS: + { + result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); + } break; + } + + if (opt->params.print_forward_graph) { + ggml_graph_print (gf); + ggml_graph_dump_dot(gf, NULL, "opt-forward.dot"); + } + + if (opt->params.print_backward_graph) { + ggml_graph_print (gb); + ggml_graph_dump_dot(gb, gf, "opt-backward.dot"); + } + + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; + + for (int b = 0; b < n; b += k) { + block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; + + quantize_row_q4_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < QK4_0; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK4_0*sizeof(block_q4_0)); +} + +size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK4_1 == 0); + const int nb = k / QK4_1; + + for (int b = 0; b < n; b += k) { + block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; + + quantize_row_q4_1_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < QK4_1; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK4_1*sizeof(block_q4_1)); +} + +size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK5_0 == 0); + const int nb = k / QK5_0; + + for (int b = 0; b < n; b += k) { + block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; + + quantize_row_q5_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, &y[i].qh, sizeof(qh)); + + for (int j = 0; j < QK5_0; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + // cast to 16 bins + const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK5_0*sizeof(block_q5_0)); +} + +size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK5_1 == 0); + const int nb = k / QK5_1; + + for (int b = 0; b < n; b += k) { + block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; + + quantize_row_q5_1_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, &y[i].qh, sizeof(qh)); + + for (int j = 0; j < QK5_1; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> 
(j + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + // cast to 16 bins + const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/QK5_1*sizeof(block_q5_1)); +} + +size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + for (int b = 0; b < n; b += k) { + block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; + + quantize_row_q8_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < QK8_0; ++j) { + const int8_t vi = y[i].qs[j]; + + hist[vi/16 + 8]++; + } + } + } + + return (n/QK8_0*sizeof(block_q8_0)); +} + +size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { + size_t result = 0; + switch (type) { + case GGML_TYPE_Q4_0: + { + GGML_ASSERT(start % QK4_0 == 0); + block_q4_0 * block = (block_q4_0*)dst + start / QK4_0; + result = ggml_quantize_q4_0(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q4_1: + { + GGML_ASSERT(start % QK4_1 == 0); + block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; + result = ggml_quantize_q4_1(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q5_0: + { + GGML_ASSERT(start % QK5_0 == 0); + block_q5_0 * block = (block_q5_0*)dst + start / QK5_0; + result = ggml_quantize_q5_0(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q5_1: + { + GGML_ASSERT(start % QK5_1 == 0); + block_q5_1 * block = (block_q5_1*)dst + start / QK5_1; + result = ggml_quantize_q5_1(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(start % QK8_0 == 0); + block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; + result = ggml_quantize_q8_0(src + start, block, n, n, hist); + } break; +#ifdef GGML_USE_K_QUANTS + case GGML_TYPE_Q2_K: + { + GGML_ASSERT(start % QK_K == 0); + block_q2_K * block = (block_q2_K*)dst + start / QK_K; + result = ggml_quantize_q2_K(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q3_K: + { + GGML_ASSERT(start % QK_K == 0); + block_q3_K * block = (block_q3_K*)dst + start / QK_K; + result = ggml_quantize_q3_K(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q4_K: + { + GGML_ASSERT(start % QK_K == 0); + block_q4_K * block = (block_q4_K*)dst + start / QK_K; + result = ggml_quantize_q4_K(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q5_K: + { + GGML_ASSERT(start % QK_K == 0); + block_q5_K * block = (block_q5_K*)dst + start / QK_K; + result = ggml_quantize_q5_K(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q6_K: + { + GGML_ASSERT(start % QK_K == 0); + block_q6_K * block = (block_q6_K*)dst + start / QK_K; + result = ggml_quantize_q6_K(src + start, block, n, n, hist); + } break; +#endif + case GGML_TYPE_F16: + { + int elemsize = sizeof(ggml_fp16_t); + ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n); + result = n * elemsize; + } break; + case GGML_TYPE_F32: + { + int elemsize = sizeof(float); + result = n * elemsize; + memcpy((uint8_t *)dst + start * elemsize, src + start, result); + } break; + default: + assert(false); + } + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +struct gguf_str { + uint64_t n; // GGUFv2 + char * data; +}; + +static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { + [GGUF_TYPE_UINT8] = sizeof(uint8_t), + [GGUF_TYPE_INT8] = sizeof(int8_t), + 
[GGUF_TYPE_UINT16] = sizeof(uint16_t), + [GGUF_TYPE_INT16] = sizeof(int16_t), + [GGUF_TYPE_UINT32] = sizeof(uint32_t), + [GGUF_TYPE_INT32] = sizeof(int32_t), + [GGUF_TYPE_FLOAT32] = sizeof(float), + [GGUF_TYPE_BOOL] = sizeof(bool), + [GGUF_TYPE_STRING] = sizeof(struct gguf_str), + [GGUF_TYPE_UINT64] = sizeof(uint64_t), + [GGUF_TYPE_INT64] = sizeof(int64_t), + [GGUF_TYPE_FLOAT64] = sizeof(double), + [GGUF_TYPE_ARRAY] = 0, // undefined +}; +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); + +static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { + [GGUF_TYPE_UINT8] = "u8", + [GGUF_TYPE_INT8] = "i8", + [GGUF_TYPE_UINT16] = "u16", + [GGUF_TYPE_INT16] = "i16", + [GGUF_TYPE_UINT32] = "u32", + [GGUF_TYPE_INT32] = "i32", + [GGUF_TYPE_FLOAT32] = "f32", + [GGUF_TYPE_BOOL] = "bool", + [GGUF_TYPE_STRING] = "str", + [GGUF_TYPE_ARRAY] = "arr", + [GGUF_TYPE_UINT64] = "u64", + [GGUF_TYPE_INT64] = "i64", + [GGUF_TYPE_FLOAT64] = "f64", +}; +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); + +union gguf_value { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + uint64_t uint64; + int64_t int64; + double float64; + bool bool_; + + struct gguf_str str; + + struct { + enum gguf_type type; + + uint64_t n; // GGUFv2 + void * data; + } arr; +}; + +struct gguf_kv { + struct gguf_str key; + + enum gguf_type type; + union gguf_value value; +}; + +struct gguf_header { + uint32_t magic; + uint32_t version; + uint64_t n_tensors; // GGUFv2 + uint64_t n_kv; // GGUFv2 +}; + +struct gguf_tensor_info { + struct gguf_str name; + + uint32_t n_dims; + uint64_t ne[GGML_MAX_DIMS]; + + enum ggml_type type; + + uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` + + // for writing API + const void * data; + size_t size; +}; + +struct gguf_context { + struct gguf_header header; + + struct gguf_kv * kv; + struct gguf_tensor_info * infos; + + size_t alignment; + size_t offset; // offset of `data` from beginning of file + size_t size; // size of `data` in bytes + + //uint8_t * padding; + void * data; +}; + +static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { + const size_t n = fread(dst, 1, size, file); + *offset += n; + return n == size; +} + +// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 +static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); + ok = ok && gguf_fread_el(file, p->data, p->n, offset); + + return ok; +} + +static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n; + ok = ok && gguf_fread_el(file, p->data, p->n, offset); + + return ok; +} + +struct gguf_context * gguf_init_empty(void) { + struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + + ctx->header.magic = GGUF_MAGIC; + ctx->header.version = GGUF_VERSION; + ctx->header.n_tensors = 0; + ctx->header.n_kv = 0; + + ctx->kv = NULL; + ctx->infos = NULL; + + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + ctx->offset = 0; + ctx->size = 0; + + ctx->data = NULL; + + return ctx; +} + +struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { + FILE * file = fopen(fname, "rb"); + if 
(!file) { + return NULL; + } + + // offset from start of file + size_t offset = 0; + + uint32_t magic = 0; + + // check the magic before making allocations + { + gguf_fread_el(file, &magic, sizeof(magic), &offset); + + if (magic != GGUF_MAGIC) { + fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic); + fclose(file); + return NULL; + } + } + + bool ok = true; + + struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + + // read the header + { + ctx->header.magic = magic; + + ctx->kv = NULL; + ctx->infos = NULL; + ctx->data = NULL; + + ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n_tensors = 0; + uint32_t n_kv = 0; + + ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset); + ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset); + + ctx->header.n_tensors = n_tensors; + ctx->header.n_kv = n_kv; + } else { + ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + } + + if (!ok) { + fprintf(stderr, "%s: failed to read header\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur; + if (ctx->header.version == 1) { + gguf_fread_str = gguf_fread_str_v1; + } + + // read the kv pairs + { + ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); + + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + //fprintf(stderr, "%s: reading kv %d\n", __func__, i); + + ok = ok && gguf_fread_str(file, &kv->key, &offset); + ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); + + //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); + + switch (kv->type) { + case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break; + case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break; + case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break; + case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break; + case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break; + case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break; + case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break; + case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break; + case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break; + case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break; + case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break; + case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break; + case GGUF_TYPE_ARRAY: + { + ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), 
&offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset); + kv->value.arr.n = n; + } else { + ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + } + + switch (kv->value.arr.type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: + case GGUF_TYPE_BOOL: + { + kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset); + } break; + case GGUF_TYPE_STRING: + { + kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str)); + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); + } + } break; + case GGUF_TYPE_ARRAY: + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + }; + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + }; + + if (!ok) { + break; + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + + // read the tensor infos + { + ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + info->ne[j] = 1; + } + + ok = ok && gguf_fread_str(file, &info->name, &offset); + ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); + for (uint32_t j = 0; j < info->n_dims; ++j) { + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t t = 0; + ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset); + info->ne[j] = t; + } else { + ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + } + } + ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); + ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor info\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + } + + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + + int alignment_idx = gguf_find_key(ctx, "general.alignment"); + if (alignment_idx != -1) { + ctx->alignment = gguf_get_val_u32(ctx, alignment_idx); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset_pad = offset % ctx->alignment; + + if (offset_pad != 0) { + offset += ctx->alignment - offset_pad; + fseek(file, offset, SEEK_SET); + } + } + + // store the current file offset - this is where the data section starts + ctx->offset = offset; + + // compute the total size of the data section, taking into account the alignment + { + ctx->size = 0; + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + const int64_t ne = + (int64_t) info->ne[0] * + (int64_t) info->ne[1] * + (int64_t) info->ne[2] * + (int64_t) info->ne[3]; + + if (ne % ggml_blck_size(info->type) != 0) { + fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size 
(%d)\n", + __func__, info->name.data, ne, ggml_blck_size(info->type)); + fclose(file); + gguf_free(ctx); + return NULL; + } + + const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); + + ctx->size += GGML_PAD(size_cur, ctx->alignment); + } + } + + // load the tensor data only if requested + if (params.ctx != NULL) { + // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob + // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of + // the ggml_tensor structs to the appropriate locations in the binary blob + + // compute the exact size needed for the new ggml_context + const size_t mem_size = + params.no_alloc ? + (ctx->header.n_tensors )*ggml_tensor_overhead() : + (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size; + + struct ggml_init_params pdata = { + .mem_size = mem_size, + .mem_buffer = NULL, + .no_alloc = params.no_alloc, + }; + + *params.ctx = ggml_init(pdata); + + struct ggml_context * ctx_data = *params.ctx; + + struct ggml_tensor * data = NULL; + + if (!params.no_alloc) { + data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size); + + ok = ok && data != NULL; + + // read the binary blob with the tensor data + ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor data\n", __func__); + fclose(file); + ggml_free(ctx_data); + gguf_free(ctx); + return NULL; + } + + ctx->data = data->data; + } + + ggml_set_no_alloc(ctx_data, true); + + // create the tensors + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + const int64_t ne[GGML_MAX_DIMS] = { + ctx->infos[i].ne[0], + ctx->infos[i].ne[1], + ctx->infos[i].ne[2], + ctx->infos[i].ne[3], + }; + + struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); + + ok = ok && cur != NULL; + + ggml_set_name(cur, ctx->infos[i].name.data); + + if (!ok) { + break; + } + + // point the data member to the appropriate location in the binary blob using the tensor infos + if (!params.no_alloc) { + //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file + cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read the tensor data\n", __func__); + fclose(file); + ggml_free(ctx_data); + gguf_free(ctx); + return NULL; + } + + ggml_set_no_alloc(ctx_data, params.no_alloc); + } + + fclose(file); + + return ctx; +} + +void gguf_free(struct gguf_context * ctx) { + if (ctx == NULL) { + return; + } + + if (ctx->kv) { + // free string memory - not great.. 
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + if (kv->key.data) { + free(kv->key.data); + } + + if (kv->type == GGUF_TYPE_STRING) { + if (kv->value.str.data) { + free(kv->value.str.data); + } + } + + if (kv->type == GGUF_TYPE_ARRAY) { + if (kv->value.arr.data) { + if (kv->value.arr.type == GGUF_TYPE_STRING) { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j]; + if (str->data) { + free(str->data); + } + } + } + free(kv->value.arr.data); + } + } + } + + free(ctx->kv); + } + + if (ctx->infos) { + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + if (info->name.data) { + free(info->name.data); + } + } + + free(ctx->infos); + } + + GGML_ALIGNED_FREE(ctx); +} + +const char * gguf_type_name(enum gguf_type type) { + return GGUF_TYPE_NAME[type]; +} + +int gguf_get_version(const struct gguf_context * ctx) { + return ctx->header.version; +} + +size_t gguf_get_alignment(const struct gguf_context * ctx) { + return ctx->alignment; +} + +size_t gguf_get_data_offset(const struct gguf_context * ctx) { + return ctx->offset; +} + +void * gguf_get_data(const struct gguf_context * ctx) { + return ctx->data; +} + +int gguf_get_n_kv(const struct gguf_context * ctx) { + return ctx->header.n_kv; +} + +int gguf_find_key(const struct gguf_context * ctx, const char * key) { + // return -1 if key not found + int keyfound = -1; + + const int n_kv = gguf_get_n_kv(ctx); + + for (int i = 0; i < n_kv; ++i) { + if (strcmp(key, gguf_get_key(ctx, i)) == 0) { + keyfound = i; + break; + } + } + + return keyfound; +} + +const char * gguf_get_key(const struct gguf_context * ctx, int i) { + return ctx->kv[i].key.data; +} + +enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) { + return ctx->kv[i].type; +} + +enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.type; +} + +const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.data; +} + +const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) { + struct gguf_kv * kv = &ctx->kv[key_id]; + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i]; + return str->data; +} + +int gguf_get_arr_n(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.n; +} + +uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint8; +} + +int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int8; +} + +uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint16; +} + +int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int16; +} + +uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint32; +} + +int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int32; +} + +float gguf_get_val_f32(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.float32; +} + +uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint64; +} + +int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int64; +} + +double gguf_get_val_f64(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.float64; +} + +bool gguf_get_val_bool(const struct 
gguf_context * ctx, int i) { + return ctx->kv[i].value.bool_; +} + +const char * gguf_get_val_str (const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.str.data; +} + +int gguf_get_n_tensors(const struct gguf_context * ctx) { + return ctx->header.n_tensors; +} + +int gguf_find_tensor(const struct gguf_context * ctx, const char * name) { + // return -1 if tensor not found + int tensorfound = -1; + + const int n_tensors = gguf_get_n_tensors(ctx); + + for (int i = 0; i < n_tensors; ++i) { + if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) { + tensorfound = i; + break; + } + } + + return tensorfound; +} + +size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) { + return ctx->infos[i].offset; +} + +char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) { + return ctx->infos[i].name.data; +} + +// returns the index +static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { + const int idx = gguf_find_key(ctx, key); + if (idx >= 0) { + return idx; + } + + const int n_kv = gguf_get_n_kv(ctx); + + ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv[n_kv].key.n = strlen(key); + ctx->kv[n_kv].key.data = strdup(key); + ctx->header.n_kv++; + + return n_kv; +} + +void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT8; + ctx->kv[idx].value.uint8 = val; +} + +void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT8; + ctx->kv[idx].value.int8 = val; +} + +void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT16; + ctx->kv[idx].value.uint16 = val; +} + +void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT16; + ctx->kv[idx].value.int16 = val; +} + +void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT32; + ctx->kv[idx].value.uint32 = val; +} + +void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT32; + ctx->kv[idx].value.int32 = val; +} + +void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_FLOAT32; + ctx->kv[idx].value.float32 = val; +} + +void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT64; + ctx->kv[idx].value.uint64 = val; +} + +void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT64; + ctx->kv[idx].value.int64 = val; +} + +void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_FLOAT64; + ctx->kv[idx].value.float64 = val; +} + +void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = 
GGUF_TYPE_BOOL; + ctx->kv[idx].value.bool_ = val; +} + +void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_STRING; + ctx->kv[idx].value.str.n = strlen(val); + ctx->kv[idx].value.str.data = strdup(val); +} + +void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = type; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]); + memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]); +} + +void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str)); + for (int i = 0; i < n; i++) { + struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i]; + str->n = strlen(data[i]); + str->data = strdup(data[i]); + } +} + +// set or add KV pairs from another context +void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { + for (uint32_t i = 0; i < src->header.n_kv; i++) { + switch (src->kv[i].type) { + case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break; + case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break; + case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break; + case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break; + case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break; + case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break; + case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break; + case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break; + case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break; + case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break; + case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break; + case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break; + case GGUF_TYPE_ARRAY: + { + if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { + const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { + data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; + } + gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); + free(data); + } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) { + GGML_ASSERT(false && "nested arrays not supported"); + } else { + gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); + } + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + } + } +} + +void gguf_add_tensor( + struct gguf_context * ctx, + const struct ggml_tensor * tensor) { + const int idx = ctx->header.n_tensors; + ctx->infos = 
realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); + + ctx->infos[idx].name.n = strlen(tensor->name); + ctx->infos[idx].name.data = strdup(tensor->name); + + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + ctx->infos[idx].ne[i] = 1; + } + + ctx->infos[idx].n_dims = tensor->n_dims; + for (int i = 0; i < tensor->n_dims; i++) { + ctx->infos[idx].ne[i] = tensor->ne[i]; + } + + ctx->infos[idx].type = tensor->type; + ctx->infos[idx].offset = 0; + ctx->infos[idx].data = tensor->data; + ctx->infos[idx].size = ggml_nbytes(tensor); + + if (ctx->header.n_tensors > 0) { + ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment); + } + + ctx->header.n_tensors++; +} + +void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) { + const int idx = gguf_find_tensor(ctx, name); + if (idx < 0) { + GGML_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].type = type; +} + +void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) { + const int idx = gguf_find_tensor(ctx, name); + if (idx < 0) { + GGML_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].data = data; + ctx->infos[idx].size = size; + + // update offsets + for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) { + ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment); + } +} + +//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) { +// fwrite(&val->n, sizeof(val->n), 1, file); +// fwrite(val->data, sizeof(char), val->n, file); +//} +// +//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) { +// fwrite(val, sizeof(char), size, file); +//} + +struct gguf_buf { + void * data; + size_t size; + size_t offset; +}; + +static struct gguf_buf gguf_buf_init(size_t size) { + struct gguf_buf buf = { + /*buf.data =*/ size == 0 ? 
NULL : malloc(size), + /*buf.size =*/ size, + /*buf.offset =*/ 0, + }; + + return buf; +} + +static void gguf_buf_free(struct gguf_buf buf) { + if (buf.data) { + free(buf.data); + } +} + +static void gguf_buf_grow(struct gguf_buf * buf, size_t size) { + if (buf->offset + size > buf->size) { + buf->size = 1.5*(buf->offset + size); + if (buf->data) { + buf->data = realloc(buf->data, buf->size); + } + } +} + +static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) { + gguf_buf_grow(buf, sizeof(val->n) + val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n)); + } + buf->offset += sizeof(val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val->data, val->n); + } + buf->offset += val->n; +} + +static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) { + gguf_buf_grow(buf, el_size); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val, el_size); + } + buf->offset += el_size; +} + +static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) { + // write header + gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic)); + gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version)); + gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors)); + gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv)); + + // write key-value pairs + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + gguf_bwrite_str(buf, &kv->key); + gguf_bwrite_el (buf, &kv->type, sizeof(kv->type)); + + switch (kv->type) { + case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break; + case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break; + case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break; + case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break; + case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break; + case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break; + case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break; + case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break; + case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break; + case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break; + case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break; + case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break; + case GGUF_TYPE_ARRAY: + { + gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type)); + gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) ); + + switch (kv->value.arr.type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: + case GGUF_TYPE_BOOL: + { + gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + } break; + case GGUF_TYPE_STRING: + { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]); + } + } 
break; + case GGUF_TYPE_ARRAY: + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + }; + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + }; + } + + // write tensor infos + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + gguf_bwrite_str(buf, &info->name); + gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims)); + for (uint32_t j = 0; j < info->n_dims; ++j) { + gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j])); + } + gguf_bwrite_el(buf, &info->type, sizeof(info->type)); + gguf_bwrite_el(buf, &info->offset, sizeof(info->offset)); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset = buf->offset; + const size_t offset_pad = GGML_PAD(offset, ctx->alignment); + + if (offset_pad != offset) { + uint8_t pad = 0; + for (size_t i = 0; i < offset_pad - offset; ++i) { + gguf_bwrite_el(buf, &pad, sizeof(pad)); + } + } + } + + if (only_meta) { + return; + } + + size_t offset = 0; + + // write tensor data + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + const size_t size = info->size; + const size_t size_pad = GGML_PAD(size, ctx->alignment); + + gguf_bwrite_el(buf, info->data, size); + + if (size_pad != size) { + uint8_t pad = 0; + for (size_t j = 0; j < size_pad - size; ++j) { + gguf_bwrite_el(buf, &pad, sizeof(pad)); + } + } + + GGML_ASSERT(offset == info->offset); + + offset += size_pad; + } +} + +void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) { + FILE * file = fopen(fname, "wb"); + if (!file) { + GGML_ASSERT(false && "failed to open file for writing"); + } + + struct gguf_buf buf = gguf_buf_init(16*1024); + + gguf_write_to_buf(ctx, &buf, only_meta); + + fwrite(buf.data, 1, buf.offset, file); + + gguf_buf_free(buf); + + fclose(file); +} + +size_t gguf_get_meta_size(const struct gguf_context * ctx) { + // no allocs - only compute size + struct gguf_buf buf = gguf_buf_init(0); + + gguf_write_to_buf(ctx, &buf, true); + + return buf.offset; +} + +void gguf_get_meta_data(const struct gguf_context * ctx, void * data) { + struct gguf_buf buf = gguf_buf_init(16*1024); + + gguf_write_to_buf(ctx, &buf, true); + + memcpy(data, buf.data, buf.offset); + + gguf_buf_free(buf); +} + +//////////////////////////////////////////////////////////////////////////////// + +int ggml_cpu_has_avx(void) { +#if defined(__AVX__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx2(void) { +#if defined(__AVX2__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512(void) { +#if defined(__AVX512F__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512_vbmi(void) { +#if defined(__AVX512VBMI__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512_vnni(void) { +#if defined(__AVX512VNNI__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_fma(void) { +#if defined(__FMA__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_neon(void) { +#if defined(__ARM_NEON) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_arm_fma(void) { +#if defined(__ARM_FEATURE_FMA) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_f16c(void) { +#if defined(__F16C__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_fp16_va(void) { +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_wasm_simd(void) { +#if 
defined(__wasm_simd128__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_blas(void) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_cublas(void) { +#if defined(GGML_USE_CUBLAS) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_clblast(void) { +#if defined(GGML_USE_CLBLAST) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_gpublas(void) { + return ggml_cpu_has_cublas() || ggml_cpu_has_clblast(); +} + +int ggml_cpu_has_sse3(void) { +#if defined(__SSE3__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_ssse3(void) { +#if defined(__SSSE3__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_vsx(void) { +#if defined(__POWER9_VECTOR__) + return 1; +#else + return 0; +#endif +} + +//////////////////////////////////////////////////////////////////////////////// diff --git a/seamless_communication/ggml/test_ggml_integration.py b/seamless_communication/ggml/test_ggml_integration.py new file mode 100644 index 0000000..55ec48d --- /dev/null +++ b/seamless_communication/ggml/test_ggml_integration.py @@ -0,0 +1,390 @@ +import ctypes +import functools +import logging +import sys +from ctypes import c_void_p +from pathlib import Path +from typing import Any, Iterator, Tuple + +import fairseq2.nn +import fairseq2.nn.transformer +import numpy as np +import pytest +import torch + +import ggml +from ctypes_utils import Ptr +from ggml import NativeObj +from ggml_convert import convert_model + +Ctx = ggml.ggml_context_p + +UNITY_MODELS = Path(__file__).parent / "examples/unity/models" +CTX_PARAMS = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) + + +@pytest.fixture(name="ctx") +def _ctx() -> Iterator[Ctx]: + """Allocate a new context with 16 MB of memory""" + try: + ctx = ggml.ggml_init(params=CTX_PARAMS) + yield ctx + finally: + ggml.ggml_free(ctx) + + +def test_ggml_bindings_work(ctx: Ctx) -> None: + # Instantiate tensors + x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) + a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) + b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) + + # Use ggml operations to build a computational graph + x2 = ggml.ggml_mul(ctx, x, x) + f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) + + gf = ggml.ggml_build_forward(f) + + # Set the input values + ggml.ggml_set_f32(x, 2.0) + ggml.ggml_set_f32(a, 3.0) + ggml.ggml_set_f32(b, 4.0) + + # Compute the graph + ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1) + + # Get the output value + output = ggml.ggml_get_f32_1d(f, 0) + assert output == 16.0 + + +def test_ggml_matmul(ctx: Ctx) -> None: + # Instantiate tensors + a = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 4, 2) + x = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 4, 3) + + # Use ggml operations to build a computational graph + y = ggml.ggml_mul_mat(ctx, a, x) + assert ggml.shape(y) == (3, 2) + gf = ggml.ggml_build_forward(y) + + # Set the input values + ggml.ggml_set_f32(x, 0.0) + for i in range(4 * 3): + ggml.ggml_set_f32_1d(x, i, i) + + ggml.ggml_set_f32(a, 0.0) + ggml.ggml_set_f32_1d(a, 1, 1.0) + ggml.ggml_set_f32_1d(a, 7, 1.0) + ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1) + output = [[ggml.ggml_get_f32_1d(y, j * 2 + i) for j in range(3)] for i in range(2)] + assert output == [[1, 5, 9], [3, 7, 11]] + + +def test_shape_works(ctx: Ctx) -> None: + """GGML shape order convention is the reverse from numpy""" 
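+    # In ggml, ne[0] is the contiguous (fastest-moving) dimension, so the ne arguments
+    # passed to ggml_new_tensor_*d read back reversed as a numpy shape, e.g.
+    # ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 11, 21) reports shape (21, 11) below.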
+ a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10) + assert ggml.shape(a) == (10,) + + b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 11, 21) + assert ggml.shape(b) == (21, 11) + + c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32) + assert ggml.shape(c) == (32, 22, 12) + + +def test_nb_works(ctx: Ctx) -> None: + a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10) + assert ggml.nb(a) == (4, 40, 40, 40) + + b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F16, 11, 21) + assert ggml.nb(b) == (2, 22, 462, 462) + + c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32) + assert ggml.nb(c) == (4, 48, 1056, 33792) + + +def test_strides_works(ctx: Ctx) -> None: + a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10) + assert ggml.strides(a) == np.ones((10,), dtype=np.float32).strides + + b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 11, 21) + assert ggml.strides(b) == np.ones((21, 11), dtype=np.float32).strides + + c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32) + assert ggml.strides(c) == np.ones((32, 22, 12), dtype=np.float32).strides + + +def test_to_numpy_works_with_f32(ctx: Ctx) -> None: + a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10) + na = ggml.to_numpy(a) + for i in range(10): + ggml.ggml_set_f32_1d(a, i, i) + assert na[5] == 5 + assert np.allclose(na, np.array(range(10), dtype=np.float32)) + ggml.ggml_set_f32_1d(a, 5, -1.5) + assert na[5] == -1.5 + + # Note: GGML order of dims is reversed wrt numpy shapes + b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 11, 21) + for i in range(11 * 21): + ggml.ggml_set_f32_1d(b, i, i) + nb = ggml.to_numpy(b) + # assert nb.shape == (21, 11) + assert nb[0, 5] == 5 + assert nb[3, 5] == 11 * 3 + 5 + assert np.allclose( + nb, np.array(range(11 * 21), dtype=np.float32).reshape(ggml.shape(b)) + ) + ggml.ggml_set_f32_1d(b, 11 * 3 + 5, -1.5) + assert nb[3, 5] == -1.5 + + sum_rows = ggml.ggml_sum_rows(ctx, b) + gf = ggml.ggml_build_forward(sum_rows) + ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1) + np_sum_rows = np.sum(nb, axis=-1, keepdims=True) + assert np_sum_rows.shape == ggml.shape(sum_rows) + for i in range(11): + assert np_sum_rows[i] == ggml.ggml_get_f32_1d(sum_rows, i) + + c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, 12, 22, 32) + for i in range(12 * 22 * 32): + ggml.ggml_set_f32_1d(c, i, i) + nc = ggml.to_numpy(c) + assert ggml.shape(c) == (32, 22, 12) + assert nc[3, 5, 11] == 22 * 12 * 3 + 12 * 5 + 11 + assert np.allclose( + nc, np.array(range(12 * 22 * 32), dtype=np.float32).reshape(ggml.shape(c)) + ) + ggml.ggml_set_f32_1d(c, 22 * 12 * 3 + 12 * 5 + 11, -1.5) + assert nc[3, 5, 11] == -1.5 + + +def test_from_numpy_works_with_f32(ctx: Ctx) -> None: + a = np.random.normal(size=(10,)).astype(dtype=np.float32) + ga = ggml.from_numpy(ctx, a) + assert ggml.shape(ga) == (10,) + assert ggml.nb(ga) == ggml.nb(ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 10)) + assert np.allclose(a, ggml.to_numpy(ga)) + + a = np.random.normal(size=(11, 21)).astype(dtype=np.float32) + ga = ggml.from_numpy(ctx, a) + assert ggml.shape(ga) == (11, 21) + assert ggml.nb(ga) == ggml.nb( + ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, *a.shape[::-1]) + ) + assert np.allclose(a, ggml.to_numpy(ga)) + + a = np.random.normal(size=(12, 22, 32)).astype(dtype=np.float32) + ga = ggml.from_numpy(ctx, a) + assert ggml.shape(ga) == (12, 22, 32) + assert ggml.nb(ga) == ggml.nb( + ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F32, *a.shape[::-1]) + ) + assert np.allclose(a, 
ggml.to_numpy(ga)) + + +def test_to_numpy_works_with_f16(ctx: Ctx) -> None: + # We explicitly fill the tensor otherwise they might have non-zero values in them. + a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F16, 10) + na = ggml.to_numpy(a) + ggml.ggml_set_f32(a, 2.14) + assert np.allclose(na, np.ones((10,), dtype=np.float16) * 2.14) + ggml.ggml_set_f32(a, 4.28) + assert np.allclose(na, np.ones((10,), dtype=np.float16) * 4.28) + + b = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F16, 11, 21) + nb = ggml.to_numpy(b) + ggml.ggml_set_f32(b, 4.18) + assert np.allclose(nb, np.ones((21, 11), dtype=np.float16) * 4.18) + ggml.ggml_set_f32(b, 5.12) + assert np.allclose(nb, np.ones((21, 11), dtype=np.float16) * 5.12) + + c = ggml.ggml_new_tensor_3d(ctx, ggml.GGML_TYPE_F16, 12, 22, 32) + nc = ggml.to_numpy(c) + ggml.ggml_set_f32(c, 3.16) + assert np.allclose(nc, np.ones((32, 22, 12), dtype=np.float16) * 3.16) + ggml.ggml_set_f32(c, 5.08) + assert np.allclose(nc, np.ones((32, 22, 12), dtype=np.float16) * 5.08) + + +def test_from_numpy_works_with_f16(ctx: Ctx) -> None: + a = np.random.normal(size=(10,)).astype(dtype=np.float16) + ga = ggml.from_numpy(ctx, a) + assert np.allclose(a, ggml.to_numpy(ga)) + a = np.random.normal(size=(11, 21)).astype(dtype=np.float16) + ga = ggml.from_numpy(ctx, a) + assert np.allclose(a, ggml.to_numpy(ga)) + a = np.random.normal(size=(12, 22, 32)).astype(dtype=np.float16) + ga = ggml.from_numpy(ctx, a) + assert np.allclose(a, ggml.to_numpy(ga)) + + +def test_to_numpy_works_with_transposed(ctx: Ctx) -> None: + ga = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 10, 5) + a = ggml.to_numpy(ga) + a[...] = np.arange(50).reshape(5, 10).astype(dtype=np.float32) + + gat = ggml.ggml_transpose(ctx, ga) + at = ggml.to_numpy(gat) + assert np.allclose(a.T, at) + + +def test_ggml_slice(ctx: Ctx) -> None: + ga = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 10, 5) + a = ggml.to_numpy(ga) + a[...] = np.arange(50).reshape(5, 10).astype(dtype=np.float32) + + gs0 = ggml.ggml_slice(ctx, ga, 0, 3, 7) + s0 = ggml.to_numpy(gs0) + assert np.allclose(a[:, 3:7], s0) + + gs1 = ggml.ggml_slice(ctx, ga, 1, 2, 5) + s1 = ggml.to_numpy(gs1) + assert np.allclose(a[2:5, :], s1) + + +@pytest.mark.xfail(reason="to_numpy not implemented") +def test_ggml_transpose_and_slice(ctx: Ctx) -> None: + ga = ggml.ggml_new_tensor_2d(ctx, ggml.GGML_TYPE_F32, 10, 5) + a = ggml.to_numpy(ga) + a[...] 
= np.arange(50).reshape(5, 10).astype(dtype=np.float32) + + gat = ggml.ggml_transpose(ctx, ga) + gs0 = ggml.ggml_slice(ctx, gat, 0, 2, 5) + s0 = ggml.to_numpy(gs0) + assert np.allclose(a.T[:, 2:5], s0) + + gs1 = ggml.ggml_slice(ctx, gat, 1, 3, 7) + s1 = ggml.to_numpy(gs1) + assert np.allclose(a.T[3:7, :], s1) + + +def test_numpy_mul_mat(ctx: Ctx) -> None: + slen, d_in, d_out = (5, 4, 2) + # torch.nn and fairseq2.nn assumes (seq_len, dim) to represent inputs, + x = np.zeros((slen, d_in), dtype=np.float32) # (seq_len, dim_in) + x[0, :] = [1, 1 / 3, 0, 0] + + weight = np.eye(d_out, d_in, dtype=np.float32) + weight[1, 1] = 1 + # assert weight.shape == (d_out, d_in) # (dim_out, dim_in) + y_exp = x @ weight.T # (seq_len, dim_out) + + gx = ggml.from_numpy(ctx, x) # (dim_in, seq_len) + gw = ggml.from_numpy(ctx, weight) # (dim_in, dim_out) + # gb = ggml.from_numpy(ctx, linear.bias.numpy()) # (dim_out) + # GGML linear impl + assert ggml.ggml_can_mul_mat(gw, gx) + # gy = ggml.ggml_add(ctx, ggml.ggml_mul_mat(ctx, gw, gx), gb) # (dim_out, seq_len) + gy = ggml.ggml_mul_mat(ctx, gw, gx) # (dim_out, seq_len) + + ggml.build_and_compute(ctx, gy) + + y = ggml.to_numpy(gy) + assert np.allclose(y_exp, y) + + +@pytest.mark.parametrize("ndim", [2, 3, 4]) +def test_flatten(ctx: Ctx, ndim: int) -> None: + shape = [11, 7, 5, 3][:ndim] # Prime numbers to avoid surprises + numel = functools.reduce(lambda a, b: a * b, shape, 1) + x = torch.arange(numel, dtype=torch.float32).reshape(shape) + for torch_dim in range(ndim - 1): + ggml_dim = ndim - 1 - torch_dim + n = x.shape[torch_dim + 1] + + gx = ggml.from_numpy(ctx, x) + gx1 = ggml.ggml_flatten_1d(ctx, gx, ggml_dim - 1) + gy = ggml.ggml_unflatten_1d(ctx, gx1, ggml_dim - 1, n) + + x1 = x.flatten(torch_dim, torch_dim + 1) + y = x1.unflatten(torch_dim, (-1, n)) + assert y.shape == x.shape + assert np.allclose(y.numpy(), x.numpy()) + assert x1.shape == ggml.shape(gx1) + assert np.allclose(x1.numpy(), ggml.to_numpy(gx1)) + assert y.shape == ggml.shape(gy) + assert np.allclose(y.numpy(), ggml.to_numpy(gy)) + + +@torch.no_grad() +def test_torch_spda_vs_ggml_flash_attn(ctx: Ctx) -> None: + slen, d_in, num_heads = (5, 4, 2) + torch.random.manual_seed(0) + q = torch.zeros((num_heads, slen, d_in)) + torch.nn.init.uniform_(q, -1, 1) + k = torch.zeros((num_heads, slen, d_in)) + torch.nn.init.uniform_(k, -1, 1) + v = torch.zeros((num_heads, slen, d_in)) + torch.nn.init.uniform_(v, -1, 1) + + # Note: we are using x for both keys and queries, so every position + # attends mostly to itself, hence y_exp looks a bit like arange(slen) + y_exp = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True) + y_exp = y_exp.numpy() + gq = ggml.from_numpy(ctx, q.numpy()) + gk = ggml.from_numpy(ctx, k.numpy()) + # ggml flash attention expect a different order of axis for v: + # (H, slen, H_dim) -> (H, H_dim, slen) + gv = ggml.from_numpy(ctx, v.transpose(1, 2).contiguous().numpy()) + assert ggml.shape(gv) == (num_heads, d_in, slen) + gy = ggml.ggml_flash_attn(ctx, gq, gk, gv, True) + gf = ggml.ggml_build_forward(gy) + ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1) + + y = ggml.to_numpy(gy) + assert np.allclose(y_exp, y) + + +@pytest.mark.parametrize("shape", [(5, 8, 4), (2, 5, 8, 4)]) +def test_ggml_softmax_vs_torch(ctx: Ctx, shape: Tuple[int, ...]) -> None: + x = torch.empty(shape) + torch.nn.init.uniform_(x, -1, 1) + y_exp = torch.softmax(x, dim=-1).numpy() + + gx = ggml.from_numpy(ctx, x.numpy()) + gy = ggml.ggml_soft_max(ctx, gx) + + ggml.build_and_compute(ctx, gy) 
+ + y = ggml.to_numpy(gy) + assert np.allclose(y_exp, y, rtol=1e-3) + assert np.allclose(np.argmax(y_exp, axis=-1), np.argmax(y, axis=-1)) + + +def test_can_return_hypothesis_ptr(ctx: Ctx) -> None: + hyp_ptr = ggml._testing_return_hypothesis_ptr(ctx) + + hyp0, hyp1 = hyp_ptr[0], hyp_ptr[1] + assert ggml.to_numpy(hyp0.seq).tolist() == [314] + assert hyp0.score == pytest.approx(3.14) + + assert ggml.to_numpy(hyp1.seq).tolist() == [421] + assert hyp1.score == pytest.approx(4.21) + + +@pytest.mark.parametrize("inplace", ["", "inplace"]) +def test_set_2d(ctx: Ctx, inplace: bool): + a = torch.empty((5, 3, 2)) + torch.nn.init.uniform_(a, -1, 1) + b = torch.empty((3, 2)) + torch.nn.init.uniform_(b, -1, 1) + a_original = a.clone() + + # make a copy of `a` before we modify it + ga = ggml.from_numpy(ctx, a.clone().numpy()) + gb = ggml.from_numpy(ctx, b.numpy()) + a[3, ...] = b + + set_2d = ggml.ggml_set_2d_inplace if inplace else ggml.ggml_set_2d + ga_updated = set_2d(ctx, ga, gb, ggml.nb(ga)[1], ggml.nb(ga)[2] * 3) + ggml.build_and_compute(ctx, ga_updated) + + a_updated = ggml.to_numpy(ga if inplace else ga_updated) + assert np.allclose(a.numpy(), a_updated) + + if not inplace: + # When not using set_2d_inplace, the original tensor is unmodified. + assert np.allclose(ggml.to_numpy(ga), a_original.numpy()) + assert ga.contents.data != ga_updated.contents.data diff --git a/seamless_communication/ggml/test_unity_cpp.py b/seamless_communication/ggml/test_unity_cpp.py new file mode 100644 index 0000000..4dcad06 --- /dev/null +++ b/seamless_communication/ggml/test_unity_cpp.py @@ -0,0 +1,783 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
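+
+"""Integration tests for the GGML port of the fairseq2/SeamlessM4T layers.
+
+Most tests below follow the same round-trip pattern, sketched here using the
+LayerNorm test as the example (layer name and tolerance are taken from
+test_LayerNorm_forward below, not prescriptive):
+
+    x = torch.empty((2, 21, 1024))
+    torch.nn.init.uniform_(x, -1, 1)
+    y_exp = load_pt_model().text_encoder.layers[0].ffn_layer_norm(x).numpy()
+
+    gx = ggml.from_numpy(ctx, x)        # torch tensor -> ggml tensor
+    gy = ggml.forward("LayerNorm", g_model, "text_encoder.layers.0.ffn_layer_norm", gx)
+    ggml.build_and_compute(ctx, gy)     # build the forward graph and run it
+    y = ggml.to_numpy(gy)               # ggml tensor -> numpy
+
+    assert np.allclose(y_exp, y, atol=1e-5)
+"""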
+ +import ctypes +import functools +from ctypes import c_void_p +from pathlib import Path +from typing import Any, Iterator, List, Tuple + +import fairseq2.nn +import fairseq2.nn.transformer +import numpy as np +import pytest +import torch +import torchaudio +from fairseq2.data.audio import WaveformToFbankConverter +from seamless_communication.inference.generator import SequenceGeneratorOptions +from fairseq2.models.wav2vec2.feature_extractor import Wav2Vec2FbankFeatureExtractor +from seamless_communication.inference.translator import Modality, Translator + +import ggml +from ctypes_utils import NULLPTR, Ptr +from ggml import NativeObj +from ggml_convert import convert_model, read_layer_config +import requests + +Ctx = ggml.ggml_context_p + +UNITY_MODELS = Path(__file__).parent / "examples/unity/models" +FAIRSEQ2_CPP = Path(__file__).parent / "examples/unity/fairseq2.cpp" +UNITY_FLASH_ATTN = "\n# define UNITY_FLASH_ATTN 0\n" not in FAIRSEQ2_CPP.read_text() + +DATA = Path(__file__).parent / "test_data" +LOCAL_AUDIO_SAMPLE_PATH = DATA / "LJ037-0171_sr16k.wav" +TEST_AUDIO_SAMPLE_URL = ( + "https://dl.fbaipublicfiles.com/seamless/tests/LJ037-0171_sr16k.wav" +) + + +MB = 1024 * 1024 + + +@pytest.fixture(name="ctx") +def _ctx() -> Iterator[Ctx]: + """Allocate a new context with 1024 MB of memory""" + try: + mem_size = 16 * MB + memory = torch.zeros(mem_size, dtype=torch.uint8) + ctx = ggml.ggml_init( + params=ggml.ggml_init_params( + mem_size=mem_size, + mem_buffer=ctypes.c_void_p(memory.data_ptr()), + no_alloc=True, + ) + ) + with torch.inference_mode(): + yield ctx + finally: + ggml.ggml_free(ctx) + + +@functools.lru_cache() +def _load_g_model_once() -> NativeObj: + model_file = Path(__file__).parent / "seamlessM4T_medium.ggml" + if not model_file.exists(): + convert_model("seamlessM4T_medium", model_file) + return ggml.load_fairseq2_ggml_file(model_file) + + +@pytest.fixture() +def g_model(ctx: Ctx) -> c_void_p: + model = _load_g_model_once() + ggml.lib.fairseq2_model_set_inference_ctx(model.ptr, ctx) + return model.ptr + + +@functools.lru_cache(maxsize=1) +def load_translator() -> Translator: + return Translator("seamlessM4T_medium", None, device=torch.device("cpu")) + + +def load_pt_model() -> Any: + return load_translator().model + + +def download_sample_audio() -> Any: + response = requests.get(TEST_AUDIO_SAMPLE_URL, stream=True) + with open(DATA / "LJ037-0171_sr16k.wav", "wb") as file: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + file.write(chunk) + + +def test_convert_linear(tmp_path: Path) -> None: + module = fairseq2.nn.Linear(16, 24, True) + + layer_config = read_layer_config(module) + assert layer_config == {"input_dim": 16, "output_dim": 24} + + module_file = tmp_path / "module.ggml" + convert_model(module, module_file) + g_module = ggml.load_fairseq2_ggml_file(module_file) + + for k, v in layer_config.items(): + assert ( + ggml.fairseq2_model_layer_config_int(g_module.ptr, bytes(k, "ascii")) == v + ) + +def test_convert_linear_fp16(tmp_path: Path, ctx: Ctx) -> None: + pt_model = torch.nn.ModuleDict({"linear": fairseq2.nn.Linear(16, 24, True)}) + + layer_config = read_layer_config(pt_model) + assert layer_config == {"linear.input_dim": 16, "linear.output_dim": 24} + + ggml_file = tmp_path / "linear.ggml" + convert_model(pt_model, ggml_file, fp16=True) + assert ggml_file.stat().st_size < (16 * 24 + 24) * 2 * 1.5 + g_model = ggml.load_fairseq2_ggml_file(ggml_file) + ggml.lib.fairseq2_model_set_inference_ctx(g_model.ptr, ctx) + + x = torch.empty((2, 5, 16)) + 
torch.nn.init.uniform_(x, -1, 1) + y_exp = pt_model.linear(x).numpy() + gx = ggml.from_numpy(ctx, x) + gy = ggml.forward("Linear", g_model.ptr, "linear", gx) + ggml.build_and_compute(ctx, gy) + y = ggml.to_numpy(gy) + + assert np.allclose(y_exp, y, atol=1e-3) + + +def test_causal_attention_mask(ctx: Ctx): + x = torch.zeros((1, 10, 32)) + generator = fairseq2.nn.transformer.CausalAttentionMaskFactory() + mask_exp = generator(x, x).materialize().numpy() + + gx = ggml.from_numpy(ctx, x) + gmask = ggml.causal_attention_mask(ctx, gx) + ggml.build_and_compute(ctx, gmask) + mask = ggml.to_numpy(gmask) + + assert mask_exp.shape == (10, 10) + assert mask.shape == (10, 10) + assert np.all(mask == mask_exp) + + x = x[:, :8, :] + mask_exp = generator(x, x).materialize().numpy() + gx = ggml.from_numpy(ctx, x) + gmask = ggml.causal_attention_mask(ctx, gx) + ggml.build_and_compute(ctx, gmask) + mask = ggml.to_numpy(gmask) + + assert mask_exp.shape == (8, 8) + assert mask.shape == (8, 8) + assert np.all(mask == mask_exp) + + +def test_LayerNorm_forward(ctx: Ctx, g_model: c_void_p) -> None: + x = torch.empty((2, 21, 1024)) + torch.nn.init.uniform_(x, -1, 1) + + pt_model = load_pt_model() + y_exp = pt_model.text_encoder.layers[0].ffn_layer_norm(x).numpy() + gx = ggml.from_numpy(ctx, x) + gy = ggml.forward("LayerNorm", g_model, "text_encoder.layers.0.ffn_layer_norm", gx) + ggml.build_and_compute(ctx, gy) + + y = ggml.to_numpy(gy) + assert np.allclose(y_exp, y, atol=1e-5) + + +def test_Linear_forward(ctx: Ctx, g_model: c_void_p) -> None: + x = torch.empty((2, 21, 1024)) + torch.nn.init.uniform_(x, -1, 1) + + pt_model = load_pt_model() + y_exp = pt_model.text_encoder.layers[0].ffn.inner_proj(x).numpy() + gx = ggml.from_numpy(ctx, x) + gy = ggml.forward("Linear", g_model, "text_encoder.layers.0.ffn.inner_proj", gx) + gf = ggml.build_and_compute(ctx, gy, dump="dot/test_Linear_forward.dot") + + y = ggml.to_numpy(gy) + assert np.allclose(y_exp, y, atol=1e-5) + + +def test_FeedForwardNetwork_forward(ctx: Ctx, g_model: c_void_p) -> None: + x = torch.empty((2, 21, 1024)) # (bs, seq_len, model_dim) + torch.nn.init.uniform_(x, -1 / 32, 1 / 32) + + # Test FFN without LayerNorm + pt_model = load_pt_model() + y_exp = pt_model.text_encoder.layers[0].ffn(x).numpy() + gx = ggml.from_numpy(ctx, x) + gy = ggml.forward( + "StandardFeedForwardNetwork", g_model, "text_encoder.layers.0.ffn", gx + ) + ggml.build_and_compute(ctx, gy) + + y = ggml.to_numpy(gy) + assert np.allclose(y_exp, y, atol=1e-5) + + +@pytest.mark.parametrize("lengths", [(11, 21), (21, 13)]) +def test_MultiheadAttention_forward( + ctx: Ctx, g_model: c_void_p, lengths: Tuple[int, int] +) -> None: + x = torch.empty((2, 21, 1024)) + torch.random.manual_seed(0) + torch.nn.init.uniform_(x, -1, 1) + + # Note: we use different lengths for queries and keys, + # this tests the implementation in decoding context too. 
+ # Note2: ggml_flash_attn requires that we have more keys than queries + # qlen, klen = (11, 21) if flash_attn else (21, 13) + qlen, klen = lengths + xq = x[:, :qlen] + xk = x[:, :klen] + if qlen > klen and UNITY_FLASH_ATTN: + pytest.skip(reason="flash_attn requires qlen > klen") + + gxq = ggml.from_numpy(ctx, xq.contiguous()) + ggml.ggml_set_name(gxq, b"xq") + gxk = ggml.from_numpy(ctx, xk.contiguous()) + ggml.ggml_set_name(gxk, b"xk") + ggml.ggml_set_no_alloc(ctx, True) + gy = ggml.forward( + "MultiheadAttention", + g_model, + "text_encoder.layers.0.self_attn", + gxq, + gxk, + gxk, + NULLPTR, # TODO: tests with causal attention masks + ) + gf = ggml.build_and_compute(ctx, gy, dump="dot/test_MultiheadAttention_forward") + y = ggml.to_numpy(gy) + nodes = ggml.nodes(gf) + node_buffers = set(t.contents.data for t in nodes.values()) + + pt_model = load_pt_model() + self_attn = pt_model.text_encoder.layers[0].self_attn + + # If buffers are overlapping, reading node contents, can be misleading. + overlap = len(node_buffers) < len(nodes) + if not overlap: + q_exp = self_attn._project_q(xq, None).numpy().reshape(2 * 16, qlen, 64) + q = ggml.to_numpy(nodes[b"q"]) + assert q.shape == q_exp.shape + assert np.allclose(q_exp, q, atol=1e-5) + + attn_weights_hook = fairseq2.nn.transformer.AttentionWeightStoreHook([]) + self_attn.register_attn_weight_hook(attn_weights_hook) + + y_exp = self_attn(xq, None, xk, None, xk).numpy() + + # with flash_attn we don't have attn_weights + naive_attn = b"attn_weights" in nodes + if naive_attn and not overlap: + attn_weights = ggml.to_numpy(nodes[b"attn_weights"]).reshape(-1, 16, qlen, klen) + [(_, attn_weights_exp)] = attn_weights_hook._storage + attn_weights_exp = attn_weights_exp.numpy() + assert attn_weights_exp.shape == attn_weights.shape + # GGML is very agressively reducing small softmax weights to 0, + # so the error isn't that small + assert np.allclose(attn_weights_exp, attn_weights, atol=1e-3) + # But the sums should be close to 1 + assert np.allclose(np.sum(attn_weights, axis=-1), np.ones((2, 16, qlen))) + # And the maximum index should match the original ones. 
+ assert np.allclose( + np.argmax(attn_weights_exp, axis=-1), np.argmax(attn_weights, axis=-1) + ) + assert y.shape == y_exp.shape + assert np.allclose(y_exp, y, atol=1e-2 if naive_attn else 1e-4) + + +def test_MultiheadAttention_forward_self_attn_with_cache( + ctx: Ctx, g_model: c_void_p +) -> None: + pt_model = load_pt_model() + attn = pt_model.text_decoder.layers[0].self_attn + + x = torch.empty((2, 21, 1024)) + torch.random.manual_seed(0) + torch.nn.init.uniform_(x, -1, 1) + + state_bag = fairseq2.nn.IncrementalStateBag(100) + + with ggml.fairseq2_kv_cache_alloc(g_model, 16 * MB, 2, 21): + # Incremental decoding + for t in range(3): + xq = x[:, t : t + 1] + + gxq = ggml.from_numpy(ctx, xq.contiguous()) + ggml.ggml_set_name(gxq, b"xq") + gy = ggml.forward( + "MultiheadAttention", + g_model, + "text_decoder.layers.0.self_attn", + gxq, + gxq, + gxq, + None, # type: ignore + ) + gf = ggml.build_and_compute( + ctx, + gy, + dump=f"dot/test_MultiheadAttention_forward_self_attn_with_cache_{t}.dot", + ) + nodes = ggml.nodes(gf) + gk_cache = ggml.to_numpy( + nodes[b"text_decoder.layers.0.self_attn.k (step=%d)" % t] + ) + assert gk_cache.shape == (2, t + 1, 1024) + gk_cache = gk_cache.reshape(2, t + 1, 16, 64).transpose(0, 2, 1, 3) + assert gk_cache.shape == (2, 16, t + 1, 64) + + y_exp = attn(xq, None, xq, None, xq, state_bag=state_bag).numpy() + assert y_exp.shape == (2, 1, 1024) + state = state_bag.get_state(attn, fairseq2.nn.transformer.AttentionState) + state_bag.increment_step_nr() + assert state is not None + + k_cache = state.get()[0].numpy() + assert k_cache.shape == (2, 16, t + 1, 64) + assert np.allclose(gk_cache, k_cache, atol=1e-3) + + y = ggml.to_numpy(gy) + assert np.allclose(y, y_exp, atol=1e-2) + + +def test_MultiheadAttention_forward_cross_attn_with_cache( + ctx: Ctx, g_model: c_void_p +) -> None: + pt_model = load_pt_model() + attn = pt_model.text_decoder.layers[0].encoder_decoder_attn + + x = torch.empty((2, 21, 1024)) + torch.random.manual_seed(0) + torch.nn.init.uniform_(x, -1, 1) + + state_bag = fairseq2.nn.IncrementalStateBag(100) + + with ggml.fairseq2_kv_cache_alloc(g_model, 16 * MB, 2, 21): + # Incremental decoding, the keys come from the encoder, and don't change during decoding + xk = x[:, :11] + gxk = ggml.from_numpy(ctx, xk.contiguous(), name=b"xk") + + for t in range(3): + xq = x[:, t : t + 1] + + gxq = ggml.from_numpy(ctx, xq.contiguous()) + ggml.ggml_set_name(gxq, b"xq") + gy = ggml.forward( + "MultiheadAttention", + g_model, + "text_decoder.layers.0.encoder_decoder_attn", + gxq, + gxk, + gxk, + None, # type: ignore + ) + gf = ggml.build_and_compute( + ctx, + gy, + dump=f"dot/test_MultiheadAttention_forward_cross_attn_with_cache_{t}.dot", + ) + y = ggml.to_numpy(gy) + nodes = ggml.nodes(gf) + leaves = ggml.leafs(gf) + + if t > 0: + # the cache only appear in the graph during the second call + state = state_bag.get_state( + attn, fairseq2.nn.transformer.AttentionState + ) + assert state is not None + assert np.allclose( + state.get()[0].transpose(1, 2).numpy(), + ggml.to_numpy( + nodes[ + b"text_decoder.layers.0.encoder_decoder_attn.k_cache (view)" + ] + ), + atol=1e-3, + ) + + state_bag.increment_step_nr() + y_exp = attn(xq, None, xk, None, xk, state_bag=state_bag).numpy() + assert y_exp.shape == (2, 1, 1024) + assert np.allclose(y, y_exp, atol=1e-2) + + +def test_StandardTransformerEncoderLayer_forward(ctx: Ctx, g_model: c_void_p) -> None: + x = torch.empty((2, 21, 1024)) + torch.random.manual_seed(0) + torch.nn.init.uniform_(x, -1, 1) + + pt_model = 
load_pt_model() + layer = pt_model.text_encoder.layers[0] + + gx = ggml.from_numpy(ctx, x) + ggml.ggml_set_name(gx, b"x") + gy = ggml.forward( + "StandardTransformerEncoderLayer", + g_model, + "text_encoder.layers.0", + gx, + None, # TODO support padding mask + ) + gf = ggml.build_and_compute(ctx, gy) + + y = ggml.to_numpy(gy) + + y_exp, _ = layer(x, padding_mask=None) + y_exp = y_exp.numpy() + + assert y.shape == y_exp.shape + assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-2) + + +def test_StandardConformerEncoderLayer_forward(ctx: Ctx, g_model: c_void_p) -> None: + pt_model = load_pt_model() + x = torch.rand(1, 137, 1024) + + layer = pt_model.speech_encoder.inner.layers[0] + gx = ggml.from_numpy(ctx, x[0]) + ggml.ggml_set_name(gx, b"x") + gy = ggml.forward( + "StandardConformerEncoderLayer", + g_model, + "speech_encoder.inner.layers.0", + gx, + None, # TODO support padding mask + ) + gf = ggml.build_and_compute(ctx, gy) + + y = ggml.to_numpy(gy) + + y_exp, _ = layer(x, padding_mask=None) + y_exp = y_exp.squeeze(0).numpy() + assert y.shape == y_exp.shape + assert np.allclose(y_exp, y, atol=2e-3) + + +def test_StandardConformerEncoderAdaptorLayer_forward( + ctx: Ctx, g_model: c_void_p +) -> None: + pt_model = load_pt_model() + torch.random.manual_seed(0) + x = torch.rand(1, 137, 1024) + layer = pt_model.speech_encoder.adaptor_layers[0] + gx = ggml.from_numpy(ctx, x[0]) + ggml.ggml_set_name(gx, b"x") + gy = ggml.forward( + "StandardConformerEncoderAdaptorLayer", + g_model, + "speech_encoder.adaptor_layers.0", + gx, + None, # TODO support padding mask + ) + gf = ggml.build_and_compute(ctx, gy) + + y = ggml.to_numpy(gy) + + y_exp, _ = layer(x, None) + y_exp = y_exp.numpy() + + assert y.shape == y_exp.shape + assert np.allclose(y_exp, y, atol=2e-3) + + +def test_StandardTransformerEncoder_forward(ctx: Ctx, g_model: c_void_p) -> None: + x = torch.empty((2, 21, 1024)) + padding_mask = fairseq2.nn.padding.PaddingMask(torch.tensor([21, 21]), 21) + torch.random.manual_seed(0) + torch.nn.init.uniform_(x, -1, 1) + + gx = ggml.from_numpy(ctx, x) + ggml.ggml_set_name(gx, b"x") + gpad = ggml.from_numpy(ctx, padding_mask.materialize()) + ggml.ggml_set_name(gpad, b"padding_mask") + gy = ggml.forward( + "StandardTransformerEncoder", + g_model, + "text_encoder", + gx, + None, # TODO support padding mask + ) + gf = ggml.build_and_compute(ctx, gy) + + y = ggml.to_numpy(gy) + + pt_model = load_pt_model() + y_exp, _ = pt_model.text_encoder(x, padding_mask) + y_exp = y_exp.numpy() + + assert y.shape == y_exp.shape + assert np.allclose(y_exp, y, atol=5e-3) + + +def test_StandardConformerEncoder_forward(ctx: Ctx, g_model: c_void_p) -> None: + pt_model = load_pt_model() + if not LOCAL_AUDIO_SAMPLE_PATH.exists(): + download_sample_audio() + wav, _ = torchaudio.load(LOCAL_AUDIO_SAMPLE_PATH) + gx = ggml.from_numpy(ctx, wav * 2**15) # Apply scale before sending into ggml! 
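+    # torchaudio.load returns float32 samples in [-1, 1]; the graph expects the
+    # waveform pre-scaled to int16 range, matching waveform_scale=2**15 in the
+    # WaveformToFbankConverter used to build the reference output below.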
+ ggml.ggml_set_name(gx, b"x") + gy = ggml.forward( + "StandardConformerEncoder", + g_model, + "speech_encoder", + gx, + None, # TODO support padding mask + ) + gf = ggml.build_and_compute(ctx, gy) + + y = ggml.to_numpy(gy) + + cache = DATA / "test_StandardConformerEncoder_forward.npy" + if not cache.exists(): + converter = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=True, + ) + converter_input = { + "waveform": wav.transpose(0, 1), + "sample_rate": 16000.0, + "format": -1, + } + + pt_model = load_pt_model() + speech_encoder_input = pt_model.speech_encoder_frontend( + converter(converter_input)["fbank"].unsqueeze(0), None + )[0] + + y_exp, _ = pt_model.speech_encoder(speech_encoder_input, None) + y_exp = y_exp.numpy() + np.save(cache, y_exp) + else: + y_exp = np.load(cache) + + assert y.shape == y_exp.shape + assert np.allclose(y_exp, y, atol=1e-2) + + +def test_WaveformToFbank_forward(ctx: Ctx, g_model: c_void_p) -> None: + converter = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=True, + ) + extractor = Wav2Vec2FbankFeatureExtractor(80, stride=2, sample_every_k=1) + if not LOCAL_AUDIO_SAMPLE_PATH.exists(): + download_sample_audio() + wav, _ = torchaudio.load(LOCAL_AUDIO_SAMPLE_PATH) + gx = ggml.from_numpy(ctx, wav * 2**15) # Apply scale before sending into ggml! + ggml.ggml_set_name(gx, b"x") + + gy = ggml.forward("WaveformToFbank", g_model, "", gx) + gf = ggml.build_and_compute(ctx, gy) + + y = ggml.to_numpy(gy) + converter_input = { + "waveform": wav.transpose(0, 1), + "sample_rate": 16000.0, + "format": -1, + } + y_exp, _ = extractor(converter(converter_input)["fbank"].unsqueeze(0), None) + y_exp = y_exp.squeeze(0).numpy() + + assert y.shape == y_exp.shape + assert np.allclose(y_exp, y, atol=4e-3) # reduce? 
error is from standardization + + +def test_PositionalEmbedding_forward(ctx: Ctx, g_model: c_void_p) -> None: + seq = torch.zeros((4, 20, 1024), dtype=torch.float32) + + pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=1) + y_exp = pos_encoder(seq, None)[0].numpy() + + gseq = ggml.from_numpy(ctx, seq[0].clone().numpy()) + ggml.ggml_set_name(gseq, b"seq") + gy = ggml.forward( + "PositionalEmbedding", g_model, "text_decoder_frontend.pos_encoder", gseq + ) + gf = ggml.build_and_compute(ctx, gy, dump=True) + y = ggml.to_numpy(gy) + + assert y.shape == y_exp.shape + assert np.allclose(y_exp, y, atol=1e-6) + + +def test_PositionalEmbedding_forward_with_cache(ctx: Ctx, g_model: c_void_p) -> None: + seq = torch.zeros((4, 20, 1024), dtype=torch.float32) + pos_encoder = fairseq2.nn.SinusoidalPositionEncoder(1024, 55, _legacy_pad_idx=1) + pos_encoder.eval() + state_bag = fairseq2.nn.IncrementalStateBag(100) + + with ggml.fairseq2_kv_cache_alloc(g_model, 16 * MB, 2, 21): + # Incremental decoding + for t in range(20): + gseq = ggml.from_numpy(ctx, seq[:, t : t + 1, :].numpy()) + ggml.ggml_set_name(gseq, b"seq") + gy = ggml.forward( + "PositionalEmbedding", + g_model, + "text_decoder_frontend.pos_encoder", + gseq, + ) + gf = ggml.build_and_compute(ctx, gy, dump=t == 1) + y = ggml.to_numpy(gy) + + y_exp = pos_encoder(seq[:, t : t + 1, :], None, state_bag=state_bag).numpy() + state_bag.increment_step_nr() + assert y.shape == y_exp.shape + assert np.allclose(y_exp, y, atol=1e-6) + + +def test_TransformerEmbeddingFrontend_forward(ctx: Ctx, g_model: c_void_p) -> None: + seq = torch.arange(2 * 20).reshape(2, 20) + seq[1, 15:] = 0 # padding for second sentence + seq_len = torch.tensor([20, 15]) + gseq = ggml.from_numpy(ctx, seq.numpy().astype(np.int32)) + + ggml.ggml_set_name(gseq, b"seq") + gy = ggml.forward( + "TransformerEmbeddingFrontend", g_model, "text_decoder_frontend", gseq + ) + ggml.build_and_compute(ctx, gy) + y = ggml.to_numpy(gy) + + pt_model = load_pt_model() + y_exp, _ = pt_model.text_decoder_frontend(seq, seq_len) + y_exp = y_exp.numpy() + + assert y.shape == y_exp.shape + assert np.allclose(y_exp, y, atol=1e-6) + + +def test_StandardTransformerDecoderLayer_forward(ctx: Ctx, g_model: c_void_p) -> None: + x = torch.empty((2, 13, 1024)) + encoder_out = torch.empty((2, 21, 1024)) + torch.random.manual_seed(0) + torch.nn.init.uniform_(x, -1, 1) + torch.nn.init.uniform_(encoder_out, -1, 1) + + self_attn_mask = fairseq2.nn.transformer.CausalAttentionMaskFactory()(x, x) + gx = ggml.from_numpy(ctx, x) + ggml.ggml_set_name(gx, b"x") + gself_attn_mask = ggml.from_numpy(ctx, self_attn_mask.materialize().numpy()) + ggml.ggml_set_name(gself_attn_mask, b"self_attn_mask") + genc = ggml.from_numpy(ctx, encoder_out) + ggml.ggml_set_name(genc, b"encoder_out") + gy = ggml.forward( + "StandardTransformerDecoderLayer", + g_model, + "text_decoder.layers.0", + gx, + gself_attn_mask, + genc, + NULLPTR, # TODO support padding mask, + ) + ggml.build_and_compute(ctx, gy, dump=True) + y = ggml.to_numpy(gy) + + pt_model = load_pt_model() + y_exp, _ = pt_model.text_decoder.layers[0](x, None, encoder_output=encoder_out, self_attn_mask=self_attn_mask) + y_exp = y_exp.numpy() + + assert y.shape == y_exp.shape + # We still have some numerical imprecision + assert np.allclose(y_exp, y, atol=0.1) + assert np.sum(np.abs(y_exp-y) > 1e-2) < 20 + + +def test_StandardTransformerDecoder_forward(ctx: Ctx, g_model: c_void_p) -> None: + x = torch.empty((2, 13, 1024)) + encoder_out = torch.empty((2, 21, 1024)) 
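+    # x: (bs, tgt_len, model_dim) decoder input; encoder_out: (bs, src_len, model_dim),
+    # consumed by the decoder's cross-attention.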
+ padding_mask = fairseq2.nn.padding.PaddingMask(torch.tensor([13, 13]), 13) + torch.random.manual_seed(0) + torch.nn.init.uniform_(x, -1, 1) + torch.nn.init.uniform_(encoder_out, -1, 1) + gx = ggml.from_numpy(ctx, x) + ggml.ggml_set_name(gx, b"x") + gpad = ggml.from_numpy(ctx, padding_mask.materialize()) + ggml.ggml_set_name(gpad, b"padding_mask") + genc = ggml.from_numpy(ctx, encoder_out) + gy = ggml.forward( + "StandardTransformerDecoder", + g_model, + "text_decoder", + gx, + None, # TODO support padding mask, + genc, + None, + ) + ggml.build_and_compute(ctx, gy) + y = ggml.to_numpy(gy) + + pt_model = load_pt_model() + y_exp, _ = pt_model.text_decoder(x, padding_mask, encoder_out, None) + y_exp = y_exp.numpy() + + assert y.shape == y_exp.shape + assert np.allclose(y_exp, y, atol=1e-3) # TODO: those tests are failing now + + +def test_s2tt(ctx: Ctx, g_model: c_void_p): + if not LOCAL_AUDIO_SAMPLE_PATH.exists(): + download_sample_audio() + src_audio_wav, _ = torchaudio.load(LOCAL_AUDIO_SAMPLE_PATH) + sample_file = DATA / "LJ037-0171_sr16k.wav.trans" + translator = load_translator() + if not sample_file.exists(): + decoded_audio = { + "waveform": src_audio_wav.t(), + "sample_rate": 16000.0, + "format": -1, + } + src = translator.collate(translator.convert_to_fbank(decoded_audio))["fbank"] + + text_out, _ = translator.get_prediction( + translator.model, + translator.text_tokenizer, + translator.unit_tokenizer, + src["seqs"], + padding_mask=None, + input_modality=Modality.SPEECH, + output_modality=Modality.TEXT, + tgt_lang="cmn", + text_generation_opts=SequenceGeneratorOptions(), + unit_generation_opts=None, + ) + + tgt_text = str(text_out[0]) + assert tgt_text == "专家的检查和证据使该委员会得出了结论,可能有五次枪击." + with open(sample_file, "w") as f: + f.write(tgt_text) + + with open(sample_file, "r") as exp: + exp_tgt_text = exp.readlines()[0].strip() + + # Apply scale before sending into ggml! 
+ gx = ggml.from_numpy(ctx, src_audio_wav * 2**15) + ggml.ggml_set_name(gx, b"x") + encoder_out = ggml.forward( + "StandardConformerEncoder", + g_model, + "speech_encoder", + gx, + NULLPTR, # TODO support padding mask + ) + gf = ggml.build_and_compute(ctx, encoder_out) + + beam_size = 5 + opts = ggml.SequenceGeneratorOptions( + beam_size=beam_size, + soft_max_seq_len_a=1, + soft_max_seq_len_b=200, + hard_max_seq_len=500, + ) + job = ggml.SequenceGeneratorJob( + opts=opts, + prefix_seq=ggml.from_numpy(ctx, np.array([3, 256200]).astype(np.int32)), + pad_idx=0, + unk_idx=1, + bos_idx=2, + eos_idx=3, + ) + result_ptr = ggml.generate_sequence(g_model, Ptr(job), encoder_out, NULLPTR, ctx) + results = [result_ptr[i] for i in range(beam_size) if result_ptr[i].seq != None] + tokens = [ + translator.text_tokenizer.model.index_to_token(id) + for id in ggml.to_numpy(results[0].seq).tolist() + ][2:-1] + tokens = "".join(tokens).replace("▁", " ")[1:] + assert tokens == exp_tgt_text diff --git a/seamless_communication/ggml/tests/CMakeLists.txt b/seamless_communication/ggml/tests/CMakeLists.txt new file mode 100644 index 0000000..a1cedf0 --- /dev/null +++ b/seamless_communication/ggml/tests/CMakeLists.txt @@ -0,0 +1,357 @@ +# check systems +if (NOT UNAME_S) + execute_process(COMMAND uname -s OUTPUT_VARIABLE UNAME_S) +endif() +if (NOT UNAME_P) + execute_process(COMMAND uname -p OUTPUT_VARIABLE UNAME_P) +endif() +if (NOT UNAME_M) + execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M) +endif() +#message(STATUS "UNAME_S: ${UNAME_S} UNAME_P: ${UNAME_P} UNAME_M: ${UNAME_M}") + +# Mac OS + Arm can report x86_64 +# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 +if (UNAME_S MATCHES "Darwin") + if (NOT UNAME_P MATCHES "arm") + execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M) + if (SYSCTL_M MATCHES "1") + #set(UNAME_P "arm") + #set(UNAME_M "arm64") + message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lea +d to bad performance. 
For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-#1282546789") + endif() + endif() +endif() + +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + message(STATUS "ARM detected") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=apple-m1") +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PPC64 detected") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpower9-vector") +else() + message(STATUS "x86 detected") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c") + if (UNAME_S MATCHES "Darwin") + execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "AVX1.0") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "AVX2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + if (AVX1_M MATCHES "FMA") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + elseif (UNAME_S MATCHES "Linux") + message(STATUS "Linux detected") + execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "avx") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "avx2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M) + if (FMA_M MATCHES "fma") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M) + if (F16C_M MATCHES "f16c") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + endif() + execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M) + if (SSE3_M MATCHES "sse3") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") + endif() + elseif (UNAME_S MATCHES "Haiku") + message(STATUS "Haiku detected") + execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M) + if (AVX1_M MATCHES "avx") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M) + if (AVX2_M MATCHES "avx2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M) + if (FMA_M MATCHES "fma") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") + endif() + execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M) + if (F16C_M MATCHES "f16c") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") + endif() + elseif (MSVC) + if (GGML_AVX512) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX512") + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. 
+ if (GGML_AVX512_VBMI) + add_compile_definitions(__AVX512VBMI__) + endif() + if (GGML_AVX512_VNNI) + add_compile_definitions(__AVX512VNNI__) + endif() + elseif (GGML_AVX2) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2") + elseif (GGML_AVX) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX") + endif() + else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2") + endif() +endif() + +# on APPLE - include Accelerate framework +if (APPLE AND NOT GGML_NO_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) + else() + message(WARNING "Accelerate framework not found") + endif() +endif() + +if (GGML_OPENBLAS) + set(OPENBLAS_INCLUDE_SEARCH_PATHS + /usr/include + /usr/include/openblas + /usr/include/openblas-base + /usr/local/include + /usr/local/include/openblas + /usr/local/include/openblas-base + /opt/OpenBLAS/include + $ENV{OpenBLAS_HOME} + $ENV{OpenBLAS_HOME}/include + ) + find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) + find_library(OPENBLAS_LIB NAMES openblas libopenblas) + if (OPENBLAS_LIB) + message(STATUS "OpenBLAS found") + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${OPENBLAS_LIB}) + set(GGML_EXTRA_INCS ${GGML_EXTRA_INCS} ${OPENBLAS_INC}) + set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_OPENBLAS) + else() + message(WARNING "OpenBLAS not found") + endif() +endif() + +# undefine NDEBUG so asserts don't get disabled in tests +add_definitions(-UNDEBUG) + +# +# test-vec0 + +set(TEST_TARGET test-vec0) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) + +# +# test-vec1 (x86) +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86") + set(TEST_TARGET test-vec1) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml) +endif() + +# +# test-vec2 (arm) +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") + set(TEST_TARGET test-vec2) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml) +endif() + +# +# test-grad0 + +set(TEST_TARGET test-grad0) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-opt + +set(TEST_TARGET test-opt) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-quantize-fns + +set(TEST_TARGET test-quantize-fns) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-quantize-perf + +set(TEST_TARGET test-quantize-perf) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-mul-mat0 + +set(TEST_TARGET test-mul-mat0) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} 
PRIVATE ggml ${GGML_EXTRA_LIBS}) +if (MSVC) + target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB +endif() +target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS}) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-mul-mat1 (arm) + +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE) + set(TEST_TARGET test-mul-mat1) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS}) + target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS}) +endif() + +# +# test-blas0 (arm) + +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE) + set(TEST_TARGET test-blas0) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS}) + target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS}) + add_test(NAME ${TEST_TARGET} COMMAND $ 128 128 128) + set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +endif() + +# +# test-mul-mat2 + +set(TEST_TARGET test-mul-mat2) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test0 + +set(TEST_TARGET test0) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test1 + +set(TEST_TARGET test1) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +if (MSVC) + target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB +endif() +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test2 + +set(TEST_TARGET test2) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test3 + +set(TEST_TARGET test3) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-pool + +set(TEST_TARGET test-pool) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +if (MSVC) + target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB +endif() +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-conv-transpose + +set(TEST_TARGET test-conv-transpose) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) + +# +# test-rel-pos + +set(TEST_TARGET test-rel-pos) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) + +# +# test-svd0 (arm/x86) + +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE) + 
set(TEST_TARGET test-svd0) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS}) + target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS}) +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86" AND GGML_OPENBLAS) + set(TEST_TARGET test-svd0) + add_executable(${TEST_TARGET} ${TEST_TARGET}.c) + target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS}) + target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS}) +endif() + +# +# test-customop + +set(TEST_TARGET test-customop) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +if (MSVC) + target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB +endif() +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + +# +# test-xpos + +set(TEST_TARGET test-xpos) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") diff --git a/seamless_communication/ggml/tests/test-blas0.c b/seamless_communication/ggml/tests/test-blas0.c new file mode 100644 index 0000000..0977d3e --- /dev/null +++ b/seamless_communication/ggml/tests/test-blas0.c @@ -0,0 +1,267 @@ +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +uint64_t get_time_us() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000000 + tv.tv_usec; +} + +// +// naive implementation +// + +void mul_mat_f32_0( + const float * restrict src0, // M x K + const float * restrict src1, // N x K (transposed) + float * dst, + int m, int n, int k) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float sum = 0; + for (int l = 0; l < k; l++) { + sum += src0[i*k + l] * src1[j*k + l]; + } + dst[j*m + i] = sum; + } + } +} + +int main(int argc, const char ** argv) { + if (argc < 4) { + printf("Usage: %s M N K\n", argv[0]); + return 1; + } + + const int n_threads = 1; + + int M = atoi(argv[1]); + int N = atoi(argv[2]); + int K = atoi(argv[3]); + + srand(time(NULL)); + + if (M == 0) M = rand() % 1000 + 1; + if (N == 0) N = rand() % 1000 + 1; + if (K == 0) K = rand() % 1000 + 1; + + printf("M = %d, N = %d, K = %d\n", M, N, K); + + float * src0 = malloc(sizeof(float)*M*K); + float * src1 = malloc(sizeof(float)*N*K); + float * dst0 = malloc(sizeof(float)*M*N); // naive + float * dst1 = malloc(sizeof(float)*M*N); // blas + + struct ggml_init_params params = { + .mem_size = 2048ul*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * s0_f32 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, K, M); + struct ggml_tensor * s1_f32 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, K, N); + + struct ggml_tensor * s0_f16 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, K, M); + struct ggml_tensor * s1_f16 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, K, N); + + for (int j = 0; j < M; j++) { + for (int i = 0; i < K; i++) { + //src0[j*K + i] = j; + src0[j*K + i] = 1e-3*(rand() % 1000); + } + } + + for (int j = 0; j < N; j++) { + for (int i = 0; i < K; i++) { + //src1[j*K + i] = j + 1; + src1[j*K + i] = 1e-3*(rand() % 1000); + } + } + + // copy src0 to s0_f32 + { + float * p_f32 = s0_f32->data; + ggml_fp16_t * p_f16 = s0_f16->data; + for (int i = 0; i < 
M; i++) { + for (int j = 0; j < K; j++) { + p_f32[i*K + j] = src0[i*K + j]; + p_f16[i*K + j] = ggml_fp32_to_fp16(src0[i*K + j]); + } + } + } + + // copy src1 to s1_f32 + { + float * p_f32 = s1_f32->data; + ggml_fp16_t * p_f16 = s1_f16->data; + for (int i = 0; i < N; i++) { + for (int j = 0; j < K; j++) { + p_f32[i*K + j] = src1[i*K + j]; + p_f16[i*K + j] = ggml_fp32_to_fp16(src1[i*K + j]); + } + } + } + + const clock_t start = clock(); + const uint64_t start_us = get_time_us(); + + double iM = 1.0/M; + mul_mat_f32_0(src0, src1, dst0, M, N, K); + + // Use BLAS sgemm from Accelerate framework + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, N, M, K, 1.0f, src1, K, src0, K, 0.0f, dst1, M); + + struct ggml_tensor * dst2 = NULL; + struct ggml_tensor * dst3 = NULL; + + { + dst2 = ggml_mul_mat(ctx0, s0_f32, s1_f32); + + struct ggml_cgraph gf = ggml_build_forward(dst2); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + } + + { + dst3 = ggml_mul_mat(ctx0, s0_f16, s1_f32); + + struct ggml_cgraph gf = ggml_build_forward(dst3); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + } + + bool ok_blas = true; + bool ok_ggml_f32 = true; + bool ok_ggml_f16 = true; + + // check BLAS + for (int i = 0; i < M*N; i++) { + if (fabs(dst0[i] - dst1[i])/fabs(dst0[i]) > 0.0001) { + printf("dst0[%d] = %f, dst1[%d] = %f\n", i, dst0[i], i, dst1[i]); + ok_blas = false; + } + } + + // check ggml (f32) + { + float * p = dst2->data; + for (int i = 0; i < M*N; i++) { + if (fabs(dst0[i] - p[i])/fabs(dst0[i]) > 0.0001) { + printf("dst0[%d] = %f, dst2[%d] = %f\n", i, dst0[i], i, p[i]); + ok_ggml_f32 = false; + } + } + } + + // check ggml (f16) + { + float * p = dst3->data; + for (int i = 0; i < M*N; i++) { + if (fabs(dst0[i] - p[i])/fabs(dst0[i]) > 0.01) { + printf("dst0[%d] = %f, dst3[%d] = %f\n", i, dst0[i], i, p[i]); + ok_ggml_f16 = false; + } + } + } + + { + const clock_t end = clock(); + const uint64_t end_us = get_time_us(); + printf("%s: elapsed ticks: %ld\n", __func__, end - start); + } + +#if 0 + // print src0 + printf("src0:\n"); + for (int i = 0; i < M; i++) { + for (int j = 0; j < K; j++) { + printf("%4.1f ", src0[i*K+j]); + } + printf("\n"); + } + + // print src1 + printf("src1:\n"); + for (int i = 0; i < N; i++) { + for (int j = 0; j < K; j++) { + printf("%4.1f ", src1[i*K+j]); + } + printf("\n"); + } + + printf("\n"); + printf("dst0 (naive):\n"); + for (int j = 0; j < N; j++) { + for (int i = 0; i < M; i++) { + printf("%4.1f ", dst0[j*M+i]); + } + printf("\n"); + } + + printf("\n"); + printf("dst1 (BLAS):\n"); + for (int j = 0; j < N; j++) { + for (int i = 0; i < M; i++) { + printf("%4.1f ", dst1[j*M+i]); + } + printf("\n"); + } + + printf("\n"); + printf("dst2 (ggml f32):\n"); + for (int j = 0; j < N; j++) { + for (int i = 0; i < M; i++) { + printf("%4.1f ", ((float *)dst2->data)[j*M+i]); + } + printf("\n"); + } + + printf("\n"); + printf("dst3 (ggml f16):\n"); + for (int j = 0; j < N; j++) { + for (int i = 0; i < M; i++) { + printf("%4.1f ", ((float *)dst3->data)[j*M+i]); + } + printf("\n"); + } + + printf("\n"); +#endif + + free(src0); + free(src1); + free(dst0); + free(dst1); + + ggml_free(ctx0); + + printf("ok_blas = %d\n", ok_blas); + if (!ok_blas) { + printf("ERROR: BLAS failed\n"); + } + + printf("ok_ggml_f32 = %d\n", ok_ggml_f32); + if (!ok_ggml_f32) { + printf("ERROR: ggml failed\n"); + } + + printf("ok_ggml_f16 = %d\n", ok_ggml_f16); + if (!ok_ggml_f16) { + printf("ERROR: ggml failed\n"); + } + + return (ok_blas && ok_ggml_f32 && ok_ggml_f16) ? 
0 : 1; +} diff --git a/seamless_communication/ggml/tests/test-conv-transpose.c b/seamless_communication/ggml/tests/test-conv-transpose.c new file mode 100644 index 0000000..13888cf --- /dev/null +++ b/seamless_communication/ggml/tests/test-conv-transpose.c @@ -0,0 +1,175 @@ +#include "ggml/ggml.h" + +#include +#include +#include + +struct ggml_context* make_ctx(void) { + struct ggml_init_params params = { + .mem_size = 2 * 1024 * 1024, + }; + + return ggml_init(params); +} + +// void printf_tensor(struct ggml_tensor * t) { +// if (t->type == GGML_TYPE_F32) { +// const float * t_d = ggml_get_data_f32(t); +// for (int i = 0; i < t->ne[2]; ++i) { +// for (int j = 0; j < t->ne[1]; ++j) { +// for (int k = 0; k < t->ne[0]; ++k) { +// printf("%.1f ", t_d[i * t->ne[1] * t->ne[0] + j * t->ne[0] + k]); +// } +// printf("\n"); +// } +// printf("---\n"); +// } +// } +// else if (t->type == GGML_TYPE_F16) { +// const ggml_fp16_t * t_d = ggml_get_data(t); +// for (int i = 0; i < t->ne[2]; ++i) { +// for (int j = 0; j < t->ne[1]; ++j) { +// for (int k = 0; k < t->ne[0]; ++k) { +// printf("%.1f ", ggml_fp16_to_fp32(t_d[i * t->ne[1] * t->ne[0] + j * t->ne[0] + k])); +// } +// printf("\n"); +// } +// printf("---\n"); +// } +// } +// else { +// printf("unknown type\n"); +// } +// } + +void check_tensor(struct ggml_tensor * t, float * expected_t_d, int ne0, int ne1, int ne2) { + GGML_ASSERT(t->type == GGML_TYPE_F32); + GGML_ASSERT(t->ne[0] == ne0); + GGML_ASSERT(t->ne[1] == ne1); + GGML_ASSERT(t->ne[2] == ne2); + for (int i2 = 0; i2 < ne2; ++i2) { + for (int i1 = 0; i1 < ne1; ++i1) { + for (int i0 = 0; i0 < ne0; ++i0) { + float expected = *(expected_t_d + i2 * ne1 * ne0 + i1 * ne0 + i0); + float actual = ggml_get_data_f32(t)[i2 * ne1 * ne0 + i1 * ne0 + i0]; + GGML_ASSERT(expected == actual); + } + } + } +} + +int main(int argc, const char** argv) { + + float buf_f32[1024]; + for (int i = 0; i < 1024; ++i) { + buf_f32[i] = (float)i; + } + + ggml_fp16_t buf_f16[1024]; + for (int i = 0; i < 1024; ++i) { + buf_f16[i] = ggml_fp32_to_fp16((float)i); + } + + float expected_out_1[3][3][4] = { + { + {72.0, 162.0, 188.0, 106.0}, + {192.0, 430.0, 490.0, 274.0}, + {132.0, 292.0, 326.0, 180.0}, + }, + { + {96.0, 218.0, 260.0, 146.0}, + {264.0, 590.0, 682.0, 378.0}, + {180.0, 396.0, 446.0, 244.0}, + }, + { + {120.0, 274.0, 332.0, 186.0}, + {336.0, 750.0, 874.0, 482.0}, + {228.0, 500.0, 566.0, 308.0}, + }, + }; + + float expected_out_2[3][4][6] = { + { + {72.0, 78.0, 84.0, 92.0, 96.0, 106.0}, + {84.0, 90.0, 100.0, 108.0, 116.0, 126.0}, + {108.0, 120.0, 120.0, 134.0, 132.0, 148.0}, + {132.0, 144.0, 148.0, 162.0, 164.0, 180.0}, + }, + { + {96.0, 102.0, 116.0, 124.0, 136.0, 146.0}, + {108.0, 114.0, 132.0, 140.0, 156.0, 166.0}, + {156.0, 168.0, 176.0, 190.0, 196.0, 212.0}, + {180.0, 192.0, 204.0, 218.0, 228.0, 244.0}, + }, + { + {120.0, 126.0, 148.0, 156.0, 176.0, 186.0}, + {132.0, 138.0, 164.0, 172.0, 196.0, 206.0}, + {204.0, 216.0, 232.0, 246.0, 260.0, 276.0}, + {228.0, 240.0, 260.0, 274.0, 292.0, 308.0}, + }, + }; + + float expected_out_3[3][5][8] = { + { + {72.0, 78.0, 0.0, 84.0, 92.0, 0.0, 96.0, 106.0}, + {84.0, 90.0, 0.0, 100.0, 108.0, 0.0, 116.0, 126.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {108.0, 120.0, 0.0, 120.0, 134.0, 0.0, 132.0, 148.0}, + {132.0, 144.0, 0.0, 148.0, 162.0, 0.0, 164.0, 180.0}, + }, + { + {96.0, 102.0, 0.0, 116.0, 124.0, 0.0, 136.0, 146.0}, + {108.0, 114.0, 0.0, 132.0, 140.0, 0.0, 156.0, 166.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {156.0, 168.0, 0.0, 176.0, 190.0, 0.0, 196.0, 212.0}, + {180.0, 192.0, 
0.0, 204.0, 218.0, 0.0, 228.0, 244.0}, + }, + { + {120.0, 126.0, 0.0, 148.0, 156.0, 0.0, 176.0, 186.0}, + {132.0, 138.0, 0.0, 164.0, 172.0, 0.0, 196.0, 206.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {204.0, 216.0, 0.0, 232.0, 246.0, 0.0, 260.0, 276.0}, + {228.0, 240.0, 0.0, 260.0, 274.0, 0.0, 292.0, 308.0}, + }, + }; + + // conv transpose 2d with stride 1, 2 & 3 + { + struct ggml_context * ctx = make_ctx(); + + struct ggml_tensor * t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 2, 2, 1); // w x h x cin + memcpy(t->data, buf_f32, ggml_nbytes(t)); + + struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 2, 2, 3, 2); // w x h cin x cout + memcpy(k->data, buf_f16, ggml_nbytes(k)); + + struct ggml_tensor * out_1 = ggml_conv_transpose_2d_p0(ctx, k, t, 1); + struct ggml_tensor * out_2 = ggml_conv_transpose_2d_p0(ctx, k, t, 2); + struct ggml_tensor * out_3 = ggml_conv_transpose_2d_p0(ctx, k, t, 3); + + struct ggml_cgraph gf_1 = ggml_build_forward(out_1); + struct ggml_cgraph gf_2 = ggml_build_forward(out_2); + struct ggml_cgraph gf_3 = ggml_build_forward(out_3); + + ggml_graph_compute_with_ctx(ctx, &gf_1, 1); + ggml_graph_compute_with_ctx(ctx, &gf_2, 1); + ggml_graph_compute_with_ctx(ctx, &gf_3, 1); + + // printf("in\n"); + // printf_tensor(t); + // printf("\n\nkernel\n"); + // printf_tensor(k); + // printf("\n\nout\n"); + // printf_tensor(out); + // printf("\n\nout_2\n"); + // printf_tensor(out_2); + // printf("\n\nout_3\n"); + // printf_tensor(out_3); + + check_tensor(out_1, (float*)expected_out_1, 4, 3, 3); + check_tensor(out_2, (float*)expected_out_2, 6, 4, 3); + check_tensor(out_3, (float*)expected_out_3, 8, 5, 3); + + } + return 0; +} diff --git a/seamless_communication/ggml/tests/test-customop.c b/seamless_communication/ggml/tests/test-customop.c new file mode 100644 index 0000000..ec261ec --- /dev/null +++ b/seamless_communication/ggml/tests/test-customop.c @@ -0,0 +1,223 @@ +#include "ggml/ggml.h" +#include +#include +#include +#include + +#if defined(_WIN32) +#include +typedef volatile LONG atomic_int; +static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} +#else +#include +#endif + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + +struct ggml_context * make_ctx(void) { + struct ggml_init_params params = { + /*.mem_size =*/ 1 * 1024 * 1024, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + return ggml_init(params); +} + +char g_userdata[] = "ggml"; +atomic_int g_custom1_count = 0; +atomic_int g_custom2_count = 0; +atomic_int g_custom3_count = 0; + +void custom1(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata) { + // check that the userdata is correct + assert(userdata == NULL); + assert(ggml_are_same_shape(dst, a)); + + atomic_fetch_add(&g_custom1_count, 1); + + const float * a_data = ggml_get_data_f32(a); + float * dst_data = ggml_get_data_f32(dst); + + // this assumes that the tensors are contiguous + assert(ggml_is_contiguous(dst)); + assert(ggml_is_contiguous(a)); + + // parallelize by elements + const int ne = (int)ggml_nelements(dst); + const int dr = (ne + nth - 1) / nth; + const int ie0 = dr * ith; + const int ie1 = MIN(ie0 + dr, ne); + + for (int i = ie0; i < ie1; ++i) { + dst_data[i] = a_data[i] * 2; + } +} + +void custom2(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata) { + // check that the userdata is correct + assert(userdata == g_userdata); + assert(strcmp(userdata, "ggml") == 0); + assert(ggml_are_same_shape(dst, a)); + assert(ggml_are_same_shape(dst, b)); + + atomic_fetch_add(&g_custom2_count, 1); + + const float * a_data = ggml_get_data_f32(a); + const float * b_data = ggml_get_data_f32(b); + float * dst_data = ggml_get_data_f32(dst); + + // parallelize by rows + const int nr = (int)ggml_nrows(dst); + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + // number of columns + const int nc = (int)dst->ne[0]; + + // this assumes that the tensors are contiguous + assert(ggml_is_contiguous(dst)); + assert(ggml_is_contiguous(a)); + assert(ggml_is_contiguous(b)); + + for (int ir = ir0; ir < ir1; ++ir) { + for (int ic = 0; ic < nc; ++ic) { + const int i = ir * nc + ic; + dst_data[i] = a_data[i] + b_data[i]; + } + } +} + +void custom3(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata) { + // check that the userdata is correct + assert(userdata == g_userdata); + assert(strcmp(userdata, "ggml") == 0); + assert(ggml_are_same_shape(dst, a)); + assert(ggml_are_same_shape(dst, b)); + assert(ggml_are_same_shape(dst, c)); + + atomic_fetch_add(&g_custom3_count, 1); + + const float * a_data = ggml_get_data_f32(a); + const float * b_data = ggml_get_data_f32(b); + const float * c_data = ggml_get_data_f32(c); + float * dst_data = ggml_get_data_f32(dst); + + // dont parallelize + assert(ith == 0); + + // number of elements + const int ne = (int)ggml_nelements(dst); + + // this assumes that the tensors are contiguous + assert(ggml_is_contiguous(dst)); + assert(ggml_is_contiguous(a)); + assert(ggml_is_contiguous(b)); + assert(ggml_is_contiguous(c)); + + for (int i = 0; i < ne; ++i) { + dst_data[i] = a_data[i] + b_data[i] + c_data[i]; + } +} + +int main(int argc, const char** argv) { + + float buf1_f32[1024]; + for (int i = 0; i < 1024; ++i) { + buf1_f32[i] = (float)(i + 1); + } + float buf2_f32[1024]; + for (int i = 0; i < 1024; ++i) { + buf2_f32[i] = (float)(i + 1) * 2; + } + float buf3_f32[1024]; + for (int i = 0; i < 1024; ++i) { + buf3_f32[i] = (float)(i + 1) * 3; + } + 
+ // map_custom1 + // 2 tasks, no userdata, parallelized by elements + { + struct ggml_context * ctx = make_ctx(); + struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2); + memcpy(t->data, buf1_f32, ggml_nbytes(t)); + + struct ggml_tensor * m1 = ggml_map_custom1(ctx, t, custom1, 2, NULL); + + struct ggml_cgraph graph = ggml_build_forward(m1); + + ggml_graph_compute_with_ctx(ctx, &graph, 4); + + const float * output = ggml_get_data_f32(m1); + + for (int i = 0; i < ggml_nelements(m1); ++i) { + assert(output[i] == buf1_f32[i] * 2); + } + assert(g_custom1_count == 2); + + ggml_free(ctx); + } + + // map_custom2 + // max tasks (4), userdata, parallelized by rows + { + struct ggml_context * ctx = make_ctx(); + struct ggml_tensor * t1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2); + memcpy(t1->data, buf1_f32, ggml_nbytes(t1)); + struct ggml_tensor * t2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2); + memcpy(t2->data, buf2_f32, ggml_nbytes(t2)); + + struct ggml_tensor * m2 = ggml_map_custom2(ctx, t1, t2, custom2, GGML_N_TASKS_MAX, g_userdata); + + struct ggml_cgraph graph = ggml_build_forward(m2); + + ggml_graph_compute_with_ctx(ctx, &graph, 4); + + const float * output = ggml_get_data_f32(m2); + + for (int i = 0; i < ggml_nelements(m2); ++i) { + assert(output[i] == buf1_f32[i] + buf2_f32[i]); + } + + assert(g_custom2_count == 4); + + ggml_free(ctx); + } + + // map_custom3 + // 1 task, userdata, not parallelized + { + struct ggml_context * ctx = make_ctx(); + struct ggml_tensor * t1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2); + memcpy(t1->data, buf1_f32, ggml_nbytes(t1)); + struct ggml_tensor * t2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2); + memcpy(t2->data, buf2_f32, ggml_nbytes(t2)); + struct ggml_tensor * t3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2); + memcpy(t3->data, buf3_f32, ggml_nbytes(t3)); + + struct ggml_tensor * m3 = ggml_map_custom3(ctx, t1, t2, t3, custom3, 1, g_userdata); + + struct ggml_cgraph graph = ggml_build_forward(m3); + + ggml_graph_compute_with_ctx(ctx, &graph, 4); + + const float * output = ggml_get_data_f32(m3); + + for (int i = 0; i < ggml_nelements(m3); ++i) { + assert(output[i] == buf1_f32[i] + buf2_f32[i] + buf3_f32[i]); + } + + assert(g_custom3_count == 1); + + ggml_free(ctx); + } + + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-grad0.cpp b/seamless_communication/ggml/tests/test-grad0.cpp new file mode 100644 index 0000000..468cde6 --- /dev/null +++ b/seamless_communication/ggml/tests/test-grad0.cpp @@ -0,0 +1,1545 @@ +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows +#include "ggml.h" + +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wdouble-promotion" +#endif + +#define MAX_NARGS 3 + +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define GGML_SILU_FP16 + +// +// logging +// + +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) 
printf(__VA_ARGS__) + +static float frand(void) { + return (float)rand()/(float)RAND_MAX; +} + +static int irand(int n) { + if (n == 0) return 0; + return rand()%n; +} + +static void get_random_dims(int64_t * dims, int ndims) { + dims[0] = dims[1] = dims[2] = dims[3] = 1; + + for (int i = 0; i < ndims; i++) { + dims[i] = 1 + irand(4); + } +} + +static struct ggml_tensor * get_random_tensor_f32( + struct ggml_context * ctx0, + int ndims, + int64_t ne[], + float fmin, + float fmax) { + struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + } + break; + default: + assert(false); + }; + + return result; +} + +static struct ggml_tensor * get_random_tensor_f16( + struct ggml_context * ctx0, + int ndims, + int64_t ne[], + float fmin, + float fmax) { + struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin); + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin); + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin); + } + } + } + } + break; + default: + assert(false); + }; + + return result; +} + +static struct ggml_tensor * get_random_tensor_i32( + struct ggml_context * ctx0, + int ndims, + int64_t ne[], + int32_t imin, + int32_t imax) { + struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i0] = irand(imax - imin) + imin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < 
ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin; + } + } + } + } + break; + default: + assert(false); + }; + + return result; +} + +static void print_elements(const char* label, const struct ggml_tensor * t) { + if (!t) { + printf("%s: %s = null\n", __func__, label); + return; + } + const int nelements = ggml_nelements(t); + printf("%s: %s = [", __func__, label); + for (int k = 0; k < nelements; ++k) { + if (k > 0) { printf(", "); } + printf("%.5f", ggml_get_f32_1d(t, k)); + } + printf("] shape: ["); + for (int k = 0; k < t->n_dims; ++k) { + if (k > 0) { printf(", "); } + printf("%d", (int)t->ne[k]); + } + printf("]\n"); + +} + +static bool check_gradient( + const char * op_name, + struct ggml_context * ctx0, + struct ggml_tensor * x[], + struct ggml_tensor * f, + int ndims, + int nargs, + float eps, + float max_error_abs, + float max_error_rel) { + + static int n_threads = -1; + if (n_threads < 0) { + n_threads = GGML_DEFAULT_N_THREADS; + + const char *env = getenv("GGML_N_THREADS"); + if (env) { + n_threads = atoi(env); + } + + printf("GGML_N_THREADS = %d\n", n_threads); + } + + struct ggml_cgraph gf = ggml_build_forward (f); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + ggml_graph_reset (&gf); + ggml_set_f32 (f->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); + // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot"); + + for (int i = 0; i < nargs; ++i) { + const int nelements = ggml_nelements(x[i]); + for (int k = 0; k < nelements; ++k) { + // compute gradient using finite differences + const float x0 = ggml_get_f32_1d(x[i], k); + const float xm = x0 - eps; + const float xp = x0 + eps; + ggml_set_f32_1d(x[i], k, xp); + + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + const double f0 = ggml_get_f32_1d(f, 0); + + ggml_set_f32_1d(x[i], k, xm); + + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + const double f1 = ggml_get_f32_1d(f, 0); + const double g0 = (f0 - f1)/(2.0*(double) eps); + + ggml_set_f32_1d(x[i], k, x0); + + // compute gradient using backward graph + ggml_graph_reset (&gf); + ggml_set_f32 (f->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + const double g1 = ggml_get_f32_1d(x[i]->grad, k); + + const double error_abs = fabs(g0 - g1); + const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0; + + if (error_abs > max_error_abs || error_rel > max_error_rel) { + printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n", + op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel); + //assert(false); + return false; + } + } + } + + return true; +} + +// TODO: clean-up this .. 
+static bool check_mat_mul( + const struct ggml_tensor * y, + const struct ggml_tensor * x0, + const struct ggml_tensor * x1) { + float * dst = (float *) y->data; + float * src0 = (float *) x0->data; + float * src1 = (float *) x1->data; + + const int nc = x0->ne[1]; + const int nr = x1->ne[1]; + const int nk = x0->ne[0]; + + GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk); + + GGML_PRINT_DEBUG("x0:\n"); + for (int j = 0; j < x0->ne[1]; ++j) { + for (int i = 0; i < x0->ne[0]; ++i) { + GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]); + } + GGML_PRINT_DEBUG("\n"); + } + GGML_PRINT_DEBUG("\n"); + + GGML_PRINT_DEBUG("x1:\n"); + for (int j = 0; j < x1->ne[1]; ++j) { + for (int i = 0; i < x1->ne[0]; ++i) { + GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]); + } + GGML_PRINT_DEBUG("\n"); + } + GGML_PRINT_DEBUG("\n"); + + GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]); + for (int j = 0; j < y->ne[1]; ++j) { + for (int i = 0; i < y->ne[0]; ++i) { + GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]); + } + GGML_PRINT_DEBUG("\n"); + } + + for (int i = 0; i < nr; ++i) { + for (int j = 0; j < nc; ++j) { + float sum = 0.0f; + + for (int k = 0; k < nk; ++k) { + sum += src0[j*nk + k]*src1[i*nk + k]; + } + + if (fabsf(dst[i*nc + j] - sum) > 1e-5f) { + fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum); + assert(false); + return false; + } + } + } + + return true; +} + +#define NUM_PERMUTATIONS (4*3*2*1) + +int main(int argc, const char ** argv) { + struct ggml_init_params params = { + /* .mem_size = */ 128*1024*1024, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + + int64_t ne[4]; + + int all_permutations[4 * NUM_PERMUTATIONS]; + { + int count = 0; + for (int ax0=0; ax0<4; ++ax0) { + for (int ax1=0; ax1<4; ++ax1) { + if (ax1 == ax0) continue; + for (int ax2=0; ax2<4; ++ax2) { + if (ax2 == ax0) continue; + if (ax2 == ax1) continue; + for (int ax3=0; ax3<4; ++ax3) { + if (ax3 == ax0) continue; + if (ax3 == ax1) continue; + if (ax3 == ax2) continue; + assert(count < NUM_PERMUTATIONS); + all_permutations[count*4+0] = ax0; + all_permutations[count*4+1] = ax1; + all_permutations[count*4+2] = ax2; + all_permutations[count*4+3] = ax3; + ++count; + } + } + } + } + } + + + // original loop: 1000 + int niter = 4; + const char *env = getenv("GGML_NLOOP"); + if (env != NULL) { + niter = atoi(env); + } + if (argc > 1) { + niter = atoi(argv[1]); + } + for (int iter = 0; iter < niter; ++iter) { + printf("test-grad0: iter:%d/%d\n", iter, niter); + struct ggml_context * ctx0 = ggml_init(params); + + get_random_dims(ne, 4); + + struct ggml_tensor * x[MAX_NARGS]; + + // add f32 + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1])); + + check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f); + } + } + + // add f16 + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1])); + + check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f); + } + } + + // sub + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = 
get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1])); + + check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // mul + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1])); + + check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // div + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1])); + + check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f); + } + } + + // sqr + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0])); + + check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // sqrt + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0])); + + check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f); + } + } + + // log + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0])); + + check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f); + } + } + + // sum + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, x[0]); + + check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + + // sum_rows + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0]))); + + check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); + } + } + + // mean, not yet fully implemented + if(0) + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0])); + + check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // argmax + if (0) + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, 
x[0])); + + check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // repeat + { + int64_t ne2[4]; + get_random_dims(ne2, 4); + + ne2[0] = ne[0] * ne2[0]; + ne2[1] = ne[1] * ne2[1]; + ne2[2] = 1; + ne2[3] = 1; + + const int nargs = 1; + for (int ndims = 1; ndims <= 2; ++ndims) { + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1])))); + + check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); + } + } + + // repeat back + { + int64_t ne2[4]; + get_random_dims(ne2, 4); + + ne2[0] = ne[0] * ne2[0]; + ne2[1] = ne[1] * ne2[1]; + ne2[2] = 1; + ne2[3] = 1; + + const int nargs = 1; + for (int ndims = 1; ndims <= 2; ++ndims) { + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0])))); + + check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); + } + } + + // abs (finite differences do not work) + //{ + // const int nargs = 1; + + // for (int ndims = 1; ndims <= 2; ++ndims) { + // for (int i = 0; i < nargs; ++i) { + // x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + // ggml_set_param(ctx0, x[i]); + // } + + // struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0])); + + // check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f); + // } + //} + + // sgn + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0])); + + check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // neg + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0])); + + check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // step + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0])); + + check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // tanh, not yet fully implemented + if(0) + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0])); + + check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // mul_mat + { + const int nargs = 2; + + for (int ndims = 2; ndims <= 2; ++ndims) { + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + { + int64_t ne2[4]; + get_random_dims(ne2, 4); + ne2[0] = ne[0]; + x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + } + + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + + struct 
ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]); + struct ggml_tensor * f = ggml_sum(ctx0, m); + + GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims); + + check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + check_mat_mul(m, x[1], x[0]); + } + } + + // elu, not yet fully implemented + if(0) + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0])); + + check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // relu + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0])); + + check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // gelu, not yet fully implemented + if(0) + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 4; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0])); + + check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); + } + } + + // silu + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0])); + +#ifdef GGML_SILU_FP16 + // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds. 
+ check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY); +#else + check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); +#endif + } + } + + // rms_norm + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f)); + + check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY); + } + } + + // scale + { + const int nargs = 2; + + int64_t ne2[4]; + ne2[0] = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], x[1])); + + check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // cpy f32 + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + // x[1] is overwritten by x[0], so the gradients don't propagate to x[1] + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1])); + + check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // cpy f16 + { + const int nargs = 2; + + for (int ndims = 1; ndims <= 2; ++ndims) { + for (int i = 0; i < nargs; ++i) { + x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[i]); + } + // x[1] is overwritten by x[0], so the gradients don't propagate to x[1] + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1])); + + check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY); + } + } + + // reshape (1d->nd) + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + int64_t ne2[4]; + ne2[0] = 1; + ne2[1] = 1; + ne2[2] = 1; + ne2[3] = 1; + for (int i = 0; i < ndims; ++i) { + ne2[0] *= ne[i]; + } + x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); + x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1])); + check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // reshape (nd->1d) + { + const int nargs = 1; + + for (int ndims = 1; ndims <= 2; ++ndims) { + int64_t ne2[4]; + ne2[0] = 1; + ne2[1] = 1; + ne2[2] = 1; + ne2[3] = 1; + for (int i = 0; i < ndims; ++i) { + ne2[0] *= ne[i]; + } + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1])); + check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 1d + { + int64_t ne2[4] = { 1, 1, 1, 1 }; + + const int nargs = 2; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 1); + while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 1); + } + + x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1])); + 
const int offset = irand(max_offset) * ggml_element_size(x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 2d + { + int64_t ne2[4] = { 1, 1, 1, 1 }; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 2; + for (int ndims = 2; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 2); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 2); + } + + x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + const int offset = offsets[0] + offsets[1]; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 3d + { + int64_t ne2[4] = { 1, 1, 1, 1 }; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 2; + for (int ndims = 3; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 3); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 3); + } + + x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + offsets[2] = irand(max_offsets[2]) * x[0]->nb[2]; + const int offset = offsets[0] + offsets[1] + offsets[2]; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // acc 4d + { + int64_t ne2[4] = { 1, 1, 1, 1 }; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 2; + for (int ndims = 4; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 4); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 4); + } + + x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]); + max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + offsets[2] = irand(max_offsets[2]) * x[0]->nb[2]; + offsets[3] = irand(max_offsets[3]) * x[0]->nb[3]; + const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3]; + + struct ggml_tensor * 
f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); + + check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // set_1d + { + int64_t ne2[4]; + + const int nargs = 2; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 1); + while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 1); + } + + x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1])); + const int offset = irand(max_offset) * ggml_element_size(x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset)); + + check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // set_2d + { + int64_t ne2[4]; + int64_t max_offsets[4] = { 0, 0, 0, 0 }; + int64_t offsets[4] = { 0, 0, 0, 0 }; + + const int nargs = 1; + for (int ndims = 2; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + get_random_dims(ne2, 2); + while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) { + get_random_dims(ne2, 2); + } + + x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f); + ggml_set_param(ctx0, x[1]); + + max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); + max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); + offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; + offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; + const int offset = offsets[0] + offsets[1]; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset)); + + check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // view_1d + { + const int nargs = 1; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + + ggml_set_param(ctx0, x[0]); + + const int k0 = irand(ggml_nelements(x[0])); + const int k1 = irand(ggml_nelements(x[0])); + const int i0 = MIN(k0, k1); + const int i1 = MAX(k0, k1); + + const int offset = i0 * sizeof(float); + const int nelem = i1 - i0; + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset)); + + check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // view_2d + { + int64_t ne2[4]; + int64_t nb2[4]; + + const int nargs = 1; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + + get_random_dims(ne2, 2); + while (ne2[0]*ne2[1] > ggml_nelements(x[0])) { + get_random_dims(ne2, 2); + } + const int count = ne2[0]*ne2[1]; + + nb2[0] = sizeof(float); + nb2[1] = nb2[0]*ne2[0]; + + ggml_set_param(ctx0, x[0]); + + const int max_offset = ggml_nelements(x[0]) - count; + const int offset = irand(max_offset+1) * sizeof(float); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset)); + + check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // view_3d + { + int64_t ne2[4] = {1,1,1,1}; + int64_t nb2[4] = {0,0,0,0}; + + const int nargs = 1; + for (int ndims = 1; ndims <= 4; ++ndims) { + + x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + + get_random_dims(ne2, 3); + while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) { + get_random_dims(ne2, 3); + } + const int count = 
ne2[0]*ne2[1]*ne2[2]; + + nb2[0] = sizeof(float); + nb2[1] = nb2[0]*ne2[0]; + nb2[2] = nb2[1]*ne2[1]; + + ggml_set_param(ctx0, x[0]); + + const int max_offset = ggml_nelements(x[0]) - count; + const int offset = irand(max_offset+1) * sizeof(float); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset)); + + check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } + } + + // permute + { + int64_t ne2[4]; + + const int nargs = 1; + for (int ndims = 1; ndims <= 4; ++ndims) + { + // ggml_permute will set axes of dimensions below n_dims to 1. + // to make ggml_permute work correctly on all axes, + // the input tensor needs maximal n_dim of 4. + for (int i=0; i finite differences should not work + // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0) + struct ggml_tensor * f = ggml_sum(ctx0, + ggml_log(ctx0, + ggml_add1(ctx0, + ggml_scale(ctx0, + ggml_soft_max(ctx0, x[0]), + ggml_new_f32(ctx0, 1.0f - eps)), + ggml_new_f32(ctx0, eps)))); + + check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY); + } + } + + // cross_entropy_loss + { + const int nargs = 1; + + int64_t ne2[4]; + get_random_dims(ne2, 4); + + for (int ndims = 1; ndims <= 4; ++ndims) { + x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f); + x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f); + // the second argument to cross_entropy_loss must sum up to 1 for each row + int nr = ggml_nrows(x[1]); + int nc = ggml_nelements(x[1]) / nr; + for (int ir = 0; ir < nr; ++ir) { + float sum = 0; + for (int ic = 0; ic < nc; ++ic) { + sum += ((float *) x[1]->data)[ic + ir*nc]; + } + for (int ic = 0; ic < nc; ++ic) { + ((float *) x[1]->data)[ic + ir*nc] /= sum; + } + } + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]); + + check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY); + } + } + + // rope f32 + { + const int nargs = 1; + + int64_t ne2[4]; + get_random_dims(ne2, 4); + ne2[0] += ne2[0] % 2; + int n_rot = ne2[0]; + + for (int ndims = 3; ndims <= 4; ++ndims) { + for (int mode = 0; mode < 4; ++mode) { + for (int n_past = 1; n_past < ne2[2]; ++n_past) { + x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + + ggml_set_param(ctx0, x[0]); + + const bool skip_past = (mode & 1); + if (skip_past) { + // we have no past, so this would have to work on uninitialized memory. + // we only test the gradients here; + // skip_past should have no influence on gradient computation. + // so when other modes work, we assume that this does as well. + continue; + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0)); + + GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); + check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY); + } + } + } + } + + // rope f16 + { + const int nargs = 1; + + int64_t ne2[4]; + get_random_dims(ne2, 4); + ne2[0] += ne2[0] % 2; + int n_rot = ne2[0]; + + for (int ndims = 3; ndims <= 4; ++ndims) { + for (int mode = 0; mode < 4; ++mode) { + for (int n_past = 1; n_past < ne2[2]; ++n_past) { + x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f); + + ggml_set_param(ctx0, x[0]); + + const bool skip_past = (mode & 1); + if (skip_past) { + // we have no past, so this would have to work on uninitialized memory. + // we only test the gradients here; + // skip_past should have no influence on gradient computation. 
+ // so when other modes work, we assume that this does as well. + continue; + } + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0)); + + GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); + check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY); + } + } + } + } + + // flash_attn f32 + { + const int nargs = 3; + + int64_t ne2[4]; + + get_random_dims(ne2, 4); + int64_t D = ne2[0]; + int64_t N = ne2[1]; + int64_t M = ne2[2] + N; + int64_t B = ne2[3]; + + for (int masked = 0; masked <= 1; ++masked) { + for (int ndims = 2; ndims <= 4; ++ndims) { + int64_t neq[4] = { D, N, B, ne[3] }; + int64_t nek[4] = { D, M, B, ne[3] }; + int64_t nev[4] = { M, D, B, ne[3] }; + if (ndims == 2) { + neq[2] = 1; neq[3] = 1; + nek[2] = 1; nek[3] = 1; + nev[2] = 1; nev[3] = 1; + } else if (ndims == 3) { + neq[3] = 1; + nek[3] = 1; + nev[3] = 1; + } + x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f); + x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f); + x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f); + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + ggml_set_param(ctx0, x[2]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); + + check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); + } + } + } + + // flash_attn f16, not yet fully implemented + if(0) + { + const int nargs = 3; + + int64_t ne2[4]; + + get_random_dims(ne2, 4); + int64_t D = ne2[0]; + int64_t N = ne2[1]; + int64_t M = ne2[2] + N; + int64_t B = ne2[3]; + + for (int masked = 0; masked <= 1; ++masked) { + for (int ndims = 2; ndims <= 4; ++ndims) { + int64_t neq[4] = { D, N, B, ne[3] }; + int64_t nek[4] = { D, M, B, ne[3] }; + int64_t nev[4] = { M, D, B, ne[3] }; + if (ndims == 2) { + neq[2] = 1; neq[3] = 1; + nek[2] = 1; nek[3] = 1; + nev[2] = 1; nev[3] = 1; + } else if (ndims == 3) { + neq[3] = 1; + nek[3] = 1; + nev[3] = 1; + } + x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f); + x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f); + x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f); + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + ggml_set_param(ctx0, x[2]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); + + check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); + } + } + } + ggml_free(ctx0); + } + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-mul-mat0.c b/seamless_communication/ggml/tests/test-mul-mat0.c new file mode 100644 index 0000000..6212da4 --- /dev/null +++ b/seamless_communication/ggml/tests/test-mul-mat0.c @@ -0,0 +1,332 @@ +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows +#include "ggml/ggml.h" + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#define MAX_NARGS 2 + +float frand(void) { + return (float)rand()/(float)RAND_MAX; +} + +int irand(int n) { + return rand()%n; +} + +void get_random_dims(int64_t * dims, int ndims) { + dims[0] = dims[1] = dims[2] = dims[3] = 1; + + for (int i = 0; i < ndims; i++) { + dims[i] = 1 + irand(4); + } +} + +struct ggml_tensor * get_random_tensor( + struct ggml_context * ctx0, + int ndims, + int64_t ne[], + float fmin, + float fmax) { + struct ggml_tensor * result = 
ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + } + break; + default: + assert(false); + }; + + return result; +} + +float get_element(const struct ggml_tensor * t, int idx) { + return ((float *)t->data)[idx]; +} + +void set_element(struct ggml_tensor * t, int idx, float value) { + ((float *)t->data)[idx] = value; +} + +bool check_gradient( + const char * op_name, + struct ggml_context * ctx0, + struct ggml_tensor * x[], + struct ggml_tensor * f, + int ndims, + int nargs, + float eps, + float max_error_abs, + float max_error_rel) { + const int n_threads = 1; + + struct ggml_cgraph gf = ggml_build_forward (f); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_graph_reset (&gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot"); + + for (int i = 0; i < nargs; ++i) { + const int64_t nelements = ggml_nelements(x[i]); + for (int64_t k = 0; k < nelements; ++k) { + // compute gradient using finite differences + const float x0 = get_element(x[i], k); + + set_element(x[i], k, x0 + eps); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + const float f0 = ggml_get_f32_1d(f, 0); + + set_element(x[i], k, x0 - eps); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + const float f1 = ggml_get_f32_1d(f, 0); + + const float g0 = (f0 - f1)/(2.0f*eps); + + set_element(x[i], k, x0); + + // compute gradient using backward graph + ggml_graph_reset (&gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + const float g1 = get_element(x[i]->grad, k); + + const float error_abs = fabsf(g0 - g1); + const float error_rel = g0 != 0 ? 
fabsf(g0 - g1)/fabs(g0) : 0; + + if (error_abs > max_error_abs || error_rel > max_error_rel) { + printf("%s: ndims=%d, i=%d, k=%" PRId64 ", g0=%f, g1=%f, error_abs=%f, error_rel=%f\n", op_name, ndims, i, k, g0, g1, error_abs, error_rel); + assert(false); + } + } + } + + return true; +} + + +float mat_get(const struct ggml_tensor * t, int i0, int i1, int i2, int i3) { + const size_t nb0 = t->nb[0]; + const size_t nb1 = t->nb[1]; + const size_t nb2 = t->nb[2]; + const size_t nb3 = t->nb[3]; + + return + *((float*) ((char*)t->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)); +} + +bool check_mat_mul( + const struct ggml_tensor * y, + const struct ggml_tensor * x0, + const struct ggml_tensor * x1) { + const int64_t n00 = x0->ne[0]; + const int64_t n10 = x0->ne[1]; + const int64_t n20 = x0->ne[2]; + const int64_t n30 = x0->ne[3]; + + const int64_t n01 = x1->ne[0]; + const int64_t n11 = x1->ne[1]; + const int64_t n21 = x1->ne[2]; + const int64_t n31 = x1->ne[3]; + + const int64_t n02 = y->ne[0]; + const int64_t n12 = y->ne[1]; + const int64_t n22 = y->ne[2]; + const int64_t n32 = y->ne[3]; + + printf("x0: [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n", n00, n10, n20, n30); + for (int j = 0; j < n10; ++j) { + for (int i = 0; i < n00; ++i) { + printf("%6.3f ", mat_get(x0, i, j, 0, 0)); + } + printf("\n"); + } + printf("\n"); + + printf("x1: [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n", n01, n11, n21, n31); + for (int j = 0; j < n11; ++j) { + for (int i = 0; i < n01; ++i) { + printf("%6.3f ", mat_get(x1, i, j, 0, 0)); + } + printf("\n"); + } + printf("\n"); + + printf("y: [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n", n02, n12, n22, n32); + for (int j = 0; j < n12; ++j) { + for (int i = 0; i < n02; ++i) { + printf("%6.3f ", mat_get(y, i, j, 0, 0)); + } + printf("\n"); + } + + for (int i3 = 0; i3 < n32; ++i3) { + for (int i2 = 0; i2 < n22; ++i2) { + for (int i1 = 0; i1 < n12; ++i1) { + for (int i0 = 0; i0 < n02; ++i0) { + float sum = 0.0f; + for (int k = 0; k < n00; ++k) { + sum += mat_get(x0, k, i0, i2, i3) * mat_get(x1, k, i1, i2, i3); + } + if (fabsf(sum - mat_get(y, i0, i1, i2, i3)) > 1e-5) { + printf("error: i0=%d, i1=%d, i2=%d, i3=%d, sum=%f, y=%f\n", + i0, i1, i2, i3, sum, mat_get(y, i0, i1, i2, i3)); + assert(false); + return false; + } + } + } + } + } + + return true; +} + +int main(int argc, const char ** argv) { + struct ggml_init_params params = { + .mem_size = 128*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + int64_t ne[4]; + + // original loop: 500 + int niter = 500; + const char *env = getenv("GGML_NLOOP"); + if (env != NULL) { + niter = atoi(env); + } + if (argc > 1) { + niter = atoi(argv[1]); + } + + int n_threads = 1; + + for (int iter = 0; iter < niter; ++iter) { + printf("test-mul-mat0: iter:%d/%d\n", iter, niter); + struct ggml_context * ctx0 = ggml_init(params); + + get_random_dims(ne, 4); + + struct ggml_tensor * x[MAX_NARGS]; + + // mul_mat + { + const int nargs = 1; + + for (int ndims = 2; ndims <= 4; ++ndims) { + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ne[1] = rand()%4 + 1; + x[1] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]); + struct ggml_tensor * f = ggml_sum(ctx0, m); + + printf("testing: mul_mat, [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] = [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] * [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n", + m->ne[0], m->ne[1], m->ne[2], m->ne[3], + x[1]->ne[0], 
x[1]->ne[1], x[1]->ne[2], x[1]->ne[3], + x[0]->ne[0], x[0]->ne[1], x[0]->ne[2], x[0]->ne[3]); + + assert(m->ne[0] == x[1]->ne[1]); + assert(m->ne[1] == x[0]->ne[1]); + assert(m->ne[2] == x[0]->ne[2]); + assert(m->ne[3] == x[0]->ne[3]); + + if (ndims <= 2) { + check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } else { + struct ggml_cgraph gf = ggml_build_forward(m); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + } + + check_mat_mul(m, x[1], x[0]); + } + } + + // mul_mat (transposed) + { + const int nargs = 1; + + for (int ndims = 2; ndims <= 4; ++ndims) { + x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); + ne[1] = ne[0]; + ne[0] = rand()%4 + 1; + x[1] = ggml_cont(ctx0, ggml_transpose(ctx0, get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f))); + + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]); + struct ggml_tensor * f = ggml_sum(ctx0, m); + + printf("testing: mul_mat, [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] = [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] * [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n", + m->ne[0], m->ne[1], m->ne[2], m->ne[3], + x[1]->ne[0], x[1]->ne[1], x[1]->ne[2], x[1]->ne[3], + x[0]->ne[0], x[0]->ne[1], x[0]->ne[2], x[0]->ne[3]); + + assert(m->ne[0] == x[1]->ne[1]); + assert(m->ne[1] == x[0]->ne[1]); + assert(m->ne[2] == x[0]->ne[2]); + assert(m->ne[3] == x[0]->ne[3]); + + if (ndims <= 2) { + check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + } else { + struct ggml_cgraph gf = ggml_build_forward(m); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + } + + check_mat_mul(m, x[1], x[0]); + } + } + ggml_free(ctx0); + } + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-mul-mat1.c b/seamless_communication/ggml/tests/test-mul-mat1.c new file mode 100644 index 0000000..fc24077 --- /dev/null +++ b/seamless_communication/ggml/tests/test-mul-mat1.c @@ -0,0 +1,312 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +const int M = 1280; +const int N = 1536; +const int K = 1280; + +uint64_t get_time_us() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000000 + tv.tv_usec; +} + +// +// naive implementation +// + +void mul_mat_f32_0( + const float * restrict src0, // M x K + const float * restrict src1, // N x K (transposed) + float * dst, + int m, int n, int k) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float sum = 0; + for (int l = 0; l < k; l++) { + sum += src0[i*k + l] * src1[j*k + l]; + } + dst[i*n + j] = sum; + } + } +} + +void mul_mat_f16_0( + const __fp16 * src0, + const __fp16 * src1, + float * dst, + int m, int n, int k) { + const int k32 = k & ~31; + + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float sumf = 0.0; + + float16x8_t sum0 = vdupq_n_f16(0.0f); + float16x8_t sum1 = vdupq_n_f16(0.0f); + float16x8_t sum2 = vdupq_n_f16(0.0f); + float16x8_t sum3 = vdupq_n_f16(0.0f); + + float16x8_t x0, x1, x2, x3; + float16x8_t y0, y1, y2, y3; + + const __fp16 * restrict p0 = src0 + i*k; + const __fp16 * restrict p1 = src1 + j*k; + + for (int l = 0; l < k32; l += 32) { + x0 = vld1q_f16(p0 + l + 0 ); + x1 = vld1q_f16(p0 + l + 8 ); + x2 = vld1q_f16(p0 + l + 16); + x3 = vld1q_f16(p0 + l + 24); + + y0 = vld1q_f16(p1 + l + 0 ); + y1 = vld1q_f16(p1 + l + 8 ); + y2 = vld1q_f16(p1 + l + 16); + y3 = vld1q_f16(p1 + l + 24); + + sum0 = vfmaq_f16(sum0, x0, y0); + sum1 = vfmaq_f16(sum1, x1, y1); + sum2 = 
vfmaq_f16(sum2, x2, y2); + sum3 = vfmaq_f16(sum3, x3, y3); + } + + // reduce sum0..sum3 to sum0 + sum0 = vaddq_f16(sum0, sum1); + sum2 = vaddq_f16(sum2, sum3); + sum0 = vaddq_f16(sum0, sum2); + + // load sum0 into 2 float32x4_t + float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0)); + float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0)); + + // reduce sum0f32 and sum1f32 to sumf + sum0f32 = vaddq_f32(sum0f32, sum1f32); + + float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32)); + sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1); + + //sumf = sum0[0] + sum0[1] + sum0[2] + sum0[3] + sum0[4] + sum0[5] + sum0[6] + sum0[7]; + + for (int l = k32; l < k32; l++) { + sumf += p0[l]*p1[l]; + } + + dst[i*n + j] = sumf; + } + } +} + +// blocking with block size 32 +void mul_mat_f16_1( + const __fp16 * src0, + const __fp16 * src1, + float * dst, + int m, int n, int k) { + + const int k32 = k & ~31; + const int bs = 32; + + memset(dst, 0, m*n*sizeof(float)); + + for (int i = 0; i < m; i += bs) { + for (int j = 0; j < n; j += bs) { + for (int l = 0; l < k; l += bs) { + for (int ii = i; ii < i + bs; ii++) { + const __fp16 * restrict p0 = src0 + ii*k; + + float16x8_t x0, x1, x2, x3; + + x0 = vld1q_f16(p0 + l + 0 ); + x1 = vld1q_f16(p0 + l + 8 ); + x2 = vld1q_f16(p0 + l + 16); + x3 = vld1q_f16(p0 + l + 24); + + for (int jj = j; jj < j + bs; jj++) { + float sumf = 0.0; + + float16x8_t sum0 = vdupq_n_f16(0.0f); + float16x8_t sum1 = vdupq_n_f16(0.0f); + float16x8_t sum2 = vdupq_n_f16(0.0f); + float16x8_t sum3 = vdupq_n_f16(0.0f); + + float16x8_t y0, y1, y2, y3; + + const __fp16 * restrict p1 = src1 + jj*k; + + y0 = vld1q_f16(p1 + l + 0 ); + y1 = vld1q_f16(p1 + l + 8 ); + y2 = vld1q_f16(p1 + l + 16); + y3 = vld1q_f16(p1 + l + 24); + + sum0 = vfmaq_f16(sum0, x0, y0); + sum1 = vfmaq_f16(sum1, x1, y1); + sum2 = vfmaq_f16(sum2, x2, y2); + sum3 = vfmaq_f16(sum3, x3, y3); + + // reduce sum0..sum3 to sum0 + sum0 = vaddq_f16(sum0, sum1); + sum2 = vaddq_f16(sum2, sum3); + sum0 = vaddq_f16(sum0, sum2); + + // load sum0 into 2 float32x4_t + float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0)); + float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0)); + + // reduce sum0f32 and sum1f32 to sumf + sum0f32 = vaddq_f32(sum0f32, sum1f32); + + float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32)); + sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1); + + //sumf = sum0[0] + sum0[1] + sum0[2] + sum0[3] + sum0[4] + sum0[5] + sum0[6] + sum0[7]; + + dst[ii*n + jj] += sumf; + } + } + } + } + } + +} + +void mul_mat_f8_0( + const uint8_t * src0, + const uint8_t * src1, + float * dst, + int m, int n, int k) { + const int k32 = k & ~31; + + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float sumf = 0.0; + + const uint8_t * restrict p0 = src0 + i*k; + const uint8_t * restrict p1 = src1 + j*k; + + for (int l = 0; l < k32; l += 32) { + uint8x16_t x0 = vld1q_u8(p0 + l + 0 ); + uint8x16_t x1 = vld1q_u8(p0 + l + 16); + + uint8x16_t y0 = vld1q_u8(p1 + l + 0 ); + uint8x16_t y1 = vld1q_u8(p1 + l + 16); + + x0 = vmulq_u8(x0, y0); + x1 = vmulq_u8(x1, y1); + + sumf += vaddvq_u8(x0) + vaddvq_u8(x1); + } + + dst[i*n + j] = sumf; + } + } +} + +int main(int argc, const char ** argv) { + float * src0 = malloc(sizeof(float)*M*K); + float * src1 = malloc(sizeof(float)*N*K); + float * dst = malloc(sizeof(float)*M*N); + + for (int i = 0; i < M*K; i++) { + src0[i] = rand() / (float)RAND_MAX; + } + + for (int i = 0; i < N*K; i++) { + src1[i] = rand() / (float)RAND_MAX; + } + 
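+    // Benchmark flow: src0/src1 are filled with random values in [0, 1),
+    // converted to __fp16 below, and then multiplied with one of the
+    // implementations selected by argv[1]:
+    //   0 = mul_mat_f32_0 (naive fp32 reference)
+    //   1 = mul_mat_f16_0 (NEON fp16 dot product)
+    //   2 = mul_mat_f16_1 (NEON fp16, block size 32)
+    //   3 = mul_mat_f8_0  (uint8 dot product)
+    //   4 = cblas_sgemm   (BLAS from the Accelerate framework)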
+ // convert src0 and src1 to __fp16 + __fp16 * src0_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*M*K)); + __fp16 * src1_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*N*K)); + + uint8_t * src0_fp8 = (uint8_t *)(malloc(sizeof(__fp16)*M*K)); + uint8_t * src1_fp8 = (uint8_t *)(malloc(sizeof(__fp16)*N*K)); + + { + const uint64_t t_start = get_time_us(); + + for (int i = 0; i < M*K; i++) { + src0_fp16[i] = src0[i]; + //printf("%f %f\n", src0[i], src0_fp16[i]); + //assert(!isnan(src0_fp16[i])); + } + + for (int i = 0; i < N*K; i++) { + src1_fp16[i] = src1[i]; + } + + const uint64_t t_end = get_time_us(); + printf("convert time: %f ms\n", (t_end - t_start) / 1000.0); + } + + for (int i = 0; i < 16; ++i) { + printf("%f %f\n", src0[i], src0_fp16[i]); + } + + int method = 0; + if (argc > 1) { + method = atoi(argv[1]); + } + + const int nIter = 1; + + const clock_t start = clock(); + const uint64_t start_us = get_time_us(); + + double iM = 1.0/M; + double sum = 0.0f; + for (int i = 0; i < nIter; i++) { + if (method == 0) { + mul_mat_f32_0(src0, src1, dst, M, N, K); + } + + if (method == 1) { + mul_mat_f16_0(src0_fp16, src1_fp16, dst, M, N, K); + } + + if (method == 2) { + mul_mat_f16_1(src0_fp16, src1_fp16, dst, M, N, K); + } + + if (method == 3) { + mul_mat_f8_0(src0_fp8, src1_fp8, dst, M, N, K); + } + + if (method == 4) { + // Use BLAS sgemm from Accelerate framework + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0f, src0, K, src1, K, 0.0f, dst, N); + } + } + + for (int i = 0; i < N; i++) { + sum += dst[i]*iM; + } + + { + const clock_t end = clock(); + const uint64_t end_us = get_time_us(); + printf("%s: elapsed ticks: %ld\n", __func__, end - start); + printf("%s: elapsed us: %llu / %f ms\n", __func__, end_us - start_us, (end_us - start_us) / 1000.0 / nIter); + } + + printf("%f\n", sum); + + free(src0); + free(src1); + free(dst); + + free(src0_fp16); + free(src1_fp16); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-mul-mat2.c b/seamless_communication/ggml/tests/test-mul-mat2.c new file mode 100644 index 0000000..89af286 --- /dev/null +++ b/seamless_communication/ggml/tests/test-mul-mat2.c @@ -0,0 +1,2585 @@ +// quantized matrix multiplication + +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__ARM_NEON) +#include "arm_neon.h" +#elif defined(__AVX__) || defined(__AVX2__) +#include "immintrin.h" +#endif + +#ifndef MIN +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#endif + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#include +#define __builtin_popcountll __popcnt64 +#endif + +const int M = 1280; +const int N = 1536; +const int K = 1280; + +//const int M = 64; +//const int N = 64; +//const int K = 64; + +#define QK 64 +#define QB 4 + +//#define GGML_GQ_USE_FP16_SCALE + +#if defined(GGML_GQ_USE_FP16_SCALE) +#define gq_scale_t ggml_fp16_t +#define GGML_FP32_TO_GQ(x) ggml_fp32_to_fp16(x) +#define GGML_GQ_TO_FP32(x) ggml_fp16_to_fp32(x) +#else +#define gq_scale_t float +#define GGML_FP32_TO_GQ(x) (x) +#define GGML_GQ_TO_FP32(x) (x) +#endif + +#define gq_t_bits 64 +#define gq_quant_t uint64_t + +float frand(void) { + return (float) rand() / (float) RAND_MAX; +} + +#if defined(__AVX2__) +// horizontally reduce 8 32-bit integers +static inline uint32_t _mm256_hadd_epi32_gg(__m256i v) { + __m128i v0 = _mm256_extractf128_si256(v, 0); + __m128i v1 = _mm256_extractf128_si256(v, 1); + + v0 = _mm_add_epi32(v0, v1); + + v1 = _mm_shuffle_epi32(v0, 0x0e); + v0 = _mm_add_epi32(v0, v1); + + v1 = _mm_shuffle_epi32(v0, 0x01); + v0 = _mm_add_epi32(v0, v1); + + return _mm_cvtsi128_si32(v0); +} + +//static inline float _mm256_hadd_epi32_gg(__m256i v) { +// const __m256 v0 = _mm256_cvtepi32_ps(v); +// const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 1)); +// const __m128 t1 = _mm_hadd_ps(t0, t0); +// +// return _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); +//} + +// horizontally reduce 32 8-bit integers +static inline int32_t _mm256_hadd_epi8_gg(__m256i v0) { + __m256i v1 = _mm256_maddubs_epi16(v0, _mm256_set1_epi8(1)); + __m256i v2 = _mm256_madd_epi16 (v1, _mm256_set1_epi16(1)); + + return _mm256_hadd_epi32_gg(v2); +} + +static inline float _mm256_hadd_ps_gg(__m256 v) { + const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); + const __m128 t1 = _mm_hadd_ps(t0, t0); + + return _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); +} +#endif + +// +// naive implementation +// + +void mul_mat_f32_naive( + const float * restrict src0, // M x K + const float * restrict src1, // N x K (transposed) + float * dst, + int m, int n, int k) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float sum = 0; + for (int l = 0; l < k; l++) { + sum += src0[i*k + l] * src1[j*k + l]; + } + dst[i*n + j] = sum; + } + } +} + +// +// method 1 +// + +static inline int quantize_1_blocks_per_row(int k) { + return k/QK; +} + +static inline int quantize_1_quants_per_block(void) { + return QK/gq_t_bits; +} + +static inline int quantize_1_row_size(int k) { + const int nb = quantize_1_blocks_per_row(k); + const int nq = quantize_1_quants_per_block(); + + return nb*(2*sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t)); +} + +void quantize_1(const float * src, void * dst, int n, int k) { + char * p0 = dst; + + gq_quant_t pp[QB]; + + for (int j = 0; j < n; j++) { + for (int i = 0; i < k/QK; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + // find min/max +#ifdef __ARM_NEON + { + float32x4_t minv = vdupq_n_f32(FLT_MAX); + float32x4_t maxv = vdupq_n_f32(-FLT_MAX); + + for (int l = 0; l < QK; l += 4) { + float32x4_t v = vld1q_f32(src + j*k + i*QK + l); + minv = vminq_f32(minv, v); + maxv = vmaxq_f32(maxv, v); + } + + float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv)); + float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv)); + + min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1)); + max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1)); + + 
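+                // scalar equivalent of the NEON reduction above (see the #else branch):
+                //     for (int l = 0; l < QK; l++) { min = MIN(min, v); max = MAX(max, v); }
+                // vpmin_f32/vpmax_f32 fold the 4 lanes of minv/maxv down to 2,
+                // and the final MIN/MAX picks the winning lane.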
//printf("SIMD min/max: %f %f\n", min, max); + } +#else + { + for (int l = 0; l < QK; l++) { + const float v = src[j*k + i*QK + l]; + if (v < min) min = v; + if (v > max) max = v; + } + + //printf("NORM min/max: %f %f\n", min, max); + } +#endif + + const float d = (max - min) / ((1 << QB) - 1); + const float id = d ? 1.0/d : 0.0; + + memcpy(p0, &min, sizeof(float)); p0 += sizeof(float); + memcpy(p0, &d, sizeof(float)); p0 += sizeof(float); + + //printf("min/max/d/id: %f %f %f %f\n", min, max, d, id); + + for (int s = 0; s < QK/gq_t_bits; ++s) { + memset(pp, 0, sizeof(pp)); + + for (int l = 0; l < gq_t_bits; l++) { + const float v = src[j*k + i*QK + s*gq_t_bits + l]; + const uint8_t q = (v - min)*id; + + for (int b = 0; b < QB; b++) { + pp[b] |= q & (1 << b) ? (1ULL << l) : 0; + } + } + + for (int b = 0; b < QB; b++) { + memcpy(p0, &pp[b], sizeof(gq_quant_t)); p0 += sizeof(gq_quant_t); + } + } + } + } +} + +void mul_mat_gq_1( + const void * src0, + const void * src1, + float * dst, + int m, int n, int k) { + const int kp = k & ~(gq_t_bits - 1); + + const char * restrict p0 = src0; + const char * restrict p1 = src1; + + float s0[QB + 1]; + float s1[QB + 1]; + + gq_quant_t m0[QB + 1]; + gq_quant_t m1[QB + 1]; + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + float sumf = 0.0; + + const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK)); + const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK)); + + for (int i = 0; i < kp/QK; i++) { + float min0, d0; + memcpy(&min0, pp0, sizeof(float)); pp0 += sizeof(float); + memcpy(&d0, pp0, sizeof(float)); pp0 += sizeof(float); + + float min1, d1; + memcpy(&min1, pp1, sizeof(float)); pp1 += sizeof(float); + memcpy(&d1, pp1, sizeof(float)); pp1 += sizeof(float); + + //printf("min0/d0 = %f %f | min1/d1 = %f %f\n", min0, d0, min1, d1); + +#if 1 + // >>> General case for any QB + + s0[0] = min0; + s1[0] = min1; + + for (int b = 0; b < QB; b++) { + s0[b + 1] = d0*(1 << b); + s1[b + 1] = d1*(1 << b); + } + + m0[0] = 0-1ULL; + m1[0] = 0-1ULL; + + for (int s = 0; s < QK/gq_t_bits; ++s) { + for (int b = 0; b < QB; b++) { + memcpy(&m0[b + 1], pp0, sizeof(gq_quant_t)); pp0 += sizeof(gq_quant_t); + memcpy(&m1[b + 1], pp1, sizeof(gq_quant_t)); pp1 += sizeof(gq_quant_t); + } + + for (int q0 = 0; q0 < QB + 1; q0++) { + for (int q1 = 0; q1 < QB + 1; q1++) { + sumf += s0[q0]*s1[q1]*__builtin_popcountll(m0[q0] & m1[q1]); + } + } + } +#else +#endif + } + + dst[ir0*n + ir1] = sumf; + } + } +} + +// +// method 2 +// n-bit quantization (2nd attempt) +// + +static inline int quantize_2_blocks_per_row(int k) { + return k/QK; +} + +static inline int quantize_2_quants_per_block(void) { + return QK/gq_t_bits; +} + +static inline int quantize_2_row_size(int k) { + const int nb = quantize_2_blocks_per_row(k); + const int nq = quantize_2_quants_per_block(); + + return nb*(2*sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t)); +} + +void quantize_2_row(const float * restrict src, void * restrict dst, int k) { + assert(k % QK == 0); + + const int nb = quantize_2_blocks_per_row(k); + const int nq = quantize_2_quants_per_block(); + + gq_scale_t * restrict pm = (gq_scale_t *) (dst); + gq_scale_t * restrict pd = (gq_scale_t *) (pm + nb); + gq_quant_t * restrict pb = (gq_quant_t *) (pd + nb); + + gq_quant_t pp[QB]; + + static const int32_t sh[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + }; + 
+ for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + +#ifdef __ARM_NEON + { + float32x4_t minv = vdupq_n_f32(FLT_MAX); + float32x4_t maxv = vdupq_n_f32(-FLT_MAX); + + for (int l = 0; l < QK; l += 4) { + float32x4_t v = vld1q_f32(src + i*QK + l); + minv = vminq_f32(minv, v); + maxv = vmaxq_f32(maxv, v); + } + + float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv)); + float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv)); + + min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1)); + max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1)); + } +#else + { + for (int l = 0; l < QK; l++) { + const float v = src[i*QK + l]; + if (v < min) min = v; + if (v > max) max = v; + } + } +#endif + + const float d = (max - min) / ((1 << QB) - 1); + const float id = d ? 1.0/d : 0.0; + + pm[i] = GGML_FP32_TO_GQ(min); + pd[i] = GGML_FP32_TO_GQ(d); + + for (int s = 0; s < nq; ++s) { + memset(pp, 0, sizeof(pp)); + +#if 1 + for (int l = 0; l < gq_t_bits; l++) { + const float v = src[i*QK + s*gq_t_bits + l]; + const uint8_t q = (v - min)*id + frand(); + + for (int b = 0; b < QB; b++) { + pp[b] |= q & (1 << b) ? (1ULL << l) : 0; + } + } +#elif defined(__ARM_NEON) +#if 1 + { + uint32_t ppt[2*4*QB]; + + float32x4_t minv = vdupq_n_f32(min); + float32x4_t idv = vdupq_n_f32(id); + + assert(gq_t_bits % 16 == 0); + + uint32x4_t p0[QB] = { vdupq_n_u32(0) }; + uint32x4_t p1[QB] = { vdupq_n_u32(0) }; + + for (int l = 0; l < gq_t_bits; l += 16) { + float32x4_t v0 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 0); + float32x4_t v1 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 4); + float32x4_t v2 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 8); + float32x4_t v3 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 12); + + v0 = vsubq_f32(v0, minv); + v1 = vsubq_f32(v1, minv); + v2 = vsubq_f32(v2, minv); + v3 = vsubq_f32(v3, minv); + + v0 = vmulq_f32(v0, idv); + v1 = vmulq_f32(v1, idv); + v2 = vmulq_f32(v2, idv); + v3 = vmulq_f32(v3, idv); + +#if 1 + v0[0] += frand(); v0[1] += frand(); v0[2] += frand(); v0[3] += frand(); + v1[0] += frand(); v1[1] += frand(); v1[2] += frand(); v1[3] += frand(); + v2[0] += frand(); v2[1] += frand(); v2[2] += frand(); v2[3] += frand(); + v3[0] += frand(); v3[1] += frand(); v3[2] += frand(); v3[3] += frand(); +#endif + + uint32x4_t q0 = vcvtq_u32_f32(v0); + uint32x4_t q1 = vcvtq_u32_f32(v1); + uint32x4_t q2 = vcvtq_u32_f32(v2); + uint32x4_t q3 = vcvtq_u32_f32(v3); + + for (int b = 0; b < QB; ++b) { + uint32x4_t m = vdupq_n_u32(1 << b); + uint32x4_t r = vdupq_n_u32(-b); + + if (l < 32) { + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l + 0))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l + 4))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l + 8))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l + 12))); + } else { + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l - 32))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l - 28))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l - 24))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l - 20))); + } + } + } + +#if QB == 4 + vst1q_u32((uint32_t *) ppt + 0, p0[0]); + vst1q_u32((uint32_t *) ppt + 4, p1[0]); + vst1q_u32((uint32_t *) ppt + 8, p0[1]); + vst1q_u32((uint32_t *) ppt + 12, p1[1]); + 
vst1q_u32((uint32_t *) ppt + 16, p0[2]); + vst1q_u32((uint32_t *) ppt + 20, p1[2]); + vst1q_u32((uint32_t *) ppt + 24, p0[3]); + vst1q_u32((uint32_t *) ppt + 28, p1[3]); + + pp[0] = (ppt[0] | ppt[1] | ppt[2] | ppt[3] ) | ((uint64_t) (ppt[4] | ppt[5] | ppt[6] | ppt[7]) ) << 32; + pp[1] = (ppt[8] | ppt[9] | ppt[10] | ppt[11]) | ((uint64_t) (ppt[12] | ppt[13] | ppt[14] | ppt[15])) << 32; + pp[2] = (ppt[16] | ppt[17] | ppt[18] | ppt[19]) | ((uint64_t) (ppt[20] | ppt[21] | ppt[22] | ppt[23])) << 32; + pp[3] = (ppt[24] | ppt[25] | ppt[26] | ppt[27]) | ((uint64_t) (ppt[28] | ppt[29] | ppt[30] | ppt[31])) << 32; +#else + for (int b = 0; b < QB; ++b) { + vst1q_u32((uint32_t *) ppt + 0, p0[b]); + vst1q_u32((uint32_t *) ppt + 4, p1[b]); + + pp[b] = (ppt[0] | ppt[1] | ppt[2] | ppt[3]) | ((uint64_t) (ppt[4] | ppt[5] | ppt[6] | ppt[7])) << 32; + } +#endif + } +#else + // less optimal SIMD + { + float32x4_t minv = vdupq_n_f32(min); + float32x4_t idv = vdupq_n_f32(id); + + assert(gq_t_bits == 64); + uint8_t qq[gq_t_bits]; + + for (int l = 0; l < gq_t_bits; l += 16) { + float32x4_t v0 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 0); + float32x4_t v1 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 4); + float32x4_t v2 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 8); + float32x4_t v3 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 12); + + v0 = vsubq_f32(v0, minv); + v1 = vsubq_f32(v1, minv); + v2 = vsubq_f32(v2, minv); + v3 = vsubq_f32(v3, minv); + + v0 = vmulq_f32(v0, idv); + v1 = vmulq_f32(v1, idv); + v2 = vmulq_f32(v2, idv); + v3 = vmulq_f32(v3, idv); + +#if 0 + v0[0] += frand(); v0[1] += frand(); v0[2] += frand(); v0[3] += frand(); + v1[0] += frand(); v1[1] += frand(); v1[2] += frand(); v1[3] += frand(); + v2[0] += frand(); v2[1] += frand(); v2[2] += frand(); v2[3] += frand(); + v3[0] += frand(); v3[1] += frand(); v3[2] += frand(); v3[3] += frand(); +#endif + + uint32x4_t q0 = vcvtq_u32_f32(v0); + uint32x4_t q1 = vcvtq_u32_f32(v1); + uint32x4_t q2 = vcvtq_u32_f32(v2); + uint32x4_t q3 = vcvtq_u32_f32(v3); + + // store in qq as uint8_t + vst1_u8(qq + l + 0, vmovn_u16(vcombine_u16(vmovn_u32(q0), vmovn_u32(q1)))); + vst1_u8(qq + l + 8, vmovn_u16(vcombine_u16(vmovn_u32(q2), vmovn_u32(q3)))); + } + + for (int l = 0; l < gq_t_bits; l++) { + for (int b = 0; b < QB; b++) { + const uint64_t ql = qq[l]; + /*pp[b] |= qq[l] & (1 << b) ? 
(1ULL << l) : 0;*/ + pp[b] |= ((ql & (1 << b)) >> b) << l; + } + } + } +#endif +#endif + memcpy(pb + i*nq*QB + s*QB, pp, sizeof(pp)); + } + } +} + +// reimplementation of quantize_2 using quantize_2_row +void quantize_2(const float * restrict src, char * restrict dst, int n, int k) { + assert(k % QK == 0); + + for (int j = 0; j < n; j++) { + quantize_2_row(src + j*k, dst, k); + dst = (char *) dst + quantize_2_row_size(k); + } +} + +void vec_dot_gq_2(const int n, float * restrict s, const void * restrict x, const void * restrict y) { + const int nb = quantize_2_blocks_per_row(n); + const int nq = quantize_2_quants_per_block(); + + const gq_scale_t * restrict pm0 = (const gq_scale_t *) x; + const gq_scale_t * restrict pm1 = (const gq_scale_t *) y; + + const gq_scale_t * restrict pd0 = pm0 + nb; + const gq_scale_t * restrict pd1 = pm1 + nb; + + const gq_quant_t * restrict pb0 = (const gq_quant_t *) (pd0 + nb); + const gq_quant_t * restrict pb1 = (const gq_quant_t *) (pd1 + nb); + + float sumf = 0.0; + +#if 1 + for (int i = 0; i < nb; i++) { + const float m0 = GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + +#if QB == 4 + int isum01 = 0; + int isum10 = 0; + int isum11 = 0; + + for (int s = 0; s < nq; ++s) { + const gq_quant_t * restrict mm0 = pb0 + i*nq*QB + s*QB; + const gq_quant_t * restrict mm1 = pb1 + i*nq*QB + s*QB; + +#define bpcnt(x) __builtin_popcountll(x) + isum01 += (1 << 0)*(bpcnt(mm1[0])); + isum01 += (1 << 1)*(bpcnt(mm1[1])); + isum01 += (1 << 2)*(bpcnt(mm1[2])); + isum01 += (1 << 3)*(bpcnt(mm1[3])); + + isum10 += (1 << 0)*(bpcnt(mm0[0])); + isum10 += (1 << 1)*(bpcnt(mm0[1])); + isum10 += (1 << 2)*(bpcnt(mm0[2])); + isum10 += (1 << 3)*(bpcnt(mm0[3])); + + isum11 += (1 << 0)*(bpcnt(mm0[0] & mm1[0])); + isum11 += (1 << 1)*(bpcnt(mm0[0] & mm1[1]) + bpcnt(mm0[1] & mm1[0])); + isum11 += (1 << 2)*(bpcnt(mm0[0] & mm1[2]) + bpcnt(mm0[1] & mm1[1]) + bpcnt(mm0[2] & mm1[0])); + isum11 += (1 << 3)*(bpcnt(mm0[0] & mm1[3]) + bpcnt(mm0[1] & mm1[2]) + bpcnt(mm0[2] & mm1[1]) + bpcnt(mm0[3] & mm1[0])); + isum11 += (1 << 4)*(bpcnt(mm0[1] & mm1[3]) + bpcnt(mm0[2] & mm1[2]) + bpcnt(mm0[3] & mm1[1])); + isum11 += (1 << 5)*(bpcnt(mm0[2] & mm1[3]) + bpcnt(mm0[3] & mm1[2])); + isum11 += (1 << 6)*(bpcnt(mm0[3] & mm1[3])); +#undef bpcnt + } + + sumf += nq*gq_t_bits*(m0*m1) + isum01*(m0*d1) + isum10*(m1*d0) + isum11*(d0*d1); +#elif QB == 3 + int isum01 = 0; + int isum10 = 0; + int isum11 = 0; + + for (int s = 0; s < nq; ++s) { + const gq_quant_t * restrict mm0 = pb0 + i*nq*QB + s*QB; + const gq_quant_t * restrict mm1 = pb1 + i*nq*QB + s*QB; + +#if gq_t_bits == 32 +#define bpcnt(x) __builtin_popcount(x) +#else +#define bpcnt(x) __builtin_popcountll(x) +#endif + isum01 += (1 << 0)*(bpcnt(mm1[0])); + isum01 += (1 << 1)*(bpcnt(mm1[1])); + isum01 += (1 << 2)*(bpcnt(mm1[2])); + + isum10 += (1 << 0)*(bpcnt(mm0[0])); + isum10 += (1 << 1)*(bpcnt(mm0[1])); + isum10 += (1 << 2)*(bpcnt(mm0[2])); + + isum11 += (1 << 0)*(bpcnt(mm0[0] & mm1[0])); + isum11 += (1 << 1)*(bpcnt(mm0[0] & mm1[1]) + bpcnt(mm0[1] & mm1[0])); + isum11 += (1 << 2)*(bpcnt(mm0[0] & mm1[2]) + bpcnt(mm0[1] & mm1[1]) + bpcnt(mm0[2] & mm1[0])); + isum11 += (1 << 3)*(bpcnt(mm0[1] & mm1[2]) + bpcnt(mm0[2] & mm1[1])); + isum11 += (1 << 4)*(bpcnt(mm0[2] & mm1[2])); +#undef bpcnt + } + + sumf += nq*gq_t_bits*(m0*m1) + isum01*(m0*d1) + isum10*(m1*d0) + isum11*(d0*d1); +#elif QB == 2 + int isum01 = 0; + int isum10 = 0; + int isum11 = 0; + + for (int s 
= 0; s < nq; ++s) { + const gq_quant_t * restrict mm0 = pb0 + i*nq*QB + s*QB; + const gq_quant_t * restrict mm1 = pb1 + i*nq*QB + s*QB; + +#if gq_t_bits == 32 +#define bpcnt(x) __builtin_popcount(x) +#else +#define bpcnt(x) __builtin_popcountll(x) +#endif + isum01 += (1 << 0)*(bpcnt(mm1[0])); + isum01 += (1 << 1)*(bpcnt(mm1[1])); + + isum10 += (1 << 0)*(bpcnt(mm0[0])); + isum10 += (1 << 1)*(bpcnt(mm0[1])); + + isum11 += (1 << 0)*(bpcnt(mm0[0] & mm1[0])); + isum11 += (1 << 1)*(bpcnt(mm0[0] & mm1[1]) + bpcnt(mm0[1] & mm1[0])); + isum11 += (1 << 2)*(bpcnt(mm0[1] & mm1[1])); +#undef bpcnt + } + + sumf += nq*gq_t_bits*(m0*m1) + isum01*(m0*d1) + isum10*(m1*d0) + isum11*(d0*d1); +#else + float s0[QB + 1]; + float s1[QB + 1]; + + s0[0] = m0; + s1[0] = m1; + + for (int b = 0; b < QB; b++) { + s0[b + 1] = d0*(1 << b); + s1[b + 1] = d1*(1 << b); + } + + for (int s = 0; s < nq; ++s) { + for (int q0 = 0; q0 < QB + 1; q0++) { + const gq_quant_t mm0 = q0 ? pb0[i*nq*QB + s*QB + q0 - 1] : -1ULL; + for (int q1 = 0; q1 < QB + 1; q1++) { + const gq_quant_t mm1 = q1 ? pb1[i*nq*QB + s*QB + q1 - 1] : -1ULL; + sumf += s0[q0]*s1[q1]*__builtin_popcountll(mm0 & mm1); + } + } + } +#endif + } +#else +#error "not implemented" +#endif + + *s = sumf; +} + +// use vec_dot_gq_2 to compute the dot product of two rows +void mul_mat_gq_2( + const void * src0, + const void * src1, // transposed + float * dst, + int m, int n, int k) { + assert(k % QK == 0); + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + vec_dot_gq_2(k, dst + ir1, src0, src1); + src1 = (const char *) src1 + quantize_2_row_size(k); + } + src0 = (const char *) src0 + quantize_2_row_size(k); + src1 = (const char *) src1 - n*quantize_2_row_size(k); + + dst = (float *) dst + n; + } +} + +// +// method 3 +// (does not work) +// + +static inline int quantize_3_blocks_per_row(int k) { + return k/QK; +} + +static inline int quantize_3_quants_per_block(void) { + return QK/gq_t_bits; +} + +static inline int quantize_3_row_size(int k) { + const int nb = quantize_3_blocks_per_row(k); + const int nq = quantize_3_quants_per_block(); + + return nb*(sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t)); +} + +void quantize_3_row(const float * restrict src, void * restrict dst, int k) { + assert(k % QK == 0); + + const int nb = quantize_3_blocks_per_row(k); + const int nq = quantize_3_quants_per_block(); + + gq_scale_t * restrict pd = (gq_scale_t *) (dst); + gq_quant_t * restrict pb = (gq_quant_t *) (pd + nb); + + gq_quant_t pp[QB]; + + static const int32_t sh[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + }; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // abs max + +#ifdef __ARM_NEON + { + // min / max + //float32x4_t minv = vdupq_n_f32(FLT_MAX); + //float32x4_t maxv = vdupq_n_f32(-FLT_MAX); + + //for (int l = 0; l < QK; l += 4) { + // float32x4_t v = vld1q_f32(src + i*QK + l); + // minv = vminq_f32(minv, v); + // maxv = vmaxq_f32(maxv, v); + //} + + //float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv)); + //float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv)); + + //min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1)); + //max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1)); + + // abs max + float32x4_t amaxv = vdupq_n_f32(0.0f); + + for (int l = 0; l < QK; l += 4) { + float32x4_t v = vld1q_f32(src + i*QK + l); + amaxv = vmaxq_f32(amaxv, vabsq_f32(v)); + } + + float32x2_t amaxv32 = 
vpmax_f32(vget_low_f32(amaxv), vget_high_f32(amaxv)); + + amax = MAX(vget_lane_f32(amaxv32, 0), vget_lane_f32(amaxv32, 1)); + } +#else + { + for (int l = 0; l < QK; l++) { + const float v = src[i*QK + l]; + amax = MAX(amax, fabsf(v)); + } + } +#endif + + const float d = amax / ((1 << (QB - 1)) - 1); + const float id = d ? 1.0/d : 0.0; + + pd[i] = GGML_FP32_TO_GQ(d); + + for (int s = 0; s < nq; ++s) { + memset(pp, 0, sizeof(pp)); + +#if 0 + for (int l = 0; l < gq_t_bits; l++) { + const float v = src[i*QK + s*gq_t_bits + l]; + const uint8_t q = v*id + frand(); + + for (int b = 0; b < QB; b++) { + pp[b] |= q & (1 << b) ? (1ULL << l) : 0; + } + } +#elif defined(__ARM_NEON) + { + uint32_t ppt[2*4*QB]; + + float32x4_t idv = vdupq_n_f32(id); + + assert(gq_t_bits == 64); + + uint32x4_t p0[QB] = { vdupq_n_u32(0) }; + uint32x4_t p1[QB] = { vdupq_n_u32(0) }; + + for (int l = 0; l < gq_t_bits; l += 16) { + float32x4_t v0 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 0); + float32x4_t v1 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 4); + float32x4_t v2 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 8); + float32x4_t v3 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 12); + + v0 = vmulq_f32(v0, idv); + v1 = vmulq_f32(v1, idv); + v2 = vmulq_f32(v2, idv); + v3 = vmulq_f32(v3, idv); + +#if 1 + v0[0] += frand(); v0[1] += frand(); v0[2] += frand(); v0[3] += frand(); + v1[0] += frand(); v1[1] += frand(); v1[2] += frand(); v1[3] += frand(); + v2[0] += frand(); v2[1] += frand(); v2[2] += frand(); v2[3] += frand(); + v3[0] += frand(); v3[1] += frand(); v3[2] += frand(); v3[3] += frand(); +#endif + + uint32x4_t q0 = vcvtq_u32_f32(v0); + uint32x4_t q1 = vcvtq_u32_f32(v1); + uint32x4_t q2 = vcvtq_u32_f32(v2); + uint32x4_t q3 = vcvtq_u32_f32(v3); + + for (int b = 0; b < QB; ++b) { + uint32x4_t m = vdupq_n_u32(1 << b); + int32x4_t r = vdupq_n_s32(-b); + + if (l < 32) { + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l + 0))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l + 4))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l + 8))); + p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l + 12))); + } else { + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l - 32))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l - 28))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l - 24))); + p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l - 20))); + } + } + } + +#if QB == 4 + vst1q_u32((uint32_t *) ppt + 0, p0[0]); + vst1q_u32((uint32_t *) ppt + 4, p1[0]); + vst1q_u32((uint32_t *) ppt + 8, p0[1]); + vst1q_u32((uint32_t *) ppt + 12, p1[1]); + vst1q_u32((uint32_t *) ppt + 16, p0[2]); + vst1q_u32((uint32_t *) ppt + 20, p1[2]); + vst1q_u32((uint32_t *) ppt + 24, p0[3]); + vst1q_u32((uint32_t *) ppt + 28, p1[3]); + + pp[0] = (ppt[0] | ppt[1] | ppt[2] | ppt[3] ) | ((uint64_t) (ppt[4] | ppt[5] | ppt[6] | ppt[7]) ) << 32; + pp[1] = (ppt[8] | ppt[9] | ppt[10] | ppt[11]) | ((uint64_t) (ppt[12] | ppt[13] | ppt[14] | ppt[15])) << 32; + pp[2] = (ppt[16] | ppt[17] | ppt[18] | ppt[19]) | ((uint64_t) (ppt[20] | ppt[21] | ppt[22] | ppt[23])) << 32; + pp[3] = (ppt[24] | ppt[25] | ppt[26] | ppt[27]) | ((uint64_t) (ppt[28] | ppt[29] | ppt[30] | ppt[31])) << 32; +#else + for (int q = 0; q < QB; ++q) { + vst1q_u32((uint32_t *) ppt + 0, p0[q]); + vst1q_u32((uint32_t *) ppt + 4, 
p1[q]); + + pp[q] = (ppt[0] | ppt[1] | ppt[2] | ppt[3]) | ((uint64_t) (ppt[4] | ppt[5] | ppt[6] | ppt[7])) << 32; + } +#endif + } +#endif + memcpy(pb + i*nq*QB + s*QB, pp, sizeof(pp)); + } + } +} + +// reimplementation of quantize_3 using quantize_3_row +void quantize_3(const float * restrict src, char * restrict dst, int n, int k) { + assert(k % QK == 0); + + for (int j = 0; j < n; j++) { + quantize_3_row(src + j*k, dst, k); + dst = (char *) dst + quantize_3_row_size(k); + } +} + +void vec_dot_gq_3(const int n, float * restrict s, const void * restrict x, const void * restrict y) { + float sumf = 0.0f; + + const int nb = quantize_3_blocks_per_row(n); + const int nq = quantize_3_quants_per_block(); + + const gq_scale_t * restrict pd0 = (const gq_scale_t *) x; + const gq_scale_t * restrict pd1 = (const gq_scale_t *) y; + + const gq_quant_t * restrict pb0 = (const gq_quant_t *) (pd0 + nb); + const gq_quant_t * restrict pb1 = (const gq_quant_t *) (pd1 + nb); + +#if 1 + for (int i = 0; i < nb; i++) { + int isum = 0; + +#if QB == 4 + for (int s = 0; s < nq; ++s) { + const gq_quant_t * restrict m0 = pb0 + i*nq*QB + s*QB; + const gq_quant_t * restrict m1 = pb1 + i*nq*QB + s*QB; + + isum += (1 << 0)*(__builtin_popcountll(m0[0] & m1[0])); + isum += (1 << 1)*(__builtin_popcountll(m0[0] & m1[1]) + __builtin_popcountll(m0[1] & m1[0])); + isum += (1 << 2)*(__builtin_popcountll(m0[0] & m1[2]) + __builtin_popcountll(m0[1] & m1[1]) + __builtin_popcountll(m0[2] & m1[0])); + isum += (1 << 3)*(__builtin_popcountll(m0[0] & m1[3]) + __builtin_popcountll(m0[1] & m1[2]) + __builtin_popcountll(m0[2] & m1[1]) + __builtin_popcountll(m0[3] & m1[0])); + isum += (1 << 4)*(__builtin_popcountll(m0[1] & m1[3]) + __builtin_popcountll(m0[2] & m1[2]) + __builtin_popcountll(m0[3] & m1[1])); + isum += (1 << 5)*(__builtin_popcountll(m0[2] & m1[3]) + __builtin_popcountll(m0[3] & m1[2])); + isum += (1 << 6)*(__builtin_popcountll(m0[3] & m1[3])); + } +#else + for (int s = 0; s < nq; ++s) { + for (int q0 = 0; q0 < QB; q0++) { + const gq_quant_t mm0 = pb0[i*nq*QB + s*QB + q0]; + for (int q1 = 0; q1 < QB; q1++) { + const gq_quant_t mm1 = pb1[i*nq*QB + s*QB + q1]; + isum += (1 << (q0 + q1))*(__builtin_popcountll(mm0 & mm1)); + } + } + } +#endif + + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + sumf += d0*d1*isum; + } +#else +#ifdef __ARM_NEON + // gq_quant_t == uint64_t + for (int i = 0; i < nb; i += 4) { + int isum[4] = {0, 0, 0, 0}; + + for (int k = 0; k < 4; ++k) { + for (int s = 0; s < nq; ++s) { + const gq_quant_t * restrict m0 = pb0 + (i+k)*nq*QB + s*QB; + const gq_quant_t * restrict m1 = pb1 + (i+k)*nq*QB + s*QB; + +#if QB == 4 +#define bpcnt(x) __builtin_popcountll(x) + //isum[k] += (1ULL << 0)*(bpcnt(m0[0] & m1[0])) + + // (1ULL << 1)*(bpcnt(m0[0] & m1[1]) + bpcnt(m0[1] & m1[0])) + + // (1ULL << 2)*(bpcnt(m0[0] & m1[2]) + bpcnt(m0[1] & m1[1]) + bpcnt(m0[2] & m1[0])) + + // (1ULL << 3)*(bpcnt(m0[0] & m1[3]) + bpcnt(m0[1] & m1[2]) + bpcnt(m0[2] & m1[1]) + bpcnt(m0[3] & m1[0])) + + // (1ULL << 4)*(bpcnt(m0[1] & m1[3]) + bpcnt(m0[2] & m1[2]) + bpcnt(m0[3] & m1[1])) + + // (1ULL << 5)*(bpcnt(m0[2] & m1[3]) + bpcnt(m0[3] & m1[2])) + + // (1ULL << 6)*(bpcnt(m0[3] & m1[3])); +#undef bpcnt + + const uint8x8_t m00 = vld1_u8((const uint8_t *) (m0 + 0)); + const uint8x8_t m01 = vld1_u8((const uint8_t *) (m0 + 1)); + const uint8x8_t m02 = vld1_u8((const uint8_t *) (m0 + 2)); + const uint8x8_t m03 = vld1_u8((const uint8_t *) (m0 + 3)); + + const uint8x8_t m10 = vld1_u8((const uint8_t *) (m1 + 
0)); + const uint8x8_t m11 = vld1_u8((const uint8_t *) (m1 + 1)); + const uint8x8_t m12 = vld1_u8((const uint8_t *) (m1 + 2)); + const uint8x8_t m13 = vld1_u8((const uint8_t *) (m1 + 3)); + + const uint8x8_t m00m10 = vand_u8(m00, m10); + + const uint8x8_t m00m11 = vand_u8(m00, m11); + const uint8x8_t m01m10 = vand_u8(m01, m10); + + const uint8x8_t m00m12 = vand_u8(m00, m12); + const uint8x8_t m01m11 = vand_u8(m01, m11); + const uint8x8_t m02m10 = vand_u8(m02, m10); + + const uint8x8_t m00m13 = vand_u8(m00, m13); + const uint8x8_t m01m12 = vand_u8(m01, m12); + const uint8x8_t m02m11 = vand_u8(m02, m11); + const uint8x8_t m03m10 = vand_u8(m03, m10); + + const uint8x8_t m01m13 = vand_u8(m01, m13); + const uint8x8_t m02m12 = vand_u8(m02, m12); + const uint8x8_t m03m11 = vand_u8(m03, m11); + + const uint8x8_t m02m13 = vand_u8(m02, m13); + const uint8x8_t m03m12 = vand_u8(m03, m12); + + const uint8x8_t m03m13 = vand_u8(m03, m13); + +#define bpcnt(x) vaddv_u8(vcnt_u8(x)) + isum[k] += (1ULL << 0)*(bpcnt(m00m10)) + + (1ULL << 1)*(bpcnt(m00m11) + bpcnt(m01m10)) + + (1ULL << 2)*(bpcnt(m00m12) + bpcnt(m01m11) + bpcnt(m02m10)) + + (1ULL << 3)*(bpcnt(m00m13) + bpcnt(m01m12) + bpcnt(m02m11) + bpcnt(m03m10)) + + (1ULL << 4)*(bpcnt(m01m13) + bpcnt(m02m12) + bpcnt(m03m11)) + + (1ULL << 5)*(bpcnt(m02m13) + bpcnt(m03m12)) + + (1ULL << 6)*(bpcnt(m03m13)); +#undef bpcnt +#else + for (int q0 = 0; q0 < QB; q0++) { + const gq_quant_t mm0 = m0[q0]; + for (int q1 = 0; q1 < QB; q1++) { + const gq_quant_t mm1 = m1[q1]; + isum[k] += (1ULL << (q0 + q1))*(__builtin_popcountll(mm0 & mm1)); + } + } +#endif + } + } + + int32x4_t isumv = vld1q_s32(isum); + + float32x4_t d0v = vld1q_f32(pd0 + i); + float32x4_t d1v = vld1q_f32(pd1 + i); + + float32x4_t sumfv = vmulq_f32(d0v, d1v); + + sumfv = vmulq_f32(sumfv, vcvtq_f32_s32(isumv)); + sumf += vaddvq_f32(sumfv); + } +#else +#error "not implemented" +#endif + +#endif + *s = sumf; +} + +// use vec_dot_gq_3 to compute the dot product of two rows +void mul_mat_gq_3( + const void * src0, + const void * src1, // transposed + float * dst, + int m, int n, int k) { + assert(k % QK == 0); + + const int nb = quantize_3_blocks_per_row(k); + const int nq = quantize_3_quants_per_block(); + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + vec_dot_gq_3(k, dst + ir1, src0, src1); + src1 = (const char *) src1 + quantize_3_row_size(k); + } + src0 = (const char *) src0 + quantize_3_row_size(k); + src1 = (const char *) src1 - n*quantize_3_row_size(k); + + dst = (float *) dst + n; + } +} + +// +// method 4 +// 4-bit quantization +// + +static inline int quantize_4_blocks_per_row(int k) { + return k/QK; +} + +static inline int quantize_4_row_size(int k) { + const int nb = quantize_4_blocks_per_row(k); + + return nb*(2*sizeof(gq_scale_t) + QK/2); +} + +void quantize_4_row(const float * restrict src, void * restrict dst, int k) { + assert(k % QK == 0); + assert(QB == 4); + + const int nb = quantize_4_blocks_per_row(k); + + gq_scale_t * restrict pm = (gq_scale_t *) (dst); + gq_scale_t * restrict pd = (gq_scale_t *) (pm + nb); + uint8_t * restrict pb = (uint8_t *) (pd + nb); + + uint8_t pp[QK/2]; + + for (int i = 0; i < nb; i++) { + memset(pp, 0, sizeof(pp)); + + float min = FLT_MAX; + float max = -FLT_MAX; + +#if defined(__AVX2__) + { + assert(QK == 64); + enum { QK8 = QK/8 }; + + __m256 srcv[QK8]; + __m256 minv[QK8]; + __m256 maxv[QK8]; + + for (int l = 0; l < QK8; l++) { + srcv[l] = _mm256_loadu_ps(src + i*QK + 8*l); + } + + for (int l = 0; l < QK8/2; l++) { + minv[2*l] = 
_mm256_min_ps(srcv[2*l], srcv[2*l+1]); + maxv[2*l] = _mm256_max_ps(srcv[2*l], srcv[2*l+1]); + } + + for (int l = 0; l < QK8/4; l++) { + minv[4*l] = _mm256_min_ps(minv[4*l], minv[4*l+2]); + maxv[4*l] = _mm256_max_ps(maxv[4*l], maxv[4*l+2]); + } + + for (int l = 0; l < QK8/8; l++) { + minv[8*l] = _mm256_min_ps(minv[8*l], minv[8*l+4]); + maxv[8*l] = _mm256_max_ps(maxv[8*l], maxv[8*l+4]); + } + + //min = MIN(minv[0][0], MIN(minv[0][1], MIN(minv[0][2], MIN(minv[0][3], MIN(minv[0][4], MIN(minv[0][5], MIN(minv[0][6], minv[0][7]))))))); + //max = MAX(maxv[0][0], MAX(maxv[0][1], MAX(maxv[0][2], MAX(maxv[0][3], MAX(maxv[0][4], MAX(maxv[0][5], MAX(maxv[0][6], maxv[0][7]))))))); + + const __m256 minv0_0 = _mm256_permute2f128_ps(minv[0], minv[0], 3); + const __m256 minv0_1 = _mm256_min_ps(minv[0], minv0_0); + const __m256 minv0_2 = _mm256_permute_ps(minv0_1, 0x4e); + const __m256 minv0_3 = _mm256_min_ps(minv0_1, minv0_2); + const __m256 minv0_4 = _mm256_permute_ps(minv0_3, 0xb1); + const __m256 minv0_5 = _mm256_min_ps(minv0_3, minv0_4); + + const __m256 maxv0_0 = _mm256_permute2f128_ps(maxv[0], maxv[0], 3); + const __m256 maxv0_1 = _mm256_max_ps(maxv[0], maxv0_0); + const __m256 maxv0_2 = _mm256_permute_ps(maxv0_1, 0x4e); + const __m256 maxv0_3 = _mm256_max_ps(maxv0_1, maxv0_2); + const __m256 maxv0_4 = _mm256_permute_ps(maxv0_3, 0xb1); + const __m256 maxv0_5 = _mm256_max_ps(maxv0_3, maxv0_4); + + min = _mm256_cvtss_f32(minv0_5); + max = _mm256_cvtss_f32(maxv0_5); + + const float d = (max - min) / ((1 << QB) - 2); + const float id = d ? 1.0/d : 0.0; + + pm[i] = GGML_FP32_TO_GQ(min); + pd[i] = GGML_FP32_TO_GQ(d); + + const __m256 idv = _mm256_set1_ps(id); + + for (int l = 0; l < QK/8; l++) { + __m256 v = _mm256_mul_ps(_mm256_sub_ps(srcv[l], _mm256_set1_ps(min)), idv); +#if 0 + v[0] += frand(); v[1] += frand(); v[2] += frand(); v[3] += frand(); + v[4] += frand(); v[5] += frand(); v[6] += frand(); v[7] += frand(); +#endif + + // convert to uint8 + __m256i vi = _mm256_cvtps_epi32(v); + + uint32_t vi_0 = _mm256_extract_epi32(vi, 0); + uint32_t vi_1 = _mm256_extract_epi32(vi, 1); + uint32_t vi_2 = _mm256_extract_epi32(vi, 2); + uint32_t vi_3 = _mm256_extract_epi32(vi, 3); + + uint32_t vi_4 = _mm256_extract_epi32(vi, 4); + uint32_t vi_5 = _mm256_extract_epi32(vi, 5); + uint32_t vi_6 = _mm256_extract_epi32(vi, 6); + uint32_t vi_7 = _mm256_extract_epi32(vi, 7); + + // convert to 4-bit, 2 consecutive packed into 1 byte + pp[4*l + 0] = vi_0 | (vi_1 << 4); + pp[4*l + 1] = vi_2 | (vi_3 << 4); + pp[4*l + 2] = vi_4 | (vi_5 << 4); + pp[4*l + 3] = vi_6 | (vi_7 << 4); + + //printf("vi: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7); + //printf("v : %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); + } + + memcpy(pb + i*QK/2, pp, sizeof(pp)); + } +#elif defined(__ARM_NEON) && 0 + { + // TODO + } +#else + { + for (int l = 0; l < QK; l++) { + const float v = src[i*QK + l]; + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << QB) - 1); + const float id = d ? 
1.0/d : 0.0; + + pm[i] = GGML_FP32_TO_GQ(min); + pd[i] = GGML_FP32_TO_GQ(d); + + for (int l = 0; l < QK; l++) { + const float v = (src[i*QK + l] - min) * id; + const uint8_t vi = (uint8_t) (v + frand()); + pp[l/2] |= (vi & 0xf) << (4*(l & 1)); + } + + memcpy(pb + i*QK/2, pp, sizeof(pp)); + } +#endif + //printf("min %f max %f\n", min, max); + } +} + +// reimplementation of quantize_4 using quantize_4_row +void quantize_4(const float * restrict src, char * restrict dst, int n, int k) { + assert(k % QK == 0); + + for (int j = 0; j < n; j++) { + quantize_4_row(src + j*k, dst, k); + dst = (char *) dst + quantize_4_row_size(k); + } +} + +void vec_dot_gq_4(const int n, float * restrict s, const void * restrict x, const void * restrict y) { + const int nb = quantize_4_blocks_per_row(n); + + const gq_scale_t * restrict pm0 = (const gq_scale_t *) x; + const gq_scale_t * restrict pm1 = (const gq_scale_t *) y; + + const gq_scale_t * restrict pd0 = pm0 + nb; + const gq_scale_t * restrict pd1 = pm1 + nb; + + const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb); + const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb); + + float sumf = 0.0; + +#if 0 + // scalar + for (int i = 0; i < nb; i++) { + const float m0 = GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + for (int j = 0; j < QK/2; j++) { + const uint8_t v0 = p0[j]; + const uint8_t v1 = p1[j]; + + const float f0 = d0*(v0 & 0xf) + m0; + const float f1 = d0*(v0 >> 4) + m0; + + const float f2 = d1*(v1 & 0xf) + m1; + const float f3 = d1*(v1 >> 4) + m1; + + sumf += f0*f2 + f1*f3; + } + } +#else +#if defined(__AVX2__) +#if QK == 64 && 0 + __m256 sumv0 = _mm256_setzero_ps(); + __m256 sumv1 = _mm256_setzero_ps(); + + for (int i = 0; i < nb; i++) { + const float m0 = GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + const __m256 m0v = _mm256_set1_ps(m0); + const __m256 d0v = _mm256_set1_ps(d0); + + const __m256 m1v = _mm256_set1_ps(m1); + const __m256 d1v = _mm256_set1_ps(d1); + + const __m256i m4b = _mm256_set1_epi8(0xf); + + __m256i v0 = _mm256_loadu_si256((__m256i *) p0); + + //_mm_prefetch((const char *) (p0 + 32), _MM_HINT_T0); + //_mm_prefetch((const char *) (p1 + 32), _MM_HINT_T0); + //_mm_prefetch((const char *) (pm0 + i + 1), _MM_HINT_T0); + //_mm_prefetch((const char *) (pm1 + i + 1), _MM_HINT_T0); + //_mm_prefetch((const char *) (pd0 + i + 1), _MM_HINT_T0); + //_mm_prefetch((const char *) (pd1 + i + 1), _MM_HINT_T0); + + __m256i v00 = _mm256_and_si256(v0, _mm256_set1_epi32(0x000000FF)); + __m256i v01 = _mm256_srli_epi32(_mm256_and_si256(v0, _mm256_set1_epi32(0x0000FFFF)), 8); + __m256i v02 = _mm256_srli_epi32(_mm256_and_si256(v0, _mm256_set1_epi32(0x00FFFFFF)), 16); + __m256i v03 = _mm256_srli_epi32(v0, 24); + + ////////////////////// + + //{ + // uint32_t vi_0 = _mm256_extract_epi32(v00, 0); + // uint32_t vi_1 = _mm256_extract_epi32(v00, 1); + // uint32_t vi_2 = _mm256_extract_epi32(v00, 2); + // uint32_t vi_3 = _mm256_extract_epi32(v00, 3); + // uint32_t vi_4 = _mm256_extract_epi32(v00, 4); + // uint32_t vi_5 = _mm256_extract_epi32(v00, 5); + // uint32_t vi_6 = _mm256_extract_epi32(v00, 6); + // uint32_t vi_7 = 
_mm256_extract_epi32(v00, 7); + // printf("v0: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7); + // printf("p0: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[0], p0[4], p0[8], p0[12], p0[16], p0[20], p0[24], p0[28]); + // printf("p1: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[1], p0[5], p0[9], p0[13], p0[17], p0[21], p0[25], p0[29]); + // printf("p2: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[2], p0[6], p0[10], p0[14], p0[18], p0[22], p0[26], p0[30]); + // printf("p3: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[3], p0[7], p0[11], p0[15], p0[19], p0[23], p0[27], p0[31]); + //} + + // compute 32 x 4-bit values (low and high) + __m256i v00l = _mm256_and_si256(v00, m4b); + __m256i v01l = _mm256_and_si256(v01, m4b); + __m256i v02l = _mm256_and_si256(v02, m4b); + __m256i v03l = _mm256_and_si256(v03, m4b); + + __m256i v00h = _mm256_srli_epi32(v00, 4); + __m256i v01h = _mm256_srli_epi32(v01, 4); + __m256i v02h = _mm256_srli_epi32(v02, 4); + __m256i v03h = _mm256_srli_epi32(v03, 4); + + //{ + // uint32_t vi_0 = _mm256_extract_epi32(v00l, 0); + // uint32_t vi_1 = _mm256_extract_epi32(v00l, 1); + // uint32_t vi_2 = _mm256_extract_epi32(v00l, 2); + // uint32_t vi_3 = _mm256_extract_epi32(v00l, 3); + // uint32_t vi_4 = _mm256_extract_epi32(v00l, 4); + // uint32_t vi_5 = _mm256_extract_epi32(v00l, 5); + // uint32_t vi_6 = _mm256_extract_epi32(v00l, 6); + // uint32_t vi_7 = _mm256_extract_epi32(v00l, 7); + + // printf("v0l: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7); + + // vi_0 = _mm256_extract_epi32(v00h, 0); + // vi_1 = _mm256_extract_epi32(v00h, 1); + // vi_2 = _mm256_extract_epi32(v00h, 2); + // vi_3 = _mm256_extract_epi32(v00h, 3); + // vi_4 = _mm256_extract_epi32(v00h, 4); + // vi_5 = _mm256_extract_epi32(v00h, 5); + // vi_6 = _mm256_extract_epi32(v00h, 6); + // vi_7 = _mm256_extract_epi32(v00h, 7); + + // printf("v0h: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7); + //} + + // convert to float + __m256 vf00l = _mm256_cvtepi32_ps(v00l); + __m256 vf01l = _mm256_cvtepi32_ps(v01l); + __m256 vf02l = _mm256_cvtepi32_ps(v02l); + __m256 vf03l = _mm256_cvtepi32_ps(v03l); + + __m256 vf00h = _mm256_cvtepi32_ps(v00h); + __m256 vf01h = _mm256_cvtepi32_ps(v01h); + __m256 vf02h = _mm256_cvtepi32_ps(v02h); + __m256 vf03h = _mm256_cvtepi32_ps(v03h); + + //{ + // printf("vf00l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf00l[0], vf00l[1], vf00l[2], vf00l[3], vf00l[4], vf00l[5], vf00l[6], vf00l[7]); + // printf("vf01l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf01l[0], vf01l[1], vf01l[2], vf01l[3], vf01l[4], vf01l[5], vf01l[6], vf01l[7]); + // printf("vf02l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf02l[0], vf02l[1], vf02l[2], vf02l[3], vf02l[4], vf02l[5], vf02l[6], vf02l[7]); + // printf("vf03l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf03l[0], vf03l[1], vf03l[2], vf03l[3], vf03l[4], vf03l[5], vf03l[6], vf03l[7]); + //} + + // multiply by scale and add offset + vf00l = _mm256_fmadd_ps(vf00l, d0v, m0v); + vf01l = _mm256_fmadd_ps(vf01l, d0v, m0v); + vf02l = _mm256_fmadd_ps(vf02l, d0v, m0v); + vf03l = _mm256_fmadd_ps(vf03l, d0v, m0v); + + vf00h = _mm256_fmadd_ps(vf00h, d0v, m0v); + vf01h = _mm256_fmadd_ps(vf01h, d0v, m0v); + vf02h = _mm256_fmadd_ps(vf02h, d0v, m0v); + vf03h = _mm256_fmadd_ps(vf03h, d0v, m0v); + + __m256i v1 = _mm256_loadu_si256((__m256i *) p1); + + __m256i v10 = _mm256_and_si256(v1, _mm256_set1_epi32(0x000000FF)); + __m256i v11 = 
_mm256_srli_epi32(_mm256_and_si256(v1, _mm256_set1_epi32(0x0000FFFF)), 8); + __m256i v12 = _mm256_srli_epi32(_mm256_and_si256(v1, _mm256_set1_epi32(0x00FFFFFF)), 16); + __m256i v13 = _mm256_srli_epi32(v1, 24); + + __m256i v10l = _mm256_and_si256(v10, m4b); + __m256i v11l = _mm256_and_si256(v11, m4b); + __m256i v12l = _mm256_and_si256(v12, m4b); + __m256i v13l = _mm256_and_si256(v13, m4b); + + __m256i v10h = _mm256_srli_epi32(v10, 4); + __m256i v11h = _mm256_srli_epi32(v11, 4); + __m256i v12h = _mm256_srli_epi32(v12, 4); + __m256i v13h = _mm256_srli_epi32(v13, 4); + + __m256 vf10l = _mm256_cvtepi32_ps(v10l); + __m256 vf11l = _mm256_cvtepi32_ps(v11l); + __m256 vf12l = _mm256_cvtepi32_ps(v12l); + __m256 vf13l = _mm256_cvtepi32_ps(v13l); + + __m256 vf10h = _mm256_cvtepi32_ps(v10h); + __m256 vf11h = _mm256_cvtepi32_ps(v11h); + __m256 vf12h = _mm256_cvtepi32_ps(v12h); + __m256 vf13h = _mm256_cvtepi32_ps(v13h); + + vf10l = _mm256_fmadd_ps(vf10l, d1v, m1v); + vf11l = _mm256_fmadd_ps(vf11l, d1v, m1v); + vf12l = _mm256_fmadd_ps(vf12l, d1v, m1v); + vf13l = _mm256_fmadd_ps(vf13l, d1v, m1v); + + vf10h = _mm256_fmadd_ps(vf10h, d1v, m1v); + vf11h = _mm256_fmadd_ps(vf11h, d1v, m1v); + vf12h = _mm256_fmadd_ps(vf12h, d1v, m1v); + vf13h = _mm256_fmadd_ps(vf13h, d1v, m1v); + + // compute dot product + sumv0 = _mm256_fmadd_ps(vf00l, vf10l, sumv0); + sumv0 = _mm256_fmadd_ps(vf01l, vf11l, sumv0); + sumv0 = _mm256_fmadd_ps(vf02l, vf12l, sumv0); + sumv0 = _mm256_fmadd_ps(vf03l, vf13l, sumv0); + + sumv1 = _mm256_fmadd_ps(vf00h, vf10h, sumv1); + sumv1 = _mm256_fmadd_ps(vf01h, vf11h, sumv1); + sumv1 = _mm256_fmadd_ps(vf02h, vf12h, sumv1); + sumv1 = _mm256_fmadd_ps(vf03h, vf13h, sumv1); + } + + // accumulate (horizontal sum) + const __m256 vdot = _mm256_add_ps(sumv0, sumv1); + const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(vdot), _mm256_extractf128_ps(vdot, 1)); + const __m128 t1 = _mm_hadd_ps(t0, t0); + + sumf += _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); +#elif QK == 64 && 0 + float sum00 = 0.0f; + float sum01 = 0.0f; + float sum10 = 0.0f; + float sum11 = 0.0f; + + const __m256i m4b = _mm256_set1_epi8(0xf); + + for (int i = 0; i < nb; i++) { + const float m0 = GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + // 64 x 4 + const __m256i v0 = _mm256_loadu_si256((__m256i *) p0); + const __m256i v1 = _mm256_loadu_si256((__m256i *) p1); + + // 32 x 8 + const __m256i v0l = _mm256_and_si256(v0, m4b); + const __m256i v1l = _mm256_and_si256(v1, m4b); + + const __m256i v0h = _mm256_and_si256(_mm256_srli_epi16(v0, 4), m4b); + const __m256i v1h = _mm256_and_si256(_mm256_srli_epi16(v1, 4), m4b); + + const __m256i pl = _mm256_maddubs_epi16(v0l, v1l); + const __m256i ph = _mm256_maddubs_epi16(v0h, v1h); + + const __m256i p16 = _mm256_add_epi16(ph, pl); + const __m256i p = _mm256_madd_epi16(_mm256_set1_epi16(1), p16); + + sum00 += m0*m1; + sum01 += m1*d0*(_mm256_hadd_epi8_gg(_mm256_add_epi8(v0l, v0h))); + sum10 += m0*d1*(_mm256_hadd_epi8_gg(_mm256_add_epi8(v1l, v1h))); + sum11 += d0*d1*(_mm256_hadd_epi32_gg(p)); + } + + sumf = 64.0*sum00 + sum01 + sum10 + sum11; +#elif QK == 64 && 1 // this is the best when using min + d + float sum00 = 0.0f; + + __m256 sum01 = _mm256_setzero_ps(); + __m256 sum10 = _mm256_setzero_ps(); + __m256 sum11 = _mm256_setzero_ps(); + + for (int i = 0; i < nb; i++) { + const float m0 = 
GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + const __m256 m0v = _mm256_set1_ps(m0); + const __m256 d0v = _mm256_set1_ps(d0); + + const __m256 m1v = _mm256_set1_ps(m1); + const __m256 d1v = _mm256_set1_ps(d1); + + const __m256 m1d0v = _mm256_mul_ps(m1v, d0v); + const __m256 m0d1v = _mm256_mul_ps(m0v, d1v); + const __m256 d0d1v = _mm256_mul_ps(d0v, d1v); + + const __m256i m4b = _mm256_set1_epi8(0xf); + + // 64 x 4 + const __m256i v0 = _mm256_loadu_si256((__m256i *) p0); + const __m256i v1 = _mm256_loadu_si256((__m256i *) p1); + + // 32 x 8 + const __m256i v0l = _mm256_and_si256(v0, m4b); + const __m256i v1l = _mm256_and_si256(v1, m4b); + + const __m256i v0h = _mm256_and_si256(_mm256_srli_epi16(v0, 4), m4b); + const __m256i v1h = _mm256_and_si256(_mm256_srli_epi16(v1, 4), m4b); + + const __m256i v0a = _mm256_add_epi8(v0l, v0h); + const __m256i v1a = _mm256_add_epi8(v1l, v1h); + + const __m128i v0al = _mm256_extracti128_si256(v0a, 0); + const __m128i v0ah = _mm256_extracti128_si256(v0a, 1); + + const __m128i v1al = _mm256_extracti128_si256(v1a, 0); + const __m128i v1ah = _mm256_extracti128_si256(v1a, 1); + + const __m128i v0as = _mm_add_epi8(v0al, v0ah); + const __m128i v1as = _mm_add_epi8(v1al, v1ah); + + const __m256i v0as_0 = _mm256_cvtepu8_epi32(v0as); + const __m256i v0as_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(v0as, 8)); + + const __m256i v1as_0 = _mm256_cvtepu8_epi32(v1as); + const __m256i v1as_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(v1as, 8)); + + const __m256i v0ass = _mm256_add_epi32(v0as_0, v0as_1); + const __m256i v1ass = _mm256_add_epi32(v1as_0, v1as_1); + + const __m256 v0f = _mm256_cvtepi32_ps(v0ass); + const __m256 v1f = _mm256_cvtepi32_ps(v1ass); + + const __m256i pl = _mm256_maddubs_epi16(v0l, v1l); + const __m256i ph = _mm256_maddubs_epi16(v0h, v1h); + + const __m256i p16 = _mm256_add_epi16(ph, pl); + const __m256i p = _mm256_madd_epi16(_mm256_set1_epi16(1), p16); + + sum00 += m0*m1; + sum01 = _mm256_fmadd_ps(m1d0v, v0f, sum01); + sum10 = _mm256_fmadd_ps(m0d1v, v1f, sum10); + sum11 = _mm256_fmadd_ps(d0d1v, _mm256_cvtepi32_ps(p), sum11); + } + + sumf = 64.0*sum00 + _mm256_hadd_ps_gg(sum01) + _mm256_hadd_ps_gg(sum10) + _mm256_hadd_ps_gg(sum11); +#endif +#elif defined (__ARM_NEON) + float sum00 = 0.0f; + float sum01 = 0.0f; + float sum10 = 0.0f; + float sum11 = 0.0f; + + for (int i = 0; i < nb; i++) { + const float m0 = GGML_GQ_TO_FP32(pm0[i]); + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + + const float m1 = GGML_GQ_TO_FP32(pm1[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + + const uint8x16_t v0_0 = vld1q_u8(p0); + const uint8x16_t v0_1 = vld1q_u8(p0 + 16); + const uint8x16_t v1_0 = vld1q_u8(p1); + const uint8x16_t v1_1 = vld1q_u8(p1 + 16); + + // and with 0xf + const uint8x16_t v0_0l = vandq_u8(v0_0, m4b); + const uint8x16_t v0_1l = vandq_u8(v0_1, m4b); + const uint8x16_t v1_0l = vandq_u8(v1_0, m4b); + const uint8x16_t v1_1l = vandq_u8(v1_1, m4b); + + const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4); + const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4); + const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4); + const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4); + + // dot product into uint16x8_t + const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), 
vget_low_u8 (v1_0l)); + const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l)); + const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l)); + const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l)); + + const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h)); + const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h)); + const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h)); + const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h)); + + const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h); + const uint16x8_t pl1 = vaddq_u16(pl1l, pl1h); + const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h); + const uint16x8_t ph1 = vaddq_u16(ph1l, ph1h); + + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t ph = vaddq_u16(ph0, ph1); + + sum00 += m0*m1; + sum01 += m1*d0*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h) + vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h)); + sum10 += m0*d1*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h) + vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h)); + //sum11 += d0*d1*( + // vaddvq_u16(vaddq_u16(vaddq_u16(pl0l, pl0h), vaddq_u16(pl1l, pl1h))) + + // vaddvq_u16(vaddq_u16(vaddq_u16(ph0l, ph0h), vaddq_u16(ph1l, ph1h)))); + sum11 += d0*d1*vaddvq_u16(vaddq_u16(pl, ph)); + } + + sumf = 64.0*sum00 + sum01 + sum10 + sum11; +#endif +#endif + + *s = sumf; +} + +// use vec_dot_gq_4 to compute the dot product of two rows +void mul_mat_gq_4( + const void * src0, + const void * src1, // transposed + float * dst, + int m, int n, int k) { + assert(k % QK == 0); + + const int nb = quantize_4_blocks_per_row(k); + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + vec_dot_gq_4(k, dst + ir1, src0, src1); + src1 = (const char *) src1 + quantize_4_row_size(k); + } + src0 = (const char *) src0 + quantize_4_row_size(k); + src1 = (const char *) src1 - n*quantize_4_row_size(k); + + dst = (float *) dst + n; + } +} + +// +// method 5 +// 4-bit quantization (without min, only delta) +// + +static inline int quantize_5_blocks_per_row(int k) { + return k/QK; +} + +static inline int quantize_5_row_size(int k) { + const int nb = quantize_5_blocks_per_row(k); + + return nb*(sizeof(gq_scale_t) + QK/2); +} + +void quantize_5_row(const float * restrict src, void * restrict dst, int k) { + assert(k % QK == 0); + assert(QB == 4); + + const int nb = quantize_5_blocks_per_row(k); + + gq_scale_t * restrict pd = (gq_scale_t *) (dst); + uint8_t * restrict pb = (uint8_t *) (pd + nb); + + uint8_t pp[QK/2]; + + for (int i = 0; i < nb; i++) { + memset(pp, 0, sizeof(pp)); + + float amax = 0.0f; // absolute max + +#if defined(__AVX2__) + { + assert(QK == 64); + enum { QK8 = QK/8 }; + + __m256 srcv [QK8]; + __m256 asrcv[QK8]; + __m256 amaxv[QK8]; + + for (int l = 0; l < QK8; l++) { + srcv[l] = _mm256_loadu_ps(src + i*QK + 8*l); + } + + for (int l = 0; l < QK8; l++) { + asrcv[l] = _mm256_and_ps(srcv[l], _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); + } + + + for (int l = 0; l < QK8/2; l++) { + amaxv[2*l] = _mm256_max_ps(asrcv[2*l], asrcv[2*l+1]); + } + + for (int l = 0; l < QK8/4; l++) { + amaxv[4*l] = _mm256_max_ps(amaxv[4*l], amaxv[4*l+2]); + } + + for (int l = 0; l < QK8/8; l++) { + amaxv[8*l] = _mm256_max_ps(amaxv[8*l], amaxv[8*l+4]); + } + + //amax = MAX(amaxv[0][0], MAX(amaxv[0][1], MAX(amaxv[0][2], MAX(amaxv[0][3], MAX(amaxv[0][4], MAX(amaxv[0][5], MAX(amaxv[0][6], amaxv[0][7]))))))); + + const __m256 amaxv0_0 = _mm256_permute2f128_ps(amaxv[0], amaxv[0], 3); + const __m256 amaxv0_1 = 
_mm256_max_ps(amaxv[0], amaxv0_0); + const __m256 amaxv0_2 = _mm256_permute_ps(amaxv0_1, 0x4e); + const __m256 amaxv0_3 = _mm256_max_ps(amaxv0_1, amaxv0_2); + const __m256 amaxv0_4 = _mm256_permute_ps(amaxv0_3, 0xb1); + const __m256 amaxv0_5 = _mm256_max_ps(amaxv0_3, amaxv0_4); + + amax = _mm256_cvtss_f32(amaxv0_5); + + //printf("amax = %f\n", amax); + + const float d = amax / ((1 << (QB - 1)) - 1); + const float id = d ? 1.0/d : 0.0; + + pd[i] = GGML_FP32_TO_GQ(d); + + const __m256 idv = _mm256_set1_ps(id); + + for (int l = 0; l < QK/8; l++) { + __m256 v = _mm256_mul_ps(srcv[l], idv); +#if 0 + v[0] += frand(); v[1] += frand(); v[2] += frand(); v[3] += frand(); + v[4] += frand(); v[5] += frand(); v[6] += frand(); v[7] += frand(); +#endif + + // convert to int8 + __m256i vi = _mm256_cvtps_epi32(v); + vi = _mm256_add_epi32(vi, _mm256_set1_epi32(8)); + + int32_t vi_0 = _mm256_extract_epi32(vi, 0); + int32_t vi_1 = _mm256_extract_epi32(vi, 1); + int32_t vi_2 = _mm256_extract_epi32(vi, 2); + int32_t vi_3 = _mm256_extract_epi32(vi, 3); + + int32_t vi_4 = _mm256_extract_epi32(vi, 4); + int32_t vi_5 = _mm256_extract_epi32(vi, 5); + int32_t vi_6 = _mm256_extract_epi32(vi, 6); + int32_t vi_7 = _mm256_extract_epi32(vi, 7); + + // convert to 4-bit, 2 consecutive packed into 1 byte + pp[4*l + 0] = vi_0 | (vi_1 << 4); + pp[4*l + 1] = vi_2 | (vi_3 << 4); + pp[4*l + 2] = vi_4 | (vi_5 << 4); + pp[4*l + 3] = vi_6 | (vi_7 << 4); + + //printf("vi: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7); + ////printf("v : %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); + + assert(vi_0 >= 0 && vi_0 < 16); + assert(vi_1 >= 0 && vi_1 < 16); + assert(vi_2 >= 0 && vi_2 < 16); + assert(vi_3 >= 0 && vi_3 < 16); + + assert(vi_4 >= 0 && vi_4 < 16); + assert(vi_5 >= 0 && vi_5 < 16); + assert(vi_6 >= 0 && vi_6 < 16); + assert(vi_7 >= 0 && vi_7 < 16); + } + + memcpy(pb + i*QK/2, pp, sizeof(pp)); + } +#elif defined(__ARM_NEON) && 0 + { + // TODO + } +#else + { + for (int l = 0; l < QK; l++) { + const float v = src[i*QK + l]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << (QB - 1)) - 1); + const float id = d ? 
1.0/d : 0.0; + + pd[i] = GGML_FP32_TO_GQ(d); + + for (int l = 0; l < QK; l++) { + const float v = src[i*QK + l]*id; + const int8_t vi = ((int8_t) (round(v))) + 8; + assert(vi >= 0 && vi < 16); + pp[l/2] |= (vi & 0xf) << (4*(l & 1)); + } + + memcpy(pb + i*QK/2, pp, sizeof(pp)); + } +#endif + //printf("min %f max %f\n", min, max); + } +} + +// reimplementation of quantize_5 using quantize_5_row +void quantize_5(const float * restrict src, char * restrict dst, int n, int k) { + assert(k % QK == 0); + + for (int j = 0; j < n; j++) { + quantize_5_row(src + j*k, dst, k); + dst = (char *) dst + quantize_5_row_size(k); + } +} + +void vec_dot_gq_5(const int n, float * restrict s, const void * restrict x, const void * restrict y) { + const int nb = quantize_5_blocks_per_row(n); + + const gq_scale_t * restrict pd0 = (const gq_scale_t *) x; + const gq_scale_t * restrict pd1 = (const gq_scale_t *) y; + + const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb); + const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb); + + float sumf = 0.0; + +#if 0 + // scalar + for (int i = 0; i < nb; i++) { + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + for (int j = 0; j < QK/2; j++) { + const uint8_t v0 = p0[j]; + const uint8_t v1 = p1[j]; + + const float f0 = d0*((int8_t) (v0 & 0xf) - 8); + const float f1 = d0*((int8_t) (v0 >> 4) - 8); + + const float f2 = d1*((int8_t) (v1 & 0xf) - 8); + const float f3 = d1*((int8_t) (v1 >> 4) - 8); + + sumf += f0*f2 + f1*f3; + } + } +#else +#if defined(__AVX2__) +#if QK == 64 && 1 + __m256 sum11 = _mm256_setzero_ps(); + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + const __m256 d0v = _mm256_set1_ps(d0); + const __m256 d1v = _mm256_set1_ps(d1); + + const __m256 d0d1v = _mm256_mul_ps(d0v, d1v); + + const __m256i m4b = _mm256_set1_epi8(0xf); + + // 64 x 4 + const __m256i v0 = _mm256_loadu_si256((__m256i *) p0); + const __m256i v1 = _mm256_loadu_si256((__m256i *) p1); + + // 32 x 8 + __m256i v0l = _mm256_and_si256(v0, m4b); + __m256i v1l = _mm256_and_si256(v1, m4b); + + __m256i v0h = _mm256_and_si256(_mm256_srli_epi16(v0, 4), m4b); + __m256i v1h = _mm256_and_si256(_mm256_srli_epi16(v1, 4), m4b); + + // sub 8 + v0l = _mm256_sub_epi8(v0l, _mm256_set1_epi8(8)); + v0h = _mm256_sub_epi8(v0h, _mm256_set1_epi8(8)); + + v1l = _mm256_sub_epi8(v1l, _mm256_set1_epi8(8)); + v1h = _mm256_sub_epi8(v1h, _mm256_set1_epi8(8)); + + // abs + const __m256i v0la = _mm256_sign_epi8(v0l, v0l); + const __m256i v0ha = _mm256_sign_epi8(v0h, v0h); + + // sign + const __m256i v1ls = _mm256_sign_epi8(v1l, v0l); + const __m256i v1hs = _mm256_sign_epi8(v1h, v0h); + + const __m256i pl = _mm256_maddubs_epi16(v0la, v1ls); + const __m256i ph = _mm256_maddubs_epi16(v0ha, v1hs); + + const __m256i p16 = _mm256_add_epi16(ph, pl); + const __m256i p = _mm256_madd_epi16(_mm256_set1_epi16(1), p16); + + sum11 = _mm256_fmadd_ps(d0d1v, _mm256_cvtepi32_ps(p), sum11); + } + + sumf = _mm256_hadd_ps_gg(sum11); +#endif +#elif defined (__ARM_NEON) + float sum11 = 0.0f; + + //float32x4_t sum_0 = vdupq_n_f32(0.0f); + //float32x4_t sum_1 = vdupq_n_f32(0.0f); + + //float16x8_t sum_0 = vdupq_n_f16(0.0f); + //float16x8_t sum_1 = vdupq_n_f16(0.0f); + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + 
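+        // Decoding note: with delta-only 4-bit quantization a stored code q maps
+        // back to d*(q - 8), so each block contributes d0*d1 * sum_j (q0_j - 8)*(q1_j - 8)
+        // to the dot product; the NEON code below evaluates that integer sum and
+        // applies the two scales once per block.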
const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + //float32x4_t d0d1v = vdupq_n_f32(d0*d1); + //float16x8_t d0d1v = vdupq_n_f16(d0*d1); + + const uint8_t * restrict p0 = pb0 + i*QK/2; + const uint8_t * restrict p1 = pb1 + i*QK/2; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(p0); + const uint8x16_t v0_1 = vld1q_u8(p0 + 16); + const uint8x16_t v1_0 = vld1q_u8(p1); + const uint8x16_t v1_1 = vld1q_u8(p1 + 16); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b)); + const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b)); + const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b)); + + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4)); + const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b); + const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b); + + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b); + const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b); + + // dot product into int16x8_t + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls)); + + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); + + const int16x8_t pl0 = vaddq_s16(pl0l, pl0h); + const int16x8_t pl1 = vaddq_s16(pl1l, pl1h); + const int16x8_t ph0 = vaddq_s16(ph0l, ph0h); + const int16x8_t ph1 = vaddq_s16(ph1l, ph1h); + + const int16x8_t pl = vaddq_s16(pl0, pl1); + const int16x8_t ph = vaddq_s16(ph0, ph1); + + //const int8x16_t pl0 = vmulq_s8(v0_0ls, v1_0ls); + //const int8x16_t pl1 = vmulq_s8(v0_1ls, v1_1ls); + //const int8x16_t ph0 = vmulq_s8(v0_0hs, v1_0hs); + //const int8x16_t ph1 = vmulq_s8(v0_1hs, v1_1hs); + + //const int16x8_t pll = vaddl_s8(vget_low_s8(pl0), vget_low_s8(pl1)); + //const int16x8_t plh = vaddl_s8(vget_high_s8(pl0), vget_high_s8(pl1)); + //const int16x8_t phl = vaddl_s8(vget_low_s8(ph0), vget_low_s8(ph1)); + //const int16x8_t phh = vaddl_s8(vget_high_s8(ph0), vget_high_s8(ph1)); + + //const int16x8_t pl = vaddq_s16(pll, plh); + //const int16x8_t ph = vaddq_s16(phl, phh); + + const int16x8_t p = vaddq_s16(pl, ph); + + // convert to float + //const float32x4_t pf0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (p))); + //const float32x4_t pf1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(p))); + + // scalar + sum11 += d0*d1*vaddvq_s16(p); + //sum11 += d0*d1*(vaddvq_s16(pl) + vaddvq_s16(ph)); + //sum11 += d0*d1*vaddvq_s16(vaddq_s16(pl, ph)); + //sum11 += d0*d1*(vaddvq_s8(pl0) + vaddvq_s8(pl1) + vaddvq_s8(ph0) + vaddvq_s8(ph1)); + //sum11 += d0*d1*(vaddvq_s16(pll) + vaddvq_s16(plh) + vaddvq_s16(phl) + vaddvq_s16(phh)); + + 
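+        // The int16 accumulation above is safe for QK == 64: each (q - 8) lies in
+        // [-8, 7], so a single product is at most 64 in magnitude and every lane of
+        // p sums only 8 such products (|lane| <= 512), far below the int16 limit.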
//sum_0 = vfmaq_f16(sum_0, d0d1v, vcvtq_f16_s16(p)); + //sum_0 = vfmaq_f16(sum_0, d0d1v, vcvtq_f16_s16(pl)); + //sum_1 = vfmaq_f16(sum_1, d0d1v, vcvtq_f16_s16(ph)); + + // vectorize + //sum_0 = vmlaq_f32(sum_0, d0d1v, pf0); + //sum_1 = vmlaq_f32(sum_1, d0d1v, pf1); + } + + sumf = sum11; + //sumf = vaddvq_f32(sum_0) + vaddvq_f32(sum_1); + //sumf = sum_0[0] + sum_0[1] + sum_0[2] + sum_0[3] + sum_0[4] + sum_0[5] + sum_0[6] + sum_0[7]; + //sum_0 = vaddq_f16(sum_0, sum_1); + //sumf = sum_0[0] + sum_0[1] + sum_0[2] + sum_0[3] + sum_0[4] + sum_0[5] + sum_0[6] + sum_0[7]; +#endif +#endif + + *s = sumf; +} + +// use vec_dot_gq_5 to compute the dot product of two rows +void mul_mat_gq_5( + const void * src0, + const void * src1, // transposed + float * dst, + int m, int n, int k) { + assert(k % QK == 0); + + const int nb = quantize_5_blocks_per_row(k); + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + vec_dot_gq_5(k, dst + ir1, src0, src1); + src1 = (const char *) src1 + quantize_5_row_size(k); + } + src0 = (const char *) src0 + quantize_5_row_size(k); + src1 = (const char *) src1 - n*quantize_5_row_size(k); + + dst = (float *) dst + n; + } +} + +// +// method 6 +// same as 5 but with 32 element blocks +// + +static inline int quantize_6_blocks_per_row(int k) { + return k/32; +} + +static inline int quantize_6_row_size(int k) { + const int nb = quantize_6_blocks_per_row(k); + + return nb*(sizeof(gq_scale_t) + 16); +} + +void quantize_6_row(const float * restrict src, void * restrict dst, int k) { + assert(k % 32 == 0); + assert(QB == 4); + + const int nb = quantize_6_blocks_per_row(k); + + gq_scale_t * restrict pd = (gq_scale_t *) (dst); + uint8_t * restrict pb = (uint8_t *) (pd + nb); + + uint8_t pp[16]; + + for (int i = 0; i < nb; i++) { + memset(pp, 0, sizeof(pp)); + + float amax = 0.0f; // absolute max + +#if defined(__AVX2__) + { + enum { QK8 = 4 }; + + __m256 srcv [QK8]; + __m256 asrcv[QK8]; + __m256 amaxv[QK8]; + + for (int l = 0; l < QK8; l++) { + srcv[l] = _mm256_loadu_ps(src + i*32 + 8*l); + } + + for (int l = 0; l < QK8; l++) { + asrcv[l] = _mm256_and_ps(srcv[l], _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); + } + + for (int l = 0; l < QK8/2; l++) { + amaxv[2*l] = _mm256_max_ps(asrcv[2*l], asrcv[2*l+1]); + } + + for (int l = 0; l < QK8/4; l++) { + amaxv[4*l] = _mm256_max_ps(amaxv[4*l], amaxv[4*l+2]); + } + + const __m256 amaxv0_0 = _mm256_permute2f128_ps(amaxv[0], amaxv[0], 3); + const __m256 amaxv0_1 = _mm256_max_ps(amaxv[0], amaxv0_0); + const __m256 amaxv0_2 = _mm256_permute_ps(amaxv0_1, 0x4e); + const __m256 amaxv0_3 = _mm256_max_ps(amaxv0_1, amaxv0_2); + const __m256 amaxv0_4 = _mm256_permute_ps(amaxv0_3, 0xb1); + const __m256 amaxv0_5 = _mm256_max_ps(amaxv0_3, amaxv0_4); + + amax = _mm256_cvtss_f32(amaxv0_5); + + const float d = amax / ((1 << (QB - 1)) - 1); + const float id = d ? 
1.0/d : 0.0; + + pd[i] = GGML_FP32_TO_GQ(d); + + const __m256 idv = _mm256_set1_ps(id); + + for (int l = 0; l < 4; l++) { + __m256 v = _mm256_mul_ps(srcv[l], idv); + + // convert to int8 + __m256i vi = _mm256_cvtps_epi32(v); + vi = _mm256_add_epi32(vi, _mm256_set1_epi32(8)); + + int32_t vi_0 = _mm256_extract_epi32(vi, 0); + int32_t vi_1 = _mm256_extract_epi32(vi, 1); + int32_t vi_2 = _mm256_extract_epi32(vi, 2); + int32_t vi_3 = _mm256_extract_epi32(vi, 3); + + int32_t vi_4 = _mm256_extract_epi32(vi, 4); + int32_t vi_5 = _mm256_extract_epi32(vi, 5); + int32_t vi_6 = _mm256_extract_epi32(vi, 6); + int32_t vi_7 = _mm256_extract_epi32(vi, 7); + + // convert to 4-bit, 2 consecutive packed into 1 byte + pp[4*l + 0] = vi_0 | (vi_1 << 4); + pp[4*l + 1] = vi_2 | (vi_3 << 4); + pp[4*l + 2] = vi_4 | (vi_5 << 4); + pp[4*l + 3] = vi_6 | (vi_7 << 4); + + assert(vi_0 >= 0 && vi_0 < 16); + assert(vi_1 >= 0 && vi_1 < 16); + assert(vi_2 >= 0 && vi_2 < 16); + assert(vi_3 >= 0 && vi_3 < 16); + + assert(vi_4 >= 0 && vi_4 < 16); + assert(vi_5 >= 0 && vi_5 < 16); + assert(vi_6 >= 0 && vi_6 < 16); + assert(vi_7 >= 0 && vi_7 < 16); + } + + memcpy(pb + i*16, pp, sizeof(pp)); + } +#elif defined(__ARM_NEON) + { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(src + i*32 + 4*l); + for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); + + for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); + for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); + for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); + + amax = MAX( + MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)), + MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3))); + + const float d = amax / ((1 << 3) - 1); + const float id = d ? 1.0/d : 0.0; + + pd[i] = GGML_FP32_TO_GQ(d); + + for (int l = 0; l < 8; l++) { + const float32x4_t v = vmulq_n_f32(srcv[l], id); + const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f)); + const int32x4_t vi = vcvtq_s32_f32(vf); + + pp[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4); + pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4); + } + + memcpy(pb + i*16, pp, sizeof(pp)); + } +#else + { + for (int l = 0; l < 32; l++) { + const float v = src[i*32 + l]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << (QB - 1)) - 1); + const float id = d ? 
1.0/d : 0.0; + + pd[i] = GGML_FP32_TO_GQ(d); + + for (int l = 0; l < 32; l++) { + const float v = src[i*32 + l]*id; + const int8_t vi = ((int8_t) (round(v))) + 8; + assert(vi >= 0 && vi < 16); + pp[l/2] |= (vi & 0xf) << (4*(l & 1)); + } + + memcpy(pb + i*16, pp, sizeof(pp)); + } +#endif + //printf("amax = %f\n", amax); + } +} + +// reimplementation of quantize__6using quantize_6_row +void quantize_6(const float * restrict src, char * restrict dst, int n, int k) { + assert(k % 32 == 0); + + for (int j = 0; j < n; j++) { + quantize_6_row(src + j*k, dst, k); + dst = (char *) dst + quantize_6_row_size(k); + } +} + +void vec_dot_gq_6(const int n, float * restrict s, const void * restrict x, const void * restrict y) { + const int nb = quantize_6_blocks_per_row(n); + + const gq_scale_t * restrict pd0 = (const gq_scale_t *) x; + const gq_scale_t * restrict pd1 = (const gq_scale_t *) y; + + const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb); + const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb); + + float sumf = 0.0; + +#if 0 + // scalar + for (int i = 0; i < nb; i++) { + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + const uint8_t * restrict p0 = pb0 + i*16; + const uint8_t * restrict p1 = pb1 + i*16; + + for (int j = 0; j < 16; j++) { + const uint8_t v0 = p0[j]; + const uint8_t v1 = p1[j]; + + const float f0 = d0*((int8_t) (v0 & 0xf) - 8); + const float f1 = d0*((int8_t) (v0 >> 4) - 8); + + const float f2 = d1*((int8_t) (v1 & 0xf) - 8); + const float f3 = d1*((int8_t) (v1 >> 4) - 8); + + sumf += f0*f2 + f1*f3; + } + } +#else +#if defined(__AVX2__) + // TODO +#elif defined (__ARM_NEON) +#if 0 + float sum0 = 0.0f; + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_GQ_TO_FP32(pd0[i]); + const float d1 = GGML_GQ_TO_FP32(pd1[i]); + + //float32x4_t d0d1v = vdupq_n_f32(d0*d1); + //float16x8_t d0d1v = vdupq_n_f16(d0*d1); + + const uint8_t * restrict p0 = pb0 + i*16; + const uint8_t * restrict p1 = pb1 + i*16; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(p0); + const uint8x16_t v1_0 = vld1q_u8(p1); + + // 4-bit -> 8-bit + const uint8x16_t v0_0l = vandq_u8(v0_0, m4b); + const uint8x16_t v1_0l = vandq_u8(v1_0, m4b); + + const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4); + const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b); + + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b); + + // dot product into int16x8_t + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); + + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs)); + + const int16x8_t pl = vaddq_s16(pl0l, pl0h); + const int16x8_t ph = vaddq_s16(ph0l, ph0h); + + const int16x8_t p = vaddq_s16(pl, ph); + + // scalar + sum0 += d0*d1*vaddvq_s16(p); + } + + sumf = sum0; +#elif 1 // this is a bit faster than the above + float sum0 = 0.0f; + float sum1 = 0.0f; + + for (int i = 0; i < nb; i += 2) { + const float d0_0 = GGML_GQ_TO_FP32(pd0[i + 0]); + const float d1_0 = GGML_GQ_TO_FP32(pd1[i + 0]); + const float d0_1 = GGML_GQ_TO_FP32(pd0[i + 1]); + const float d1_1 = GGML_GQ_TO_FP32(pd1[i + 1]); + + const uint8_t * restrict p0 = pb0 + i*16; + const uint8_t * restrict p1 = pb1 
+ i*16; + + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(p0); + const uint8x16_t v0_1 = vld1q_u8(p0 + 16); + const uint8x16_t v1_0 = vld1q_u8(p1); + const uint8x16_t v1_1 = vld1q_u8(p1 + 16); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b)); + const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b)); + + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4)); + + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b)); + const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b)); + + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b); + + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b); + + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b); + + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b); + + // dot product into int16x8_t + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); + + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls)); + + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); + + const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h); + const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h); + + const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h); + const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h); + + const int16x8_t p_0 = vaddq_s16(pl_0, ph_0); + const int16x8_t p_1 = vaddq_s16(pl_1, ph_1); + + // scalar + sum0 += d0_0*d1_0*vaddvq_s16(p_0); + sum1 += d0_1*d1_1*vaddvq_s16(p_1); + } + + sumf = sum0 + sum1; +#endif +#endif +#endif + + *s = sumf; +} + +// use vec_dot_gq_6 to compute the dot product of two rows +void mul_mat_gq_6( + const void * src0, + const void * src1, // transposed + float * dst, + int m, int n, int k) { + assert(k % 32 == 0); + + for (int ir0 = 0; ir0 < m; ir0++) { + for (int ir1 = 0; ir1 < n; ir1++) { + vec_dot_gq_6(k, dst + ir1, src0, src1); + src1 = (const char *) src1 + quantize_6_row_size(k); + } + src0 = (const char *) src0 + quantize_6_row_size(k); + src1 = (const char *) src1 - n*quantize_6_row_size(k); + + dst = (float *) dst + n; + } +} + +int main(int argc, const char ** argv) { + assert(sizeof(gq_quant_t)*8 == gq_t_bits); + ggml_time_init(); + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + int method = 0; + if (argc > 1) { + method = atoi(argv[1]); + } + + float * src0 = malloc(sizeof(float)*M*K); + float * src1 = malloc(sizeof(float)*N*K); + float * dst = malloc(sizeof(float)*M*N); + + // allocate aligned memory + //float * src0 = (float *)aligned_alloc(32, sizeof(float)*M*K); + //float * src1 = (float *)aligned_alloc(32, sizeof(float)*N*K); + //float * 
dst = (float *)aligned_alloc(32, sizeof(float)*M*N); + + for (int i = 0; i < M*K; i++) { + src0[i] = 0.8 - rand() / (float)RAND_MAX; + /*src0[i] = rand() / (float)RAND_MAX;*/ + /*src0[i] = i % 2;*/ + } + + for (int i = 0; i < N*K; i++) { + src1[i] = 0.8 - rand() / (float)RAND_MAX; + /*src1[i] = rand() / (float)RAND_MAX;*/ + /*src1[i] = i % 3;*/ + } + + void * src0_gq = NULL; + void * src1_gq = NULL; + + size_t sizegq = 0; + + { + if (method == 1) { + src0_gq = calloc(1, quantize_1_row_size(K)*M); + src1_gq = calloc(1, quantize_1_row_size(K)*N); + + sizegq = quantize_1_row_size(K)*M + quantize_1_row_size(K)*N; + } + + if (method == 2) { + src0_gq = calloc(1, quantize_2_row_size(K)*M); + src1_gq = calloc(1, quantize_2_row_size(K)*N); + + sizegq = quantize_2_row_size(K)*M + quantize_2_row_size(K)*N; + } + + if (method == 3) { + src0_gq = calloc(1, quantize_3_row_size(K)*M); + src1_gq = calloc(1, quantize_3_row_size(K)*N); + + sizegq = quantize_3_row_size(K)*M + quantize_3_row_size(K)*N; + } + + if (method == 4) { + src0_gq = calloc(1, quantize_4_row_size(K)*M); + src1_gq = calloc(1, quantize_4_row_size(K)*N); + + sizegq = quantize_4_row_size(K)*M + quantize_4_row_size(K)*N; + } + + if (method == 5) { + src0_gq = calloc(1, quantize_5_row_size(K)*M); + src1_gq = calloc(1, quantize_5_row_size(K)*N); + + sizegq = quantize_5_row_size(K)*M + quantize_5_row_size(K)*N; + } + + if (method == 6) { + src0_gq = calloc(1, quantize_6_row_size(K)*M); + src1_gq = calloc(1, quantize_6_row_size(K)*N); + + sizegq = quantize_6_row_size(K)*M + quantize_6_row_size(K)*N; + } + } + + const size_t sizef16 = sizeof(ggml_fp16_t)*M*K + sizeof(ggml_fp16_t)*N*K; + + printf("compression: %f\n", (float)sizegq/sizef16); + + // convert fp32 -> gq + { + const int64_t t_start = ggml_time_us(); + + if (method == 1) { + quantize_1(src0, src0_gq, M, K); + quantize_1(src1, src1_gq, N, K); + } + + if (method == 2) { + quantize_2(src0, src0_gq, M, K); + quantize_2(src1, src1_gq, N, K); + } + + if (method == 3) { + quantize_3(src0, src0_gq, M, K); + quantize_3(src1, src1_gq, N, K); + } + + if (method == 4) { + quantize_4(src0, src0_gq, M, K); + quantize_4(src1, src1_gq, N, K); + } + + if (method == 5) { + quantize_5(src0, src0_gq, M, K); + quantize_5(src1, src1_gq, N, K); + } + + if (method == 6) { + quantize_6(src0, src0_gq, M, K); + quantize_6(src1, src1_gq, N, K); + } + + const int64_t t_end = ggml_time_us(); + printf("convert time: %f ms / method = %d\n", (t_end - t_start) / 1000.0, method); + } + + for (int i = 0; i < 16; ++i) { + printf("%f %f\n", src0[i], src1[i]); + } + + const int nIter = 1; + + const int64_t start = ggml_cycles(); + const int64_t start_us = ggml_time_us(); + + double iM = 1.0/M; + double sum = 0.0f; + for (int i = 0; i < nIter; i++) { + if (method == 0) { + mul_mat_f32_naive(src0, src1, dst, M, N, K); + } + + if (method == 1) { + mul_mat_gq_1(src0_gq, src1_gq, dst, M, N, K); + } + + if (method == 2) { + mul_mat_gq_2(src0_gq, src1_gq, dst, M, N, K); + } + + if (method == 3) { + mul_mat_gq_3(src0_gq, src1_gq, dst, M, N, K); + } + + if (method == 4) { + mul_mat_gq_4(src0_gq, src1_gq, dst, M, N, K); + } + + if (method == 5) { + mul_mat_gq_5(src0_gq, src1_gq, dst, M, N, K); + } + + if (method == 6) { + mul_mat_gq_6(src0_gq, src1_gq, dst, M, N, K); + } + } + + for (int i = 0; i < N; i++) { + sum += dst[i]*iM; + } + + { + const int64_t end = ggml_cycles(); + const int64_t end_us = ggml_time_us(); + printf("%s: elapsed ticks: %" PRIu64 "\n", __func__, end - start); + printf("%s: elapsed us: %d / %f ms\n", __func__, 
(int)(end_us - start_us), (end_us - start_us) / 1000.0 / nIter); + } + +#if 0 + // print src0 + printf("src0:\n"); + for (int i = 0; i < M; i++) { + for (int j = 0; j < K; j++) { + printf("%4.1f ", src0[i*K+j]); + } + printf("\n"); + } + + // print src1 + printf("src1:\n"); + for (int i = 0; i < N; i++) { + for (int j = 0; j < K; j++) { + printf("%4.1f ", src1[i*K+j]); + } + printf("\n"); + } + + printf("dst:\n"); + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + printf("%4.1f ", dst[i*N+j]); + } + printf("\n"); + } +#endif + + printf("%f\n", sum); + + free(src0); + free(src1); + free(dst); + + if (src0_gq) free(src0_gq); + if (src1_gq) free(src1_gq); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-opt.cpp b/seamless_communication/ggml/tests/test-opt.cpp new file mode 100644 index 0000000..8ab2402 --- /dev/null +++ b/seamless_communication/ggml/tests/test-opt.cpp @@ -0,0 +1,212 @@ +#include "ggml.h" + +#include +#include +#include +#include + +#define MAX_NARGS 2 + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wdouble-promotion" +#endif + +// +// logging +// +#define GGML_DEBUG 0 +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) printf(__VA_ARGS__) + + +float frand(void) { + return (float)rand()/(float)RAND_MAX; +} + +int irand(int n) { + return rand()%n; +} + +void get_random_dims(int64_t * dims, int ndims) { + dims[0] = dims[1] = dims[2] = dims[3] = 1; + + for (int i = 0; i < ndims; i++) { + dims[i] = 1 + irand(4); + } +} + +void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) { + dims[0] = dims[1] = dims[2] = dims[3] = 1; + + for (int i = 0; i < ndims; i++) { + dims[i] = min + irand(max-min); + } +} + + +struct ggml_tensor * get_random_tensor( + struct ggml_context * ctx0, + int ndims, + int64_t ne[], + float fmin, + float fmax) { + struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + } + break; + default: + assert(false); + }; + + return result; +} + +float get_element(const struct ggml_tensor * t, int idx) { + return ((float *)t->data)[idx]; +} + +void set_element(struct ggml_tensor * t, int idx, float value) { + ((float *)t->data)[idx] = value; +} + +int main(void) { + struct ggml_init_params params = { + /* .mem_size = */ 1024*1024*1024, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + + struct 
ggml_context * ctx = ggml_init(params); + + int64_t ne1[4] = {4, 128, 1, 1}; + int64_t ne2[4] = {4, 256, 1, 1};; + int64_t ne3[4] = {128, 256, 1, 1}; + + struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1); + struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1); + ggml_set_param(ctx, a); + ggml_set_param(ctx, b); + + struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1); + + struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b); + struct ggml_tensor * d = ggml_sub(ctx, c, ab); + struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d)); + + struct ggml_cgraph ge = ggml_build_forward(e); + ggml_graph_reset(&ge); + + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); + + const float fe = ggml_get_f32_1d(e, 0); + printf("%s: e = %.4f\n", __func__, fe); + + struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); + + ggml_opt(ctx, opt_params, e); + + ggml_graph_reset(&ge); + + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); + + const float fe_opt = ggml_get_f32_1d(e, 0); + printf("%s: original e = %.4f\n", __func__, fe); + printf("%s: optimized e = %.4f\n", __func__, fe_opt); + + const bool success = (fe_opt <= fe); + assert(success); + + ggml_free(ctx); + return success ? 0 : -1; +} +// int64_t ne1[4] = {4, 128, 1, 1}; +// int64_t ne2[4] = {4, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 25890.9375 +// main: optimized e = 10094.7031 + +// int64_t ne1[4] = {8, 128, 1, 1}; +// int64_t ne2[4] = {8, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 39429.5078 +// main: optimized e = 9275.8936 + +// int64_t ne1[4] = {16, 128, 1, 1}; +// int64_t ne2[4] = {16, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 68371.1328 +// main: optimized e = 7854.4502 + + +// int64_t ne1[4] = {32, 128, 1, 1}; +// int64_t ne2[4] = {32, 256, 1, 1};; +// int64_t ne3[4] = {128, 256, 1, 1}; +// main: original e = 126061.1953 +// main: optimized e = 5451.0166 + +// int64_t ne1[4] = {4, 1024, 1, 1}; +// int64_t ne2[4] = {4, 2048, 1, 1};; +// int64_t ne3[4] = {1024, 2048, 1, 1}; +// main: original e = 1620817.8750 +// main: optimized e = 698387.6875 + +// another run on M1 +// int64_t ne1[4] = {4, 1024, 1, 1}; +// int64_t ne2[4] = {4, 2048, 1, 1};; +// int64_t ne3[4] = {1024, 2048, 1, 1}; +// main: original e = 1629595.6250 +// main: optimized e = 698169.1250 + +// int64_t ne1[4] = {32, 1024, 1, 1}; +// int64_t ne2[4] = {32, 2048, 1, 1};; +// int64_t ne3[4] = {1024, 2048, 1, 1}; +// main: original e = 8146770.5000 +// main: optimized e = 651119.1250 diff --git a/seamless_communication/ggml/tests/test-pool.c b/seamless_communication/ggml/tests/test-pool.c new file mode 100644 index 0000000..cdf00f4 --- /dev/null +++ b/seamless_communication/ggml/tests/test-pool.c @@ -0,0 +1,143 @@ +#include "ggml/ggml.h" + +#include +#include +#include + +struct ggml_context* make_ctx(void) { + struct ggml_init_params params = { + .mem_size = 2 * 1024 * 1024, + }; + + return ggml_init(params); +} + +int main(int argc, const char** argv) { + + float buf_f32[1024]; + for (int i = 0; i < 1024; ++i) { + buf_f32[i] = (float)(i + 1); + } + + // avg pool 1d + { + struct ggml_context * ctx = make_ctx(); + struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2); + memcpy(t->data, buf_f32, ggml_nbytes(t)); + + struct ggml_tensor * t_pooled = ggml_pool_1d(ctx, t, GGML_OP_POOL_AVG, 3, 3, 0); + GGML_ASSERT(t_pooled->ne[0] == 3); + GGML_ASSERT(t_pooled->ne[1] == 2); + GGML_ASSERT(t_pooled->ne[2] == 
1); + + struct ggml_cgraph graph = ggml_build_forward(t_pooled); + + ggml_graph_compute_with_ctx(ctx, &graph, 4); + + const float * output = ggml_get_data_f32(t_pooled); + + GGML_ASSERT(output[0] == 2); + GGML_ASSERT(output[1] == 5); + GGML_ASSERT(output[2] == 8); + GGML_ASSERT(output[3] == 12); + GGML_ASSERT(output[4] == 15); + GGML_ASSERT(output[5] == 18); + + ggml_free(ctx); + } + + // max pool 1d + { + struct ggml_context * ctx = make_ctx(); + struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2); + memcpy(t->data, buf_f32, ggml_nbytes(t)); + + struct ggml_tensor * t_pooled = ggml_pool_1d(ctx, t, GGML_OP_POOL_MAX, 3, 3, 0); + GGML_ASSERT(t_pooled->ne[0] == 3); + GGML_ASSERT(t_pooled->ne[1] == 2); + GGML_ASSERT(t_pooled->ne[2] == 1); + + struct ggml_cgraph graph = ggml_build_forward(t_pooled); + + ggml_graph_compute_with_ctx(ctx, &graph, 4); + + const float * output = ggml_get_data_f32(t_pooled); + GGML_ASSERT(output[0] == 3); + GGML_ASSERT(output[1] == 6); + GGML_ASSERT(output[2] == 9); + GGML_ASSERT(output[3] == 13); + GGML_ASSERT(output[4] == 16); + GGML_ASSERT(output[5] == 19); + + ggml_free(ctx); + } + + // avg pool 2d + { + struct ggml_context * ctx = make_ctx(); + struct ggml_tensor * t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 10, 10, 2); + memcpy(t->data, buf_f32, ggml_nbytes(t)); + + struct ggml_tensor * t_pooled = ggml_pool_2d(ctx, t, GGML_OP_POOL_AVG, 3, 4, 3, 4, 0, 0); + GGML_ASSERT(t_pooled->ne[0] == 3); + GGML_ASSERT(t_pooled->ne[1] == 2); + GGML_ASSERT(t_pooled->ne[2] == 2); + GGML_ASSERT(t_pooled->ne[3] == 1); + + struct ggml_cgraph graph = ggml_build_forward(t_pooled); + + ggml_graph_compute_with_ctx(ctx, &graph, 4); + + const float * output = ggml_get_data_f32(t_pooled); + GGML_ASSERT(output[0] == 17); + GGML_ASSERT(output[1] == 20); + GGML_ASSERT(output[2] == 23); + GGML_ASSERT(output[3] == 57); + GGML_ASSERT(output[4] == 60); + GGML_ASSERT(output[5] == 63); + GGML_ASSERT(output[6] == 117); + GGML_ASSERT(output[7] == 120); + GGML_ASSERT(output[8] == 123); + GGML_ASSERT(output[9] == 157); + GGML_ASSERT(output[10] == 160); + GGML_ASSERT(output[11] == 163); + + + ggml_free(ctx); + } + + // max pool 2d + { + struct ggml_context * ctx = make_ctx(); + struct ggml_tensor * t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 10, 10, 2); + memcpy(t->data, buf_f32, ggml_nbytes(t)); + + struct ggml_tensor * t_pooled = ggml_pool_2d(ctx, t, GGML_OP_POOL_MAX, 3, 4, 3, 4, 0, 0); + GGML_ASSERT(t_pooled->ne[0] == 3); + GGML_ASSERT(t_pooled->ne[1] == 2); + GGML_ASSERT(t_pooled->ne[2] == 2); + GGML_ASSERT(t_pooled->ne[3] == 1); + + struct ggml_cgraph graph = ggml_build_forward(t_pooled); + + ggml_graph_compute_with_ctx(ctx, &graph, 4); + + const float * output = ggml_get_data_f32(t_pooled); + GGML_ASSERT(output[0] == 33); + GGML_ASSERT(output[1] == 36); + GGML_ASSERT(output[2] == 39); + GGML_ASSERT(output[3] == 73); + GGML_ASSERT(output[4] == 76); + GGML_ASSERT(output[5] == 79); + GGML_ASSERT(output[6] == 133); + GGML_ASSERT(output[7] == 136); + GGML_ASSERT(output[8] == 139); + GGML_ASSERT(output[9] == 173); + GGML_ASSERT(output[10] == 176); + GGML_ASSERT(output[11] == 179); + + ggml_free(ctx); + } + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-quantize-fns.cpp b/seamless_communication/ggml/tests/test-quantize-fns.cpp new file mode 100644 index 0000000..8d3c162 --- /dev/null +++ b/seamless_communication/ggml/tests/test-quantize-fns.cpp @@ -0,0 +1,164 @@ +// Unit tests for quantization specific functions - quantize, dequantize and dot product + +#include 
"ggml.h" + +#undef NDEBUG +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; +const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; +const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; +const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; +const float MAX_DOT_PRODUCT_ERROR = 0.02f; + +const char* RESULT_STR[] = {"ok", "FAILED"}; + + +// Generate synthetic data +void generate_data(float offset, size_t n, float * dst) { + for (size_t i = 0; i < n; i++) { + dst[i] = 0.1 + 2*cosf(i + offset); + } +} + +// Calculate RMSE between two float arrays +float array_rmse(const float * a1, const float * a2, size_t n) { + double sum = 0; + for (size_t i = 0; i < n; i++) { + double diff = a1[i] - a2[i]; + sum += diff * diff; + } + return sqrtf(sum) / n; +} + +// Total quantization error on test data +float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { + std::vector tmp_q(2*test_size); + std::vector tmp_out(test_size); + + qfns.from_float(test_data, tmp_q.data(), test_size); + qfns.to_float(tmp_q.data(), tmp_out.data(), test_size); + return array_rmse(test_data, tmp_out.data(), test_size); +} + +// Total quantization error on test data +float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { + std::vector tmp_q(2*test_size); + std::vector tmp_out(test_size); + std::vector tmp_out_ref(test_size); + + qfns.from_float(test_data, tmp_q.data(), test_size); + qfns.to_float(tmp_q.data(), tmp_out.data(), test_size); + + qfns.from_float_reference(test_data, tmp_q.data(), test_size); + qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size); + + return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size); +} + +float dot_product(const float * a1, const float * a2, size_t test_size) { + double sum = 0; + for (size_t i = 0; i < test_size; i++) { + sum += a1[i] * a2[i]; + } + return sum; +} + +// Total dot product error +float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) { + std::vector tmp_q1(2*test_size); + std::vector tmp_q2(2*test_size); + + auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); + + qfns.from_float(test_data1, tmp_q1.data(), test_size); + vdot.from_float(test_data2, tmp_q2.data(), test_size); + + float result = INFINITY; + qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data()); + + const float dot_ref = dot_product(test_data1, test_data2, test_size); + + return fabsf(result - dot_ref) / test_size; +} + +int main(int argc, char * argv[]) { + bool verbose = false; + const size_t test_size = 32 * 128; + + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-v") { + verbose = true; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + return 1; + } + } + + std::vector test_data(test_size); + std::vector test_data2(test_size); + + generate_data(0.0, test_data.size(), test_data.data()); + generate_data(1.0, test_data2.size(), test_data2.data()); + + // Initialize GGML, ensures float conversion tables are initialized + struct ggml_init_params ggml_params = { + /* .mem_size = */ 1*1024, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ true, + }; + struct ggml_context * ctx = ggml_init(ggml_params); + + int num_failed = 0; + bool failed = false; + + for (int i = 0; i < GGML_TYPE_COUNT; i++) { + ggml_type 
type = (ggml_type) i; + ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + + if (qfns.from_float && qfns.to_float) { + const float total_error = total_quantization_error(qfns, test_size, test_data.data()); + const float max_quantization_error = + type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : + type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : MAX_QUANTIZATION_TOTAL_ERROR; + failed = !(total_error < max_quantization_error); + num_failed += failed; + if (failed || verbose) { + printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error); + } + + const float reference_error = reference_quantization_error(qfns, test_size, test_data.data()); + failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR); + num_failed += failed; + if (failed || verbose) { + printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error); + } + + const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data()); + failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR); + num_failed += failed; + if (failed || verbose) { + printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error); + } + } + } + + if (num_failed || verbose) { + printf("%d tests failed\n", num_failed); + } + + ggml_free(ctx); + + return num_failed > 0; +} diff --git a/seamless_communication/ggml/tests/test-quantize-perf.cpp b/seamless_communication/ggml/tests/test-quantize-perf.cpp new file mode 100644 index 0000000..cbea7d4 --- /dev/null +++ b/seamless_communication/ggml/tests/test-quantize-perf.cpp @@ -0,0 +1,362 @@ +// Benchmark quantization specific functions on synthetic data + +#include "ggml.h" + +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#define MAX_ALIGNMENT 64 +#define QK 32 +#define WARMUP 5 +#define ITERATIONS 10 +#define MAX_ITERATIONS 100000000 + +#define L1_SIZE 32*128 +#define L2_SIZE 32*2048 +#define L3_SIZE 32*20480 +#define MEM_SIZE 32*2048000 + +struct quantize_perf_params { + std::vector include_types; + std::vector test_sizes; + size_t alignment_offset = 0; + bool op_quantize_row_q_reference = false; + bool op_quantize_row_q = false; + bool op_dequantize_row_q = false; + bool op_quantize_row_q_dot = false; + bool op_vec_dot_q = false; + int64_t iterations = ITERATIONS; +}; + +#if defined(__x86_64__) || defined(__i386__) + +#include +inline int64_t cpu_cycles() { +// Rough way to detect new-ish CPUs +#ifdef __POPCNT__ + unsigned int dummy; + return __rdtscp(&dummy); +#else + return __rdtsc(); +#endif +} + +#else + +#define cpu_cycles() 0 + +#endif + + +// Generate synthetic data +void generate_data(float offset, size_t n, float * dst) { + for (size_t i = 0; i < n; i++) { + dst[i] = 0.1 + 2*cosf(i + offset); + } +} + +float gigabytes_per_second(size_t bytes, int64_t usecs) { + return bytes / (float) usecs * 1000000 / (1024*1024*1024); +} + +void * align_with_offset(void * ptr, int offset) { + size_t dummy_size = MAX_ALIGNMENT * 4; + return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset; +} + +void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function & function) { + int64_t min_time_us = INT64_MAX; + int64_t total_time_us = 0; + int64_t min_time_cycles = INT64_MAX; + int64_t total_time_cycles = 0; + + 
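+    // Timing methodology: a few untimed warmup calls let caches and CPU frequency
+    // settle, then each measured iteration records wall-clock microseconds
+    // (ggml_time_us) and TSC cycles (cpu_cycles), keeping the minimum and the
+    // running total for the per-value and throughput figures printed below.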
for (int i = 0; i < WARMUP; i++) { + function(); + } + + + for (int i = 0; i < iterations; i++) { + const int64_t start_time = ggml_time_us(); + const int64_t start_cycles = cpu_cycles(); + + function(); + + const int64_t end_cycles = cpu_cycles(); + const int64_t end_time = ggml_time_us(); + + total_time_cycles += end_cycles - start_cycles; + min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles); + total_time_us += end_time - start_time; + min_time_us = std::min(min_time_us, end_time - start_time); + } + + printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size); + printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations)); + printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * iterations, total_time_us)); + printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us)); +} + +void usage(char * argv[]) { + printf("Benchmark quantization specific functions on synthetic data\n"); + printf("\n"); + printf("usage: %s [options]\n", argv[0]); + printf("\n"); + printf("options: (default)\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE); + printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE); + printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE); + printf(" --op OP set test opration as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n"); + printf(" quantize_row_q_dot, vec_dot_q (all)\n"); + printf(" --type TYPE set test type as"); + for (int i = 0; i < GGML_TYPE_COUNT; i++) { + ggml_type type = (ggml_type) i; + ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + if (ggml_type_name(type) != NULL) { + if (qfns.from_float && qfns.to_float) { + printf(" %s", ggml_type_name(type)); + } + } + } + printf(" (all)\n"); + printf(" --alignment-offset OFFSET\n"); + printf(" set alignment offset as OFFSET (0)\n"); + printf(" -i NUM, --iterations NUM\n"); + printf(" set test iteration number (%d)\n", ITERATIONS); +} + +int main(int argc, char * argv[]) { + quantize_perf_params params {}; + + // read command line + + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "--size") { + if (++i >= argc) { + invalid_param = true; + break; + } + size_t size = std::stoi(argv[i]); + if (size % 32 != 0) { + fprintf(stderr, "error: size %zu not divisible by 32\n", size); + invalid_param = true; + break; + } + params.test_sizes.push_back(size); + } else if (arg == "-3") { + // quick select sizes that probably fit in CPU caches + params.test_sizes.push_back(L1_SIZE); + params.test_sizes.push_back(L2_SIZE); + params.test_sizes.push_back(L3_SIZE); + } else if (arg == "-4") { + // quick select cache sizes + memory + params.test_sizes.push_back(L1_SIZE); + params.test_sizes.push_back(L2_SIZE); + params.test_sizes.push_back(L3_SIZE); + params.test_sizes.push_back(MEM_SIZE); + } else if (arg == "--op") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string op {argv[i]}; + if (op == "quantize_row_q_reference") { + params.op_quantize_row_q_reference = true; + } else if (op == "quantize_row_q") { + params.op_quantize_row_q = true; + } else if (op == "dequantize_row_q") { + params.op_dequantize_row_q = true; + } else if (op == "quantize_row_q_dot") { + params.op_quantize_row_q_dot = true; + } 
else if (op == "vec_dot_q") { + params.op_vec_dot_q = true; + } else { + invalid_param = true; + break; + } + } else if (arg == "--type") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.include_types.push_back(argv[i]); + } else if (arg == "--alignment-offset") { + if (++i >= argc) { + invalid_param = true; + break; + } + int alignment = std::stoi(argv[i]); + if (alignment < 0 || alignment > MAX_ALIGNMENT) { + fprintf(stderr, "error: aligment-offset must be less than %d\n", MAX_ALIGNMENT); + invalid_param = true; + break; + } + params.alignment_offset = alignment; + } else if ((arg == "-i") || (arg == "--iterations")) { + if (++i >= argc) { + invalid_param = true; + break; + } + int number = std::stoi(argv[i]); + if (number < 0 || number > MAX_ITERATIONS) { + fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS); + invalid_param = true; + break; + } + params.iterations = number; + } else if ((arg == "-h") || (arg == "--help")) { + usage(argv); + return 1; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + return 1; + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + return 1; + } + + if (params.test_sizes.empty()) { + params.test_sizes.push_back(L1_SIZE); + } + if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) { + params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true; + } + + std::sort(params.test_sizes.begin(), params.test_sizes.end()); + size_t largest = params.test_sizes.back(); + + std::vector test_data1_v(largest*4 + MAX_ALIGNMENT*2); + std::vector test_data2_v(largest*4 + MAX_ALIGNMENT*2); + std::vector test_q1_v(largest*4 + MAX_ALIGNMENT*2); + std::vector test_q2_v(largest*4 + MAX_ALIGNMENT*2); + std::vector test_out_v(largest*4 + MAX_ALIGNMENT*2); + + float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset); + float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset); + float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset); + float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset); + float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset); + + generate_data(0, largest, test_data1); + generate_data(1, largest, test_data2); + + int64_t iterations = params.iterations; + + + // Initialize GGML, ensures float conversion tables are initialized + struct ggml_init_params ggml_params = { + /* .mem_size = */ 1*1024, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ true, + }; + struct ggml_context * ctx = ggml_init(ggml_params); + + for (int i = 0; i < GGML_TYPE_COUNT; i++) { + ggml_type type = (ggml_type) i; + ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) { + continue; + } + + if (qfns.from_float && qfns.to_float) { + printf("%s\n", ggml_type_name(type)); + + if (params.op_quantize_row_q_reference) { + printf(" quantize_row_q_reference\n"); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void ) { + 
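+                    // run the scalar reference quantizer for this test size;
+                    // returning an element of the output presumably keeps the work
+                    // observable so the compiler cannot drop it (the lambdas for the
+                    // other ops below follow the same pattern)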
qfns.from_float_reference(test_data1, test_q1, size); + return test_q1[0]; + }; + size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_quantize_row_q) { + printf(" quantize_row_q\n"); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void ) { + qfns.from_float(test_data1, test_q1, size); + return test_q1[0]; + }; + size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_dequantize_row_q) { + printf(" dequantize_row_q\n"); + qfns.from_float(test_data1, test_q1, largest); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void ) { + qfns.to_float(test_q1, test_out, size); + return test_out[0]; + }; + size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_quantize_row_q_dot) { + printf(" quantize_row_q_dot\n"); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void ) { + auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); + vdot.from_float(test_data1, test_q1, size); + return test_q1[0]; + }; + size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_vec_dot_q) { + printf(" vec_dot_q\n"); + qfns.from_float(test_data1, test_q1, largest); + qfns.from_float(test_data2, test_q2, largest); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void ) { + float result; + qfns.vec_dot(size, &result, test_q1, test_q2); + return result; + }; + size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + } + } + + ggml_free(ctx); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-rel-pos.c b/seamless_communication/ggml/tests/test-rel-pos.c new file mode 100644 index 0000000..19960b4 --- /dev/null +++ b/seamless_communication/ggml/tests/test-rel-pos.c @@ -0,0 +1,84 @@ +#include "ggml/ggml.h" + +#include +#include +#include + +struct ggml_context* make_ctx(void) { + struct ggml_init_params params = { + .mem_size = 2 * 1024 * 1024, + }; + + return ggml_init(params); +} + +void check_tensor(struct ggml_tensor * t, float * expected_t_d, int ne0, int ne1, int ne2) { + GGML_ASSERT(t->type == GGML_TYPE_F32); + GGML_ASSERT(t->ne[0] == ne0); + GGML_ASSERT(t->ne[1] == ne1); + GGML_ASSERT(t->ne[2] == ne2); + for (int i2 = 0; i2 < ne2; ++i2) { + for (int i1 = 0; i1 < ne1; ++i1) { + for (int i0 = 0; i0 < ne0; ++i0) { + float expected = *(expected_t_d + i2 * ne1 * ne0 + i1 * ne0 + i0); + float actual = ggml_get_data_f32(t)[i2 * ne1 * ne0 + i1 * ne0 + i0]; + GGML_ASSERT(expected == actual); + } + } + } +} + +int main(int argc, const char** argv) { + ggml_fp16_t buf_f16[1024]; + for (int i = 0; i < 1024; ++i) { + buf_f16[i] = ggml_fp32_to_fp16((float)i); + } + + float expected_out[4][9] = { + { 8.0, 9.0, 10.0, 9.0, 10.0, 11.0, 
10.0, 11.0, 12.0 }, + { 2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 4.0, 5.0, 6.0 }, + { 14.0, 15.0, 16.0, 15.0, 16.0, 17.0, 16.0, 17.0, 18.0 }, + { 8.0, 9.0, 10.0, 9.0, 10.0, 11.0, 10.0, 11.0, 12.0 }, + }; + + { + struct ggml_context * ctx = make_ctx(); + + + struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3, 3); + ggml_fp16_t* t_d = (ggml_fp16_t*)t->data; + memcpy(t_d, buf_f16, ggml_nbytes(t)); + + struct ggml_tensor * t_2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3, 3); + ggml_fp16_t* t_d_2 = (ggml_fp16_t*)t_2->data; + memcpy(t_d_2, buf_f16 + 1, ggml_nbytes(t_2)); + + struct ggml_tensor * rw = ggml_get_rel_pos(ctx, t, 2, 2); + struct ggml_tensor * rh = ggml_get_rel_pos(ctx, t_2, 2, 2); + + struct ggml_tensor * rw_f32 = ggml_cpy(ctx, rw, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3, 2, 2)); + struct ggml_tensor * rh_f32 = ggml_cpy(ctx, rh, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3, 2, 2)); + + struct ggml_tensor * in = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 9, 4); + struct ggml_tensor * out_inplace = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 9, 4); + float * in_d = (float*)in->data; + float * out_inplace_d = (float*)out_inplace->data; + for (int i = 0; i < ggml_nelements(in); ++i) { + in_d[i] = 1.f; + out_inplace_d[i] = 1.f; + } + + struct ggml_tensor * out = ggml_add_rel_pos(ctx, in, rw_f32, rh_f32); + struct ggml_cgraph gf = ggml_build_forward(out); + ggml_graph_compute_with_ctx(ctx, &gf, 1); + + out_inplace = ggml_add_rel_pos_inplace(ctx, out_inplace, rw_f32, rh_f32); + struct ggml_cgraph gf_2 = ggml_build_forward(out_inplace); + ggml_graph_compute_with_ctx(ctx, &gf_2, 1); + + check_tensor(out, (float*)expected_out, 9, 4, 1); + check_tensor(out_inplace, (float*)expected_out, 9, 4, 1); + } + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-svd0.c b/seamless_communication/ggml/tests/test-svd0.c new file mode 100644 index 0000000..2295c9d --- /dev/null +++ b/seamless_communication/ggml/tests/test-svd0.c @@ -0,0 +1,218 @@ +// SVD dimensionality reduction + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef GGML_USE_ACCELERATE +#include +#endif + +float frand() { + return (float) rand() / (float) RAND_MAX; +} + +//int sgesvd_(char *__jobu, char *__jobvt, __CLPK_integer *__m, +// __CLPK_integer *__n, __CLPK_real *__a, __CLPK_integer *__lda, +// __CLPK_real *__s, __CLPK_real *__u, __CLPK_integer *__ldu, +// __CLPK_real *__vt, __CLPK_integer *__ldvt, __CLPK_real *__work, +// __CLPK_integer *__lwork, +// __CLPK_integer *__info) + +int main(int argc, const char ** argv) { + int m = 10; + int n = 5; + + float * A = malloc(n * m * sizeof(float)); + float * A0 = malloc(n * m * sizeof(float)); + + for (int i = 0; i < n; ++i) { + for (int j = 0; j < m; ++j) { + A[i * m + j] = (float) (10.0f*(i + 1) + 1.0f * frand()); + //A[i * m + j] = (float) (10.0f*(i%2 + 1) + 0.1f * frand()); + //if (i == 2) { + // A[i * m + j] += 20*frand(); + //} + if ((i == 1 || i == 3) && j > m/2) { + A[i * m + j] = -A[i * m + j]; + } + } + } + + // average vector + //float * M = malloc(m * sizeof(float)); + + //{ + // for (int j = 0; j < m; ++j) { + // M[j] = 0.0f; + // } + // for (int i = 0; i < n; ++i) { + // for (int j = 0; j < m; ++j) { + // M[j] += A[i * m + j]; + // } + // } + // for (int j = 0; j < m; ++j) { + // M[j] /= (float) n; + // } + //} + + //// subtract average vector + //for (int i = 0; i < n; ++i) { + // for (int j = 0; j < m; ++j) { + // A[i * m + j] -= M[j]; + // } + //} + + memcpy(A0, A, n * m * sizeof(float)); + + // print A + printf("A:\n"); 
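+    // Layout note: each of the n data vectors occupies m consecutive floats at
+    // A[i*m + 0 .. i*m + m-1]; the loop below prints one such vector per line,
+    // which is why the label says "col" even though the storage is row-major.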
+ for (int i = 0; i < n; ++i) { + printf("col %d : ", i); + for (int j = 0; j < m; ++j) { + printf("%9.5f ", A[i * m + j]); + } + printf("\n"); + } + printf("\n"); + + // SVD + // A = U * S * V^T + + float * U = malloc(n * m * sizeof(float)); + float * S = malloc(n * sizeof(float)); + float * V = malloc(n * n * sizeof(float)); + + int lda = m; + int ldu = m; + int ldvt = n; + + float work_size; + int lwork = -1; + int info = 0; + + sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, &work_size, &lwork, &info); + + lwork = (int) work_size; + + printf("work_size = %f, info = %d, lwork = %d\n", work_size, info, lwork); + + float * work = malloc(lwork * sizeof(float)); + + sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, work, &lwork, &info); + + // print U + printf("U:\n"); + for (int i = 0; i < n; ++i) { + printf("col %d : ", i); + for (int j = 0; j < m; ++j) { + printf("%9.5f ", U[i * m + j]); + } + printf("\n"); + } + printf("\n"); + + // normalize S + { + double sum = 0.0; + for (int i = 0; i < n; ++i) { + sum += S[i]; + } + sum *= sqrt((double) m); + for (int i = 0; i < n; ++i) { + S[i] /= sum; + } + } + + // print S + printf("S:\n"); + for (int i = 0; i < n; ++i) { + printf("- %d = %9.5f\n", i, S[i]); + } + printf("\n"); + + // print V + printf("V:\n"); + for (int i = 0; i < n; ++i) { + printf("col %d : ", i); + for (int j = 0; j < n; ++j) { + printf("%9.5f ", V[i * n + j]); + } + printf("\n"); + } + printf("\n"); + + // print A + printf("A:\n"); + for (int i = 0; i < n; ++i) { + printf("col %d : ", i); + for (int j = 0; j < m; ++j) { + printf("%9.5f ", A[i * m + j]); + } + printf("\n"); + } + printf("\n"); + + // compute singular vectors in U + for (int i = 0; i < n; ++i) { + for (int j = 0; j < m; ++j) { + U[i * m + j] *= S[i]; + } + } + + // normalize U + for (int i = 0; i < n; ++i) { + double sum = 0.0; + for (int j = 0; j < m; ++j) { + sum += U[i * m + j] * U[i * m + j]; + } + sum = sqrt(sum); + for (int j = 0; j < m; ++j) { + U[i * m + j] /= sum*sqrt((double) m); + } + } + + // print U + printf("U:\n"); + for (int i = 0; i < n; ++i) { + printf("col %d : ", i); + for (int j = 0; j < m; ++j) { + printf("%9.5f ", U[i * m + j]); + } + printf("\n"); + } + printf("\n"); + + + // project A0 onto U + float * A1 = malloc(n * n * sizeof(float)); + + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + A1[i * n + j] = 0.0f; + for (int k = 0; k < m; ++k) { + A1[i * n + j] += A0[i * m + k] * U[j * m + k]; + } + } + } + + // print A1 + printf("A1:\n"); + for (int i = 0; i < n; ++i) { + printf("col %d : ", i); + for (int j = 0; j < n; ++j) { + printf("%9.5f ", A1[i * n + j]); + } + printf("\n"); + } + printf("\n"); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-vec0.c b/seamless_communication/ggml/tests/test-vec0.c new file mode 100644 index 0000000..5e23f8e --- /dev/null +++ b/seamless_communication/ggml/tests/test-vec0.c @@ -0,0 +1,133 @@ +#include +#include +#include +#include + +const int N = 1 << 14; +const int M = 1 << 14; + +void mul_mat_vec_f32_0( + const float * src0, + const float * src1, + float * dst, + unsigned nrows, + unsigned ncols) { + for (unsigned i = 0; i < nrows; i++) { + float sum = 0.0f; + for (unsigned j = 0; j < ncols; j++) { + sum += src0[i*ncols + j]*src1[j]; + } + dst[i] = sum; + } +} +#if defined(_MSC_VER) +typedef float __declspec(align(32)) afloat; +#else +typedef float afloat __attribute__((__aligned__(32))); +#endif +void mul_mat_vec_f32_1( + const afloat *restrict src0, + const afloat *restrict src1, + afloat 
*restrict dst, + unsigned nrows, + unsigned ncols) { + for (unsigned i = 0; i < nrows; i++) { + const afloat * restrict row = src0 + i*ncols; + const afloat * restrict col = src1; + + float sum = 0.0f; + + for (unsigned j = 0; j < ncols; j++) { + sum += *row++ * *col++; + } + + dst[i] = sum; + + //float sum[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + //for (unsigned j = 0; j < ncols; j += 8) { + // sum[0] += row[0]*col[0]; + // sum[1] += row[1]*col[1]; + // sum[2] += row[2]*col[2]; + // sum[3] += row[3]*col[3]; + // sum[4] += row[4]*col[4]; + // sum[5] += row[5]*col[5]; + // sum[6] += row[6]*col[6]; + // sum[7] += row[7]*col[7]; + + // row += 8; + // col += 8; + //} + + //dst[i] = sum[0] + sum[1] + sum[2] + sum[3] + sum[4] + sum[5] + sum[6] + sum[7]; + } +} + +void mul_mat_vec_f32_2( + const void * src0, + const void * src1, + void * dst, + unsigned nrows, + unsigned ncols) { + void * d = dst; + for (unsigned i = 0; i < nrows; i++) { + float sum = 0.0f; + + const char * row = (const char*)src0 + i*ncols*sizeof(float); + const char * col = (const char*)src1; + for (unsigned j = 0; j < ncols; j++) { + sum += (*(float *)row) * (*(float *)col); + row += sizeof(float); + col += sizeof(float); + } + *(float *)d = sum; + d = (char*)d + sizeof(float); + } +} + +#if defined(_MSC_VER) +void* aligned_alloc(size_t alignment, size_t size) { + return _aligned_malloc(size, alignment); +} +#endif + +int main(int argc, const char ** argv) { + //float * src0 = malloc(sizeof(float)*N*M); + //float * src1 = malloc(sizeof(float)*M); + //float * dst = malloc(sizeof(float)*N); + + afloat * src0 = (float *)(aligned_alloc(32, sizeof(float)*N*M)); + afloat * src1 = (float *)(aligned_alloc(32, sizeof(float)*M)); + afloat * dst = (float *)(aligned_alloc(32, sizeof(float)*N)); + + for (int i = 0; i < N*M; i++) { + src0[i] = (afloat)i; + } + + for (int i = 0; i < M; i++) { + src1[i] = (afloat)i; + } + + const int nIter = 10; + + const clock_t start = clock(); + + double sum = 0.0f; + for (int i = 0; i < nIter; i++) { + //mul_mat_vec_f32_0(src0, src1, dst, N, M); + mul_mat_vec_f32_1(src0, src1, dst, N, M); + //mul_mat_vec_f32_2(src0, src1, dst, N, M); + for (int i = 0; i < N; i++) { + sum += dst[i]; + } + } + + { + const clock_t end = clock(); + printf("%s: elapsed ticks: %ld\n", __func__, end - start); + } + + printf("%f\n", sum); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-vec1.c b/seamless_communication/ggml/tests/test-vec1.c new file mode 100644 index 0000000..567cb06 --- /dev/null +++ b/seamless_communication/ggml/tests/test-vec1.c @@ -0,0 +1,576 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include + +const int N = 1 << 14; +const int M = 768; + +// +// naive implementation +// + +void mul_mat_vec_f32_0( + const float * restrict src0, + const float * restrict src1, + float * dst, + int nrows, + int ncols) { + for (int i = 0; i < nrows; i++) { + float sum = 0.0f; + for (int j = 0; j < ncols; j++) { + sum += src0[i*ncols + j]*src1[j]; + } + dst[i] = sum; + } +} + +// +// SIMD with 8 32-bit floats +// + +float reduce_vector8_0(__m256 v) { + __m128 v1 = _mm256_extractf128_ps(v, 0); + __m128 v2 = _mm256_extractf128_ps(v, 1); + __m128 v3 = _mm_add_ps(v1, v2); + __m128 v4 = _mm_shuffle_ps(v3, v3, 0x4e); + __m128 v5 = _mm_add_ps(v3, v4); + __m128 v6 = _mm_shuffle_ps(v5, v5, 0x11); + __m128 v7 = _mm_add_ps(v5, v6); + return _mm_cvtss_f32(v7); +} + +// vectorized implementation using AVX +void mul_mat_vec_f32_1( + const float * restrict src0, + const 
float * restrict src1, + float * dst, + int nrows, + int ncols) { + + const int ncols8 = ncols & ~7; + + for (int i = 0; i < nrows; i++) { + __m256 sum = _mm256_setzero_ps(); + for (int j = 0; j < ncols8; j += 8) { + __m256 a = _mm256_loadu_ps(src0 + i*ncols + j); + __m256 b = _mm256_loadu_ps(src1 + j); + __m256 c = _mm256_mul_ps(a, b); + sum = _mm256_add_ps(sum, c); + } + dst[i] = reduce_vector8_0(sum); + + for (int j = ncols8; j < ncols; j++) { + dst[i] += src0[i*ncols + j]*src1[j]; + } + } +} + +void mul_mat_vec_f32_2( + const float * restrict src0, + const float * restrict src1, + float * dst, + int nrows, + int ncols) { + + const int ncols32 = ncols & ~31; + + for (int i = 0; i < nrows; i++) { + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + const float * restrict src0_row = src0 + i*ncols; + for (int j = 0; j < ncols32; j += 32) { + __m256 a0 = _mm256_loadu_ps(src0_row + j + 0); + __m256 a1 = _mm256_loadu_ps(src0_row + j + 8); + __m256 a2 = _mm256_loadu_ps(src0_row + j + 16); + __m256 a3 = _mm256_loadu_ps(src0_row + j + 24); + __m256 b0 = _mm256_loadu_ps(src1 + j + 0); + __m256 b1 = _mm256_loadu_ps(src1 + j + 8); + __m256 b2 = _mm256_loadu_ps(src1 + j + 16); + __m256 b3 = _mm256_loadu_ps(src1 + j + 24); +#if defined(__FMA__) + sum0 = _mm256_fmadd_ps(a0, b0, sum0); + sum1 = _mm256_fmadd_ps(a1, b1, sum1); + sum2 = _mm256_fmadd_ps(a2, b2, sum2); + sum3 = _mm256_fmadd_ps(a3, b3, sum3); +#else + sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0); + sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1); + sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2); + sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3); +#endif + } + dst[i] = reduce_vector8_0(_mm256_add_ps(_mm256_add_ps(sum0, sum1), _mm256_add_ps(sum2, sum3))); + + for (int j = ncols32; j < ncols; j++) { + dst[i] += src0[i*ncols + j]*src1[j]; + } + } +} + +// +// SIMD with 8 16-bit floats +// + +static inline float fp32_from_bits(uint32_t w) { +#if defined(__OPENCL_VERSION__) + return as_float(w); +#elif defined(__CUDA_ARCH__) + return __uint_as_float((unsigned int) w); +#elif defined(__INTEL_COMPILER) + return _castu32_f32(w); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return _CopyFloatFromInt32((__int32) w); +#else + union { + uint32_t as_bits; + float as_value; + } fp32 = { w }; + return fp32.as_value; +#endif +} + +static inline uint32_t fp32_to_bits(float f) { +#if defined(__OPENCL_VERSION__) + return as_uint(f); +#elif defined(__CUDA_ARCH__) + return (uint32_t) __float_as_uint(f); +#elif defined(__INTEL_COMPILER) + return _castf32_u32(f); +#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + return (uint32_t) _CopyInt32FromFloat(f); +#else + union { + float as_value; + uint32_t as_bits; + } fp32 = { f }; + return fp32.as_bits; +#endif +} + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to + * a 32-bit floating-point number in IEEE single-precision format. + * + * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) + * floating-point operations and bitcasts between integer and floating-point variables. 
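+ *
+ * A few concrete reference values (standard IEEE binary16 facts, given here as
+ * worked examples rather than taken from this file):
+ *   0x3C00 -> 1.0f      (sign 0, biased exponent 15, mantissa 0)
+ *   0xC000 -> -2.0f     (sign 1, biased exponent 16, mantissa 0)
+ *   0x0001 -> 2^-24     (smallest positive denormal half-precision value)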
+ */ +static inline float fp16_ieee_to_fp32_value(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. + */ + const uint32_t w = (uint32_t) h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word: + * + * +-----+------------+---------------------+ + * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| + * +-----+------------+---------------------+ + * Bits 27-31 17-26 0-16 + */ + const uint32_t two_w = w + w; + + /* + * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent + * of a single-precision floating-point number: + * + * S|Exponent | Mantissa + * +-+---+-----+------------+----------------+ + * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| + * +-+---+-----+------------+----------------+ + * Bits | 23-31 | 0-22 + * + * Next, there are some adjustments to the exponent: + * - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision + * formats (0x7F - 0xF = 0x70) + * - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number. + * Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent + * of the single-precision output must be 0xFF (max possible value). We do this correction in two steps: + * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested + * by the difference in the exponent bias (see above). + * - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of + * exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias. + * The floating-point multiplication hardware would ensure than Inf and NaN would retain their value on at least + * partially IEEE754-compliant implementations. + * + * Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not + * operate on denormal inputs, and do not produce denormal results. + */ + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + /* + * Convert denormalized half-precision inputs into single-precision results (always normalized). + * Zero inputs are also handled here. + * + * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits. + * First, we shift mantissa into bits 0-9 of the 32-bit word. 
+ * + * zeros | mantissa + * +---------------------------+------------+ + * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| + * +---------------------------+------------+ + * Bits 10-31 0-9 + * + * Now, remember that denormalized half-precision numbers are represented as: + * FP16 = mantissa * 2**(-24). + * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input + * and with an exponent which would scale the corresponding mantissa bits to 2**(-24). + * A normalized single-precision floating-point number is represented as: + * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127) + * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision + * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount. + * + * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number + * is zero, the constructed single-precision number has the value of + * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5 + * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of + * the input half-precision number. + */ + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + /* + * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the + * input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the + * input is either a denormal number, or zero. + * - Combine the result of conversion of exponent and mantissa with the sign of the input number. + */ + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +/* + * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in + * IEEE half-precision format, in bit representation. + * + * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) + * floating-point operations and bitcasts between integer and floating-point variables. + */ +static inline uint16_t fp16_ieee_from_fp32_value(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign); +} + +void mul_mat_vec_f16_0( + const uint16_t * src0, + const uint16_t * src1, + float * dst, + int nrows, + int ncols) { + + const int ncols8 = ncols & ~7; + + for (int i = 0; i < nrows; i++) { + __m256 sum = _mm256_setzero_ps(); + + const uint16_t * src0_row = src0 + i * ncols; + for (int j = 0; j < ncols8; j += 8) { + __m256 a = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j))); + __m256 b = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j))); +#if defined(__FMA__) + sum = _mm256_fmadd_ps(a, b, sum); +#else + sum = _mm256_add_ps(_mm256_mul_ps(a, b), sum); +#endif + } + dst[i] = reduce_vector8_0(sum); + + for (int j = ncols8; j < ncols; j++) { + dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * fp16_ieee_to_fp32_value(src1[j]); + } + } +} + +void mul_mat_vec_f16_1( + const uint16_t * src0, + const uint16_t * src1, + float * dst, + int nrows, + int ncols) { + + const int ncols16 = ncols & ~15; + + for (int i = 0; i < nrows; i++) { + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + + const uint16_t * src0_row = src0 + i * ncols; + for (int j = 0; j < ncols16; j += 16) { + __m256 a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 0))); + __m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8))); + __m256 b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j))); + __m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8))); +#if defined(__FMA__) + sum0 = _mm256_fmadd_ps(a0, b0, sum0); + sum1 = _mm256_fmadd_ps(a1, b1, sum1); +#else + sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0); + sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1); +#endif + } + dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1); + + for (int j = ncols16; j < ncols; j++) { + dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * fp16_ieee_to_fp32_value(src1[j]); + } + } +} + +void mul_mat_vec_f16_2( + const uint16_t * src0, + const uint16_t * src1, + float * dst, + int nrows, + int ncols) { + + const int ncols32 = ncols & ~31; + + for (int i = 0; i < nrows; i++) { + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + const uint16_t * src0_row = src0 + i * ncols; + for (int j = 0; j < ncols32; j += 32) { + __m256 a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 0))); + __m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8))); + __m256 a2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 16))); + __m256 a3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 24))); + __m256 b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j))); + __m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8))); + __m256 b2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 16))); + __m256 b3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 24))); +#if defined(__FMA__) + sum0 = _mm256_fmadd_ps(a0, b0, sum0); + sum1 = _mm256_fmadd_ps(a1, b1, sum1); + sum2 = _mm256_fmadd_ps(a2, b2, sum2); + sum3 = _mm256_fmadd_ps(a3, b3, sum3); +#else + sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0); + sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1); + sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2); + sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3); +#endif + } + dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3); + + for (int j = ncols32; j < ncols; j++) { + dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * 
fp16_ieee_to_fp32_value(src1[j]); + } + } +} + +void mul_mat_vec_f16_3( + const uint16_t * src0, + const float * src1, + float * dst, + int nrows, + int ncols) { + + const int ncols32 = ncols & ~31; + + for (int i = 0; i < nrows; i++) { + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + const uint16_t * src0_row = src0 + i * ncols; + for (int j = 0; j < ncols32; j += 32) { + __m256 a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 0))); + __m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8))); + __m256 a2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 16))); + __m256 a3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 24))); + __m256 b0 = _mm256_loadu_ps(src1 + j); + __m256 b1 = _mm256_loadu_ps(src1 + j + 8); + __m256 b2 = _mm256_loadu_ps(src1 + j + 16); + __m256 b3 = _mm256_loadu_ps(src1 + j + 24); +#if defined(__FMA__) + sum0 = _mm256_fmadd_ps(a0, b0, sum0); + sum1 = _mm256_fmadd_ps(a1, b1, sum1); + sum2 = _mm256_fmadd_ps(a2, b2, sum2); + sum3 = _mm256_fmadd_ps(a3, b3, sum3); +#else + sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0); + sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1); + sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2); + sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3); +#endif + } + dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3); + + for (int j = ncols32; j < ncols; j++) { + dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * fp16_ieee_to_fp32_value(src1[j]); + } + } +} + +uint64_t get_time_us(void) { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000000 + tv.tv_usec; +} + +int main(int argc, const char ** argv) { + float * src0 = malloc(sizeof(float)*N*M); + float * src1 = malloc(sizeof(float)*M); + float * dst = malloc(sizeof(float)*N); + + //float * src0 = (float *)(aligned_alloc(64, sizeof(float)*N*M)); + //float * src1 = (float *)(aligned_alloc(64, sizeof(float)*M)); + //float * dst = (float *)(aligned_alloc(64, sizeof(float)*N)); + + for (int i = 0; i < N*M; i++) { + src0[i] = rand() / (float)RAND_MAX; + } + + for (int i = 0; i < M; i++) { + src1[i] = rand() / (float)RAND_MAX; + } + + // convert src0 and src1 to __fp16 + uint16_t * src0_fp16 = (uint16_t *)(malloc(sizeof(uint16_t)*N*M)); + uint16_t * src1_fp16 = (uint16_t *)(malloc(sizeof(uint16_t)*M)); + //uint16_t * src0_fp16 = (uint16_t *)(aligned_alloc(64, sizeof(uint16_t)*N*M)); + //uint16_t * src1_fp16 = (uint16_t *)(aligned_alloc(64, sizeof(uint16_t)*M)); + + { + const uint64_t t_start = get_time_us(); + + for (int i = 0; i < N*M; i++) { + src0_fp16[i] = fp16_ieee_from_fp32_value(src0[i]); + //printf("%f %f\n", src0[i], fp16_ieee_to_fp32_value(src0_fp16[i])); + //assert(!isnan(fp16_ieee_to_fp32_value(src0_fp16[i]))); + } + + for (int i = 0; i < M; i++) { + src1_fp16[i] = fp16_ieee_from_fp32_value(src1[i]); + } + + const uint64_t t_end = get_time_us(); + printf("convert time: %f ms\n", (t_end - t_start) / 1000.0); + } + + for (int i = 0; i < 16; ++i) { + printf("%f %f\n", src0[i], fp16_ieee_to_fp32_value(src0_fp16[i])); + } + + int method = 0; + if (argc > 1) { + method = atoi(argv[1]); + } + + const int nIter = 1000; + + const clock_t start = clock(); + const uint64_t start_us = get_time_us(); + + double iM = 1.0/M; + double sum = 0.0f; + for (int i = 0; i < nIter; i++) { + if (method == 0) { + mul_mat_vec_f32_0(src0, src1, dst, N, M); + } + + if (method == 1) { + 
mul_mat_vec_f32_1(src0, src1, dst, N, M); + } + + if (method == 2) { + mul_mat_vec_f32_2(src0, src1, dst, N, M); + } + + if (method == 3) { + mul_mat_vec_f16_0(src0_fp16, src1_fp16, dst, N, M); + } + + if (method == 4) { + mul_mat_vec_f16_1(src0_fp16, src1_fp16, dst, N, M); + } + + if (method == 5) { + mul_mat_vec_f16_2(src0_fp16, src1_fp16, dst, N, M); + } + + if (method == 6) { + mul_mat_vec_f16_3(src0_fp16, src1, dst, N, M); + } + } + + for (int i = 0; i < N; i++) { + sum += dst[i]*iM; + } + + { + const clock_t end = clock(); + const uint64_t end_us = get_time_us(); + printf("%s: elapsed ticks: %ld\n", __func__, end - start); + printf("%s: elapsed us: %ld\n", __func__, end_us - start_us); + } + + printf("%f\n", sum); + + free(src0); + free(src1); + free(dst); + + free(src0_fp16); + free(src1_fp16); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-vec2.c b/seamless_communication/ggml/tests/test-vec2.c new file mode 100644 index 0000000..465cf53 --- /dev/null +++ b/seamless_communication/ggml/tests/test-vec2.c @@ -0,0 +1,268 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include + +const int N = 1 << 12; +const int M = 1 << 12; + +// +// naive implementation +// + +void mul_mat_vec_f32_0( + const float * restrict src0, + const float * restrict src1, + float * dst, + int nrows, + int ncols) { + for (int i = 0; i < nrows; i++) { + float sum = 0.0f; + for (int j = 0; j < ncols; j++) { + sum += src0[i*ncols + j]*src1[j]; + } + dst[i] = sum; + } +} + +void mul_mat_vec_f16_0( + const __fp16 * src0, + const __fp16 * src1, + float * dst, + int nrows, + int ncols) { + + const int n64 = ncols & ~63; + + for (int r = 0; r < nrows; r++) { + float sumf = 0.0; + + float16x8_t sum0 = vdupq_n_f16(0.0f); + float16x8_t sum1 = vdupq_n_f16(0.0f); + float16x8_t sum2 = vdupq_n_f16(0.0f); + float16x8_t sum3 = vdupq_n_f16(0.0f); + float16x8_t sum4 = vdupq_n_f16(0.0f); + float16x8_t sum5 = vdupq_n_f16(0.0f); + float16x8_t sum6 = vdupq_n_f16(0.0f); + float16x8_t sum7 = vdupq_n_f16(0.0f); + + float16x8_t x0, x1, x2, x3, x4, x5, x6, x7; + float16x8_t y0, y1, y2, y3, y4, y5, y6, y7; + + const __fp16 * restrict p0 = src0 + r*ncols; + + for (int i = 0; i < n64; i += 64) { + x0 = vld1q_f16(p0 + i + 0 ); + x1 = vld1q_f16(p0 + i + 8 ); + x2 = vld1q_f16(p0 + i + 16); + x3 = vld1q_f16(p0 + i + 24); + x4 = vld1q_f16(p0 + i + 32); + x5 = vld1q_f16(p0 + i + 40); + x6 = vld1q_f16(p0 + i + 48); + x7 = vld1q_f16(p0 + i + 56); + + y0 = vld1q_f16(src1 + i + 0 ); + y1 = vld1q_f16(src1 + i + 8 ); + y2 = vld1q_f16(src1 + i + 16); + y3 = vld1q_f16(src1 + i + 24); + y4 = vld1q_f16(src1 + i + 32); + y5 = vld1q_f16(src1 + i + 40); + y6 = vld1q_f16(src1 + i + 48); + y7 = vld1q_f16(src1 + i + 56); + + sum0 = vfmaq_f16(sum0, x0, y0); + sum1 = vfmaq_f16(sum1, x1, y1); + sum2 = vfmaq_f16(sum2, x2, y2); + sum3 = vfmaq_f16(sum3, x3, y3); + sum4 = vfmaq_f16(sum4, x4, y4); + sum5 = vfmaq_f16(sum5, x5, y5); + sum6 = vfmaq_f16(sum6, x6, y6); + sum7 = vfmaq_f16(sum7, x7, y7); + } + + // TODO: F16 - better way to reduce this ? 
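+        // One possible alternative (sketch only, assuming AArch64 intrinsics are
+        // available; not used here): widen each f16 accumulator to f32 and use the
+        // horizontal add instead of the chain of vaddq_f16 below, e.g. for sum0:
+        //
+        //   float32x4_t lo = vcvt_f32_f16(vget_low_f16(sum0));
+        //   float32x4_t hi = vcvt_f32_f16(vget_high_f16(sum0));
+        //   sumf += vaddvq_f32(vaddq_f32(lo, hi));   // repeat for sum1..sum7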
+ float16x8_t sum = vaddq_f16(sum0, sum1); + + sum = vaddq_f16(sum, sum2); + sum = vaddq_f16(sum, sum3); + sum = vaddq_f16(sum, sum4); + sum = vaddq_f16(sum, sum5); + sum = vaddq_f16(sum, sum6); + sum = vaddq_f16(sum, sum7); + + sumf += sum[0] + sum[1] + sum[2] + sum[3] + sum[4] + sum[5] + sum[6] + sum[7]; + + for (int j = n64; j < n64; j++) { + sumf += src0[r*ncols + j]*src1[j]; + } + + dst[r] = sumf; + } +} + +void mul_mat_vec_f16_1( + const __fp16 * src0, + const __fp16 * src1, + float * dst, + int nrows, + int ncols) { + + const int n32 = ncols & ~31; + + for (int r = 0; r < nrows; r++) { + float sumf = 0.0; + + float16x8_t sum0 = vdupq_n_f16(0.0f); + float16x8_t sum1 = vdupq_n_f16(0.0f); + float16x8_t sum2 = vdupq_n_f16(0.0f); + float16x8_t sum3 = vdupq_n_f16(0.0f); + + float16x8_t x0, x1, x2, x3; + float16x8_t y0, y1, y2, y3; + + const __fp16 * restrict p0 = src0 + r*ncols; + + for (int i = 0; i < n32; i += 32) { + x0 = vld1q_f16(p0 + i + 0 ); + x1 = vld1q_f16(p0 + i + 8 ); + x2 = vld1q_f16(p0 + i + 16); + x3 = vld1q_f16(p0 + i + 24); + + y0 = vld1q_f16(src1 + i + 0 ); + y1 = vld1q_f16(src1 + i + 8 ); + y2 = vld1q_f16(src1 + i + 16); + y3 = vld1q_f16(src1 + i + 24); + + sum0 = vfmaq_f16(sum0, x0, y0); + sum1 = vfmaq_f16(sum1, x1, y1); + sum2 = vfmaq_f16(sum2, x2, y2); + sum3 = vfmaq_f16(sum3, x3, y3); + } + + // reduce sum0..sum3 to sum0 + sum0 = vaddq_f16(sum0, sum1); + sum2 = vaddq_f16(sum2, sum3); + sum0 = vaddq_f16(sum0, sum2); + + // load sum0 into 2 float32x4_t + float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0)); + float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0)); + + // reduce sum0f32 and sum1f32 to sumf + sum0f32 = vaddq_f32(sum0f32, sum1f32); + + float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32)); + sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1); + + //sumf = sum0[0] + sum0[1] + sum0[2] + sum0[3] + sum0[4] + sum0[5] + sum0[6] + sum0[7]; + + for (int j = n32; j < n32; j++) { + sumf += src0[r*ncols + j]*src1[j]; + } + + dst[r] = sumf; + } +} + +uint64_t get_time_us() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000000 + tv.tv_usec; +} + +int main(int argc, const char ** argv) { + float * src0 = malloc(sizeof(float)*N*M); + float * src1 = malloc(sizeof(float)*M); + float * dst = malloc(sizeof(float)*N); + + //float * src0 = (float *)(aligned_alloc(64, sizeof(float)*N*M)); + //float * src1 = (float *)(aligned_alloc(64, sizeof(float)*M)); + //float * dst = (float *)(aligned_alloc(64, sizeof(float)*N)); + + for (int i = 0; i < N*M; i++) { + src0[i] = rand() / (float)RAND_MAX; + } + + for (int i = 0; i < M; i++) { + src1[i] = rand() / (float)RAND_MAX; + } + + // convert src0 and src1 to __fp16 + __fp16 * src0_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*N*M)); + __fp16 * src1_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*M)); + + { + const uint64_t t_start = get_time_us(); + + for (int i = 0; i < N*M; i++) { + src0_fp16[i] = src0[i]; + //printf("%f %f\n", src0[i], src0_fp16[i]); + //assert(!isnan(src0_fp16[i])); + } + + for (int i = 0; i < M; i++) { + src1_fp16[i] = src1[i]; + } + + const uint64_t t_end = get_time_us(); + printf("convert time: %f ms\n", (t_end - t_start) / 1000.0); + } + + for (int i = 0; i < 16; ++i) { + printf("%f %f\n", src0[i], src0_fp16[i]); + } + + int method = 0; + if (argc > 1) { + method = atoi(argv[1]); + } + + const int nIter = 1000; + + const clock_t start = clock(); + const uint64_t start_us = get_time_us(); + + double iM = 1.0/M; + double sum = 0.0f; + for (int i = 0; i < nIter; i++) { 
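+        // method 0: scalar f32 reference; 1: NEON f16 with 8 accumulators
+        // (64 values per iteration); 2: NEON f16 with 4 accumulators (32 values
+        // per iteration)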
+ if (method == 0) { + mul_mat_vec_f32_0(src0, src1, dst, N, M); + } + + if (method == 1) { + mul_mat_vec_f16_0(src0_fp16, src1_fp16, dst, N, M); + } + + if (method == 2) { + mul_mat_vec_f16_1(src0_fp16, src1_fp16, dst, N, M); + } + } + + for (int i = 0; i < N; i++) { + sum += dst[i]*iM; + } + + { + const clock_t end = clock(); + const uint64_t end_us = get_time_us(); + printf("%s: elapsed ticks: %ld\n", __func__, end - start); + printf("%s: elapsed us: %llu / %f ms\n", __func__, end_us - start_us, (end_us - start_us) / 1000.0 / nIter); + } + + printf("%f\n", sum); + + free(src0); + free(src1); + free(dst); + + free(src0_fp16); + free(src1_fp16); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test-xpos.c b/seamless_communication/ggml/tests/test-xpos.c new file mode 100644 index 0000000..a8c64e5 --- /dev/null +++ b/seamless_communication/ggml/tests/test-xpos.c @@ -0,0 +1,87 @@ +#include "ggml/ggml.h" + +#include +#include +#include + +bool is_close(float a, float b, float epsilon) { + return fabs(a - b) < epsilon; +} + +int main(int argc, char ** argv) { + const int n_threads = 1; + const int n_embd_head = 4; // aka head_dim + const int n_head = 1; + const int N = 8; + + struct ggml_init_params params = { + .mem_size = 16*1024*1024, + .mem_buffer = NULL, + }; + + // memory allocation happens here + struct ggml_context * ctx = ggml_init(params); + + struct ggml_tensor * Q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, N); + struct ggml_tensor * K = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, N); + + for (int i = 0; i < ggml_nelements(Q); i++) { + ((float*) Q->data)[i] = 2.0f; + ((float*) K->data)[i] = 2.0f; + } + + struct ggml_tensor * Qx = ggml_rope_xpos_inplace(ctx, Q, 1, n_embd_head, 512.0f, false); + struct ggml_tensor * Kx = ggml_rope_xpos_inplace(ctx, K, 1, n_embd_head, 512.0f, true); + + struct ggml_cgraph gf = ggml_build_forward(Qx); + ggml_build_forward_expand(&gf, Kx); + ggml_graph_compute_with_ctx(ctx, &gf, n_threads); + + // expected output for Qx: + // -0.6009 2.7568 1.9782 2.0182 + // -2.6379 0.9815 1.9562 2.0361 + // -2.2457 -1.6853 1.9341 2.0538 + // 0.2043 -2.7934 1.9118 2.0712 + // 2.4550 -1.3341 1.8894 2.0884 + // 2.4430 1.3417 1.8668 2.1054 + // 0.1905 2.7739 1.8440 2.1221 + // -2.2257 1.6550 1.8212 2.1386 + + for (int i = 0; i < ggml_nelements(Q); i++) { + if (((float*) Qx->data)[i] > 0) printf(" "); + printf("%.4f ", ((float*) Qx->data)[i]); + if ((i+1) % n_embd_head == 0) printf("\n"); + } + printf("\n"); + + GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 0], -2.2257f, 0.0001f)); + GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 1], 1.6550f, 0.0001f)); + GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 2], 1.8212f, 0.0001f)); + GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 3], 2.1386f, 0.0001f)); + + // expected output for Kx: + // -0.6038 2.7703 1.9816 2.0216 + // -2.6639 0.9911 1.9630 2.0431 + // -2.2789 -1.7103 1.9441 2.0644 + // 0.2083 -2.8486 1.9251 2.0856 + // 2.5158 -1.3671 1.9057 2.1065 + // 2.5158 1.3816 1.8862 2.1273 + // 0.1972 2.8705 1.8665 2.1479 + // -2.3146 1.7211 1.8465 2.1684 + + for (int i = 0; i < ggml_nelements(K); i++) { + if (((float*) Kx->data)[i] > 0) printf(" "); + printf("%.4f ", ((float*) Kx->data)[i]); + if ((i+1) % n_embd_head == 0) printf("\n"); + } + printf("\n"); + + GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 0], -2.3146f, 0.0001f)); + GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 1], 1.7211f, 0.0001f)); + 
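+    // Qx and Kx are built from identical inputs and differ only in the final flag of
+    // ggml_rope_xpos_inplace (false for Q, true for K); in xPos the query and key
+    // rotations are scaled by reciprocal position-dependent factors, so the flag
+    // presumably selects the down-scaled variant, with 512.0f as the scale base.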
GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 2], 1.8465f, 0.0001f)); + GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 3], 2.1684f, 0.0001f)); + + ggml_free(ctx); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test0.c b/seamless_communication/ggml/tests/test0.c new file mode 100644 index 0000000..7fba63e --- /dev/null +++ b/seamless_communication/ggml/tests/test0.c @@ -0,0 +1,42 @@ +#include "ggml/ggml.h" + +#include +#include + +int main(int argc, const char ** argv) { + struct ggml_init_params params = { + .mem_size = 128*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * t1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 10); + struct ggml_tensor * t2 = ggml_new_tensor_2d(ctx0, GGML_TYPE_I16, 10, 20); + struct ggml_tensor * t3 = ggml_new_tensor_3d(ctx0, GGML_TYPE_I32, 10, 20, 30); + + GGML_ASSERT(t1->n_dims == 1); + GGML_ASSERT(t1->ne[0] == 10); + GGML_ASSERT(t1->nb[1] == 10*sizeof(float)); + + GGML_ASSERT(t2->n_dims == 2); + GGML_ASSERT(t2->ne[0] == 10); + GGML_ASSERT(t2->ne[1] == 20); + GGML_ASSERT(t2->nb[1] == 10*sizeof(int16_t)); + GGML_ASSERT(t2->nb[2] == 10*20*sizeof(int16_t)); + + GGML_ASSERT(t3->n_dims == 3); + GGML_ASSERT(t3->ne[0] == 10); + GGML_ASSERT(t3->ne[1] == 20); + GGML_ASSERT(t3->ne[2] == 30); + GGML_ASSERT(t3->nb[1] == 10*sizeof(int32_t)); + GGML_ASSERT(t3->nb[2] == 10*20*sizeof(int32_t)); + GGML_ASSERT(t3->nb[3] == 10*20*30*sizeof(int32_t)); + + ggml_print_objects(ctx0); + + ggml_free(ctx0); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test0.zig b/seamless_communication/ggml/tests/test0.zig new file mode 100644 index 0000000..e47bf36 --- /dev/null +++ b/seamless_communication/ggml/tests/test0.zig @@ -0,0 +1,41 @@ +const std = @import("std"); +const c = @cImport({ + @cInclude("ggml/ggml.h"); +}); + +pub fn main() !void { + const params = .{ + .mem_size = 128*1024*1024, + .mem_buffer = null, + .no_alloc = false, + }; + + const ctx0 = c.ggml_init(params); + defer c.ggml_free(ctx0); + + const t1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 10); + const t2 = c.ggml_new_tensor_2d(ctx0, c.GGML_TYPE_I16, 10, 20); + const t3 = c.ggml_new_tensor_3d(ctx0, c.GGML_TYPE_I32, 10, 20, 30); + + try std.testing.expect(t1.*.n_dims == 1); + try std.testing.expect(t1.*.ne[0] == 10); + try std.testing.expect(t1.*.nb[1] == 10*@sizeOf(f32)); + + try std.testing.expect(t2.*.n_dims == 2); + try std.testing.expect(t2.*.ne[0] == 10); + try std.testing.expect(t2.*.ne[1] == 20); + try std.testing.expect(t2.*.nb[1] == 10*@sizeOf(i16)); + try std.testing.expect(t2.*.nb[2] == 10*20*@sizeOf(i16)); + + try std.testing.expect(t3.*.n_dims == 3); + try std.testing.expect(t3.*.ne[0] == 10); + try std.testing.expect(t3.*.ne[1] == 20); + try std.testing.expect(t3.*.ne[2] == 30); + try std.testing.expect(t3.*.nb[1] == 10*@sizeOf(i32)); + try std.testing.expect(t3.*.nb[2] == 10*20*@sizeOf(i32)); + try std.testing.expect(t3.*.nb[3] == 10*20*30*@sizeOf(i32)); + + c.ggml_print_objects(ctx0); + + _ = try std.io.getStdIn().reader().readByte(); +} diff --git a/seamless_communication/ggml/tests/test1.c b/seamless_communication/ggml/tests/test1.c new file mode 100644 index 0000000..c313bf8 --- /dev/null +++ b/seamless_communication/ggml/tests/test1.c @@ -0,0 +1,438 @@ +#include "ggml/ggml.h" + +#include +#include + +int main(int argc, const char ** argv) { + const int n_threads = 2; + + struct ggml_init_params params = { + .mem_size = 128*1024*1024, + .mem_buffer = NULL, + 
.no_alloc = false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + { + struct ggml_tensor * x = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + + ggml_set_param(ctx0, x); + + struct ggml_tensor * a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_tensor * b = ggml_mul(ctx0, x, x); + struct ggml_tensor * f = ggml_mul(ctx0, b, a); + + // a*x^2 + // 2*a*x + + ggml_print_objects(ctx0); + + struct ggml_cgraph gf = ggml_build_forward(f); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x, 2.0f); + ggml_set_f32(a, 3.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(f->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("f = %f\n", ggml_get_f32_1d(f, 0)); + printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0)); + + GGML_ASSERT(ggml_get_f32_1d(f, 0) == 12.0f); + GGML_ASSERT(ggml_get_f32_1d(x->grad, 0) == 12.0f); + + ggml_set_f32(x, 3.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(f->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("f = %f\n", ggml_get_f32_1d(f, 0)); + printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0)); + + GGML_ASSERT(ggml_get_f32_1d(f, 0) == 27.0f); + GGML_ASSERT(ggml_get_f32_1d(x->grad, 0) == 18.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-1-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-1-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_tensor * x3 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + + ggml_set_f32(x1, 3.0f); + ggml_set_f32(x2, 1.0f); + ggml_set_f32(x3, 0.0f); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + + struct ggml_tensor * y = ggml_add(ctx0, ggml_mul(ctx0, x1, x1), ggml_mul(ctx0, x1, x2)); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0)); + printf("df/dx2 = %f\n", ggml_get_f32_1d(x2->grad, 0)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == 12.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 7.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 3.0f); + + struct ggml_tensor * g1 = x1->grad; + struct ggml_tensor * g2 = x2->grad; + + struct ggml_cgraph gbb = ggml_build_backward(ctx0, &gb, true); + + ggml_graph_reset(&gb); + ggml_set_f32(g1->grad, 1.0f); + ggml_set_f32(g2->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gbb, n_threads); + + printf("H * [1, 1] = [ %f %f ]\n", ggml_get_f32_1d(x1->grad, 0), ggml_get_f32_1d(x2->grad, 0)); + + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 1.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-2-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-2-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + + struct ggml_tensor * y = ggml_mul(ctx0, ggml_add(ctx0, ggml_mul(ctx0, x1, x1), ggml_mul(ctx0, x1, x2)), x1); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + 
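+        // y = (x1*x1 + x1*x2) * x1 = x1^3 + x1^2*x2
+        // dy/dx1 = 3*x1^2 + 2*x1*x2,  dy/dx2 = x1^2
+        // at x1 = 3, x2 = 4: y = 27 + 36 = 63, dy/dx1 = 27 + 24 = 51, dy/dx2 = 9
+        // (these are the values the assertions below check)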
ggml_set_f32(x1, 3.0f); + ggml_set_f32(x2, 4.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0)); + printf("df/dx2 = %f\n", ggml_get_f32_1d(x2->grad, 0)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == 63.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 51.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 9.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-3-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-3-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_tensor * x3 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + ggml_set_param(ctx0, x3); + + struct ggml_tensor * y = ggml_mul(ctx0, ggml_mul(ctx0, ggml_mul(ctx0, x1, x1), ggml_mul(ctx0, x2, x2)), x3); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x1, 1.0f); + ggml_set_f32(x2, 2.0f); + ggml_set_f32(x3, 3.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0)); + printf("df/dx2 = %f\n", ggml_get_f32_1d(x2->grad, 0)); + printf("df/dx3 = %f\n", ggml_get_f32_1d(x3->grad, 0)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == 12.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 24.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 12.0f); + GGML_ASSERT(ggml_get_f32_1d(x3->grad, 0) == 4.0f); + + struct ggml_tensor * g1 = x1->grad; + struct ggml_tensor * g2 = x2->grad; + struct ggml_tensor * g3 = x3->grad; + + struct ggml_cgraph gbb = ggml_build_backward(ctx0, &gb, true); + + ggml_graph_reset(&gb); + ggml_set_f32(g1->grad, 1.0f); + ggml_set_f32(g2->grad, 1.0f); + ggml_set_f32(g3->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gbb, n_threads); + + printf("H * [1, 1, 1] = [ %f %f %f ]\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x3->grad, 0)); + + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 56.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 34.0f); + GGML_ASSERT(ggml_get_f32_1d(x3->grad, 0) == 12.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-4-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-4-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + + struct ggml_tensor * y = ggml_sum(ctx0, ggml_mul(ctx0, x1, x2)); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x1, 3.0f); + ggml_set_f32(x2, 5.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f %f %f\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x1->grad, 1), + ggml_get_f32_1d(x1->grad, 2)); + printf("df/dx2 = %f %f %f\n", + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x2->grad, 1), + ggml_get_f32_1d(x2->grad, 2)); + + 
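+        // with x1 = [3,3,3] and x2 = [5,5,5]: y = sum(x1_i * x2_i) = 3*15 = 45,
+        // dy/dx1_i = x2_i = 5 and dy/dx2_i = x1_i = 3 for every i, matching the
+        // assertions below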
GGML_ASSERT(ggml_get_f32_1d(y, 0) == 45.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 5.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == 5.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == 5.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == 3.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-5-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-5-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + + struct ggml_tensor * y = + ggml_sum(ctx0, + ggml_add(ctx0, + ggml_mul(ctx0, x1, x2), + ggml_mul(ctx0, + ggml_repeat(ctx0, ggml_new_f32(ctx0, -2.0f), x1), + ggml_mul(ctx0, x1, x1) + ) + ) + ); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x1, 3.0f); + ggml_set_f32(x2, 5.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f %f %f\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x1->grad, 1), + ggml_get_f32_1d(x1->grad, 2)); + printf("df/dx2 = %f %f %f\n", + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x2->grad, 1), + ggml_get_f32_1d(x2->grad, 2)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == -9.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == -7.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == -7.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == -7.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == 3.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-6-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-6-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + + struct ggml_tensor * y = + ggml_sum(ctx0, + ggml_sub(ctx0, + ggml_mul(ctx0, x1, x2), + ggml_mul(ctx0, + ggml_mul(ctx0, x1, x1), + ggml_repeat(ctx0, ggml_new_f32(ctx0, -2.0f), x1) + ) + ) + ); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x1, 3.0f); + ggml_set_f32(x2, 5.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f %f %f\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x1->grad, 1), + ggml_get_f32_1d(x1->grad, 2)); + printf("df/dx2 = %f %f %f\n", + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x2->grad, 1), + ggml_get_f32_1d(x2->grad, 2)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == 99.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 17.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == 17.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == 17.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == 3.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == 3.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-7-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, 
"test1-7-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3); + + ggml_set_param(ctx0, x1); + ggml_set_param(ctx0, x2); + + struct ggml_tensor * y = + ggml_abs(ctx0, + ggml_sub(ctx0, x1, x2) + ); + + struct ggml_cgraph gf = ggml_build_forward(y); + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + + ggml_set_f32(x1, 3.0f); + ggml_set_f32(x2, 5.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f %f %f\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x1->grad, 1), + ggml_get_f32_1d(x1->grad, 2)); + printf("df/dx2 = %f %f %f\n", + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x2->grad, 1), + ggml_get_f32_1d(x2->grad, 2)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == 2.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == -1.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == -1.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == -1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == 1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == 1.0f); + + ggml_set_f32(x1, 7.0f); + ggml_set_f32(x2, 5.0f); + + ggml_graph_reset(&gf); + ggml_set_f32(y->grad, 1.0f); + + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + + printf("y = %f\n", ggml_get_f32_1d(y, 0)); + printf("df/dx1 = %f %f %f\n", + ggml_get_f32_1d(x1->grad, 0), + ggml_get_f32_1d(x1->grad, 1), + ggml_get_f32_1d(x1->grad, 2)); + printf("df/dx2 = %f %f %f\n", + ggml_get_f32_1d(x2->grad, 0), + ggml_get_f32_1d(x2->grad, 1), + ggml_get_f32_1d(x2->grad, 2)); + + GGML_ASSERT(ggml_get_f32_1d(y, 0) == 2.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 1.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == 1.0f); + GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == 1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == -1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == -1.0f); + GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == -1.0f); + + ggml_graph_dump_dot(&gf, NULL, "test1-8-forward.dot"); + ggml_graph_dump_dot(&gb, &gf, "test1-8-backward.dot"); + } + + ggml_free(ctx0); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test1.zig b/seamless_communication/ggml/tests/test1.zig new file mode 100644 index 0000000..f331acb --- /dev/null +++ b/seamless_communication/ggml/tests/test1.zig @@ -0,0 +1,459 @@ +const std = @import("std"); +const c = @cImport({ + @cInclude("ggml/ggml.h"); +}); + +pub fn main() !void { + const n_threads = 2; + + const params = .{ + .mem_size = 128*1024*1024, + .mem_buffer = null, + .no_alloc = false, + }; + + const ctx0 = c.ggml_init(params); + defer c.ggml_free(ctx0); + + { + const x = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + + c.ggml_set_param(ctx0, x); + + const a = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const b = c.ggml_mul(ctx0, x, x); + const f = c.ggml_mul(ctx0, b, a); + + // a*x^2 + // 2*a*x + + c.ggml_print_objects(ctx0); + + const gf = c.ggml_build_forward(f); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x, 2.0); + _ = c.ggml_set_f32(a, 3.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(f.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("f = {d:.6}\n", .{c.ggml_get_f32_1d(f, 0)}); + std.debug.print("df/dx 
= {d:.6}\n", .{c.ggml_get_f32_1d(x.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(f, 0) == 12.0); + try std.testing.expect(c.ggml_get_f32_1d(x.*.grad, 0) == 12.0); + + _ = c.ggml_set_f32(x, 3.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(f.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("f = {d:.6}\n", .{c.ggml_get_f32_1d(f, 0)}); + std.debug.print("df/dx = {d:.6}\n", .{c.ggml_get_f32_1d(x.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(f, 0) == 27.0); + try std.testing.expect(c.ggml_get_f32_1d(x.*.grad, 0) == 18.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-1-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-1-backward.dot"); + } + + ///////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const x3 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 1.0); + _ = c.ggml_set_f32(x3, 0.0); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = c.ggml_add(ctx0, c.ggml_mul(ctx0, x1, x1), c.ggml_mul(ctx0, x1, x2)); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)}); + std.debug.print("df/dx2 = {d:.6}\n", .{c.ggml_get_f32_1d(x2.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 12.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 7.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 3.0); + + const g1 = x1.*.grad; + const g2 = x2.*.grad; + + const gbb = c.ggml_build_backward(ctx0, @constCast(&gb), true); + + c.ggml_graph_reset(@constCast(&gb)); + _ = c.ggml_set_f32(g1.*.grad, 1.0); + _ = c.ggml_set_f32(g2.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gbb), n_threads); + + std.debug.print("H * [1, 1] = [ {d:.6} {d:.6} ]\n", .{c.ggml_get_f32_1d(x1.*.grad, 0), c.ggml_get_f32_1d(x2.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 1.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-2-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-2-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = c.ggml_mul(ctx0, c.ggml_add(ctx0, c.ggml_mul(ctx0, x1, x1), c.ggml_mul(ctx0, x1, x2)), x1); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 4.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)}); + std.debug.print("df/dx2 = {d:.6}\n", .{c.ggml_get_f32_1d(x2.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 63.0); + 
try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 51.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 9.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-3-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-3-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + const x3 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + c.ggml_set_param(ctx0, x3); + + const y = c.ggml_mul(ctx0, c.ggml_mul(ctx0, c.ggml_mul(ctx0, x1, x1), c.ggml_mul(ctx0, x2, x2)), x3); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 1.0); + _ = c.ggml_set_f32(x2, 2.0); + _ = c.ggml_set_f32(x3, 3.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)}); + std.debug.print("df/dx2 = {d:.6}\n", .{c.ggml_get_f32_1d(x2.*.grad, 0)}); + std.debug.print("df/dx3 = {d:.6}\n", .{c.ggml_get_f32_1d(x3.*.grad, 0)}); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 12.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 24.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 12.0); + try std.testing.expect(c.ggml_get_f32_1d(x3.*.grad, 0) == 4.0); + + const g1 = x1.*.grad; + const g2 = x2.*.grad; + const g3 = x3.*.grad; + + const gbb = c.ggml_build_backward(ctx0, @constCast(&gb), true); + + c.ggml_graph_reset(@constCast(&gb)); + _ = c.ggml_set_f32(g1.*.grad, 1.0); + _ = c.ggml_set_f32(g2.*.grad, 1.0); + _ = c.ggml_set_f32(g3.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gbb), n_threads); + + std.debug.print("H * [1, 1, 1] = [ {d:.6} {d:.6} {d:.6}]\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x3.*.grad, 0), + }); + + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 56.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 34.0); + try std.testing.expect(c.ggml_get_f32_1d(x3.*.grad, 0) == 12.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-4-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-4-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = c.ggml_sum(ctx0, c.ggml_mul(ctx0, x1, x2)); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 5.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x1.*.grad, 1), + c.ggml_get_f32_1d(x1.*.grad, 2), + }); + std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 1), + c.ggml_get_f32_1d(x2.*.grad, 2), + }); + + try 
std.testing.expect(c.ggml_get_f32_1d(y, 0) == 45.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 5.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1) == 5.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2) == 5.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2) == 3.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-5-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-5-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = + c.ggml_sum(ctx0, + c.ggml_add(ctx0, + c.ggml_mul(ctx0, x1, x2), + c.ggml_mul(ctx0, + c.ggml_repeat(ctx0, c.ggml_new_f32(ctx0, -2.0), x1), + c.ggml_mul(ctx0, x1, x1) + ) + ) + ); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 5.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x1.*.grad, 1), + c.ggml_get_f32_1d(x1.*.grad, 2), + }); + std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 1), + c.ggml_get_f32_1d(x2.*.grad, 2), + }); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == -9.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == -7.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1) == -7.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2) == -7.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2) == 3.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-6-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-6-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = + c.ggml_sum(ctx0, + c.ggml_sub(ctx0, + c.ggml_mul(ctx0, x1, x2), + c.ggml_mul(ctx0, + c.ggml_mul(ctx0, x1, x1), + c.ggml_repeat(ctx0, c.ggml_new_f32(ctx0, -2.0), x1) + ) + ) + ); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 5.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x1.*.grad, 1), + c.ggml_get_f32_1d(x1.*.grad, 2), + }); + std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 1), + c.ggml_get_f32_1d(x2.*.grad, 2), + }); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 
99.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 17.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1) == 17.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2) == 17.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1) == 3.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2) == 3.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-7-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-7-backward.dot"); + } + + /////////////////////////////////////////////////////////////// + + { + const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3); + + c.ggml_set_param(ctx0, x1); + c.ggml_set_param(ctx0, x2); + + const y = + c.ggml_abs(ctx0, + c.ggml_sub(ctx0, x1, x2) + ); + + const gf = c.ggml_build_forward(y); + const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false); + + _ = c.ggml_set_f32(x1, 3.0); + _ = c.ggml_set_f32(x2, 5.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x1.*.grad, 1), + c.ggml_get_f32_1d(x1.*.grad, 2), + }); + std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 1), + c.ggml_get_f32_1d(x2.*.grad, 2), + }); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 2.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == -1.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1) == -1.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2) == -1.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == 1.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1) == 1.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2) == 1.0); + + _ = c.ggml_set_f32(x1, 7.0); + _ = c.ggml_set_f32(x2, 5.0); + + c.ggml_graph_reset(@constCast(&gf)); + _ = c.ggml_set_f32(y.*.grad, 1.0); + + c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads); + + std.debug.print("y = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)}); + std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x1.*.grad, 0), + c.ggml_get_f32_1d(x1.*.grad, 1), + c.ggml_get_f32_1d(x1.*.grad, 2), + }); + std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n", + .{ + c.ggml_get_f32_1d(x2.*.grad, 0), + c.ggml_get_f32_1d(x2.*.grad, 1), + c.ggml_get_f32_1d(x2.*.grad, 2), + }); + + try std.testing.expect(c.ggml_get_f32_1d(y, 0) == 2.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0) == 1.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1) == 1.0); + try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2) == 1.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0) == -1.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1) == -1.0); + try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2) == -1.0); + + c.ggml_graph_dump_dot(&gf, null, "test1-8-forward.dot"); + c.ggml_graph_dump_dot(&gb, &gf, "test1-8-backward.dot"); + } + + _ = try std.io.getStdIn().reader().readByte(); +} diff --git a/seamless_communication/ggml/tests/test2.c b/seamless_communication/ggml/tests/test2.c new file mode 100644 index 0000000..839e3e6 --- /dev/null +++ b/seamless_communication/ggml/tests/test2.c @@ -0,0 +1,181 @@ +#define _CRT_SECURE_NO_DEPRECATE // Disables 
ridiculous "unsafe" warnigns on Windows +#include "ggml/ggml.h" + +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +bool is_close(float a, float b, float epsilon) { + return fabs(a - b) < epsilon; +} + +int main(int argc, const char ** argv) { + struct ggml_init_params params = { + .mem_size = 128*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + //struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); + //opt_params.adam.alpha = 0.01f; + + struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_LBFGS); + + // original threads: 8 + int nthreads = 8; + const char *env = getenv("GGML_NTHREADS"); + if (env != NULL) { + nthreads = atoi(env); + } + if (argc > 1) { + nthreads = atoi(argv[1]); + } + opt_params.n_threads = nthreads; + printf("test2: n_threads:%d\n", opt_params.n_threads); + + const float xi[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f , 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, }; + float yi[] = { 15.0f, 25.0f, 35.0f, 45.0f, 55.0f, 65.0f, 75.0f, 85.0f, 95.0f, 105.0f, }; + + const int n = sizeof(xi)/sizeof(xi[0]); + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * x = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n); + struct ggml_tensor * y = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n); + + for (int i = 0; i < n; i++) { + ((float *) x->data)[i] = xi[i]; + ((float *) y->data)[i] = yi[i]; + } + + { + struct ggml_tensor * t0 = ggml_new_f32(ctx0, 0.0f); + struct ggml_tensor * t1 = ggml_new_f32(ctx0, 0.0f); + + // initialize auto-diff parameters: + ggml_set_param(ctx0, t0); + ggml_set_param(ctx0, t1); + + // f = sum_i[(t0 + t1*x_i - y_i)^2]/(2n) + struct ggml_tensor * f = + ggml_div(ctx0, + ggml_sum(ctx0, + ggml_sqr(ctx0, + ggml_sub(ctx0, + ggml_add(ctx0, + ggml_mul(ctx0, x, ggml_repeat(ctx0, t1, x)), + ggml_repeat(ctx0, t0, x)), + y) + ) + ), + ggml_new_f32(ctx0, 2.0f*n)); + + enum ggml_opt_result res = ggml_opt(NULL, opt_params, f); + + printf("t0 = %f\n", ggml_get_f32_1d(t0, 0)); + printf("t1 = %f\n", ggml_get_f32_1d(t1, 0)); + + GGML_ASSERT(res == GGML_OPT_OK); + + GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0), 5.0f, 1e-3f)); + GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 10.0f, 1e-3f)); + } + + { + struct ggml_tensor * t0 = ggml_new_f32(ctx0, -1.0f); + struct ggml_tensor * t1 = ggml_new_f32(ctx0, 9.0f); + + ggml_set_param(ctx0, t0); + ggml_set_param(ctx0, t1); + + // f = 0.5*sum_i[abs(t0 + t1*x_i - y_i)]/n + struct ggml_tensor * f = + ggml_mul(ctx0, + ggml_new_f32(ctx0, 1.0/(2*n)), + ggml_sum(ctx0, + ggml_abs(ctx0, + ggml_sub(ctx0, + ggml_add(ctx0, + ggml_mul(ctx0, x, ggml_repeat(ctx0, t1, x)), + ggml_repeat(ctx0, t0, x)), + y) + ) + ) + ); + + + enum ggml_opt_result res = ggml_opt(NULL, opt_params, f); + + GGML_ASSERT(res == GGML_OPT_OK); + GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0), 5.0f, 1e-2f)); + GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 10.0f, 1e-2f)); + } + + { + struct ggml_tensor * t0 = ggml_new_f32(ctx0, 5.0f); + struct ggml_tensor * t1 = ggml_new_f32(ctx0, -4.0f); + + ggml_set_param(ctx0, t0); + ggml_set_param(ctx0, t1); + + // f = t0^2 + t1^2 + struct ggml_tensor * f = + ggml_add(ctx0, + ggml_sqr(ctx0, t0), + ggml_sqr(ctx0, t1) + ); + + enum ggml_opt_result res = ggml_opt(NULL, opt_params, f); + + GGML_ASSERT(res == GGML_OPT_OK); + GGML_ASSERT(is_close(ggml_get_f32_1d(f, 0), 0.0f, 1e-3f)); + GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0), 0.0f, 1e-3f)); + GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 0.0f, 1e-3f)); + } + + 
///////////////////////////////////////// + + { + struct ggml_tensor * t0 = ggml_new_f32(ctx0, -7.0f); + struct ggml_tensor * t1 = ggml_new_f32(ctx0, 8.0f); + + ggml_set_param(ctx0, t0); + ggml_set_param(ctx0, t1); + + // f = (t0 + 2*t1 - 7)^2 + (2*t0 + t1 - 5)^2 + struct ggml_tensor * f = + ggml_add(ctx0, + ggml_sqr(ctx0, + ggml_sub(ctx0, + ggml_add(ctx0, + t0, + ggml_mul(ctx0, t1, ggml_new_f32(ctx0, 2.0f))), + ggml_new_f32(ctx0, 7.0f) + ) + ), + ggml_sqr(ctx0, + ggml_sub(ctx0, + ggml_add(ctx0, + ggml_mul(ctx0, t0, ggml_new_f32(ctx0, 2.0f)), + t1), + ggml_new_f32(ctx0, 5.0f) + ) + ) + ); + + enum ggml_opt_result res = ggml_opt(NULL, opt_params, f); + + GGML_ASSERT(res == GGML_OPT_OK); + GGML_ASSERT(is_close(ggml_get_f32_1d(f, 0), 0.0f, 1e-3f)); + GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0), 1.0f, 1e-3f)); + GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 3.0f, 1e-3f)); + } + + ggml_free(ctx0); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test2.zig b/seamless_communication/ggml/tests/test2.zig new file mode 100644 index 0000000..667de96 --- /dev/null +++ b/seamless_communication/ggml/tests/test2.zig @@ -0,0 +1,165 @@ +const std = @import("std"); +const Thread = std.Thread; +const c = @cImport({ + @cInclude("ggml/ggml.h"); +}); + +fn is_close(a: f32, b: f32, epsilon: f32) bool { + return std.math.fabs(a - b) < epsilon; +} + +pub fn main() !void { + const params = .{ + .mem_size = 128*1024*1024, + .mem_buffer = null, + .no_alloc = false, + }; + + var opt_params = c.ggml_opt_default_params(c.GGML_OPT_LBFGS); + + const nthreads = try Thread.getCpuCount(); + opt_params.n_threads = @intCast(nthreads); + std.debug.print("test2: n_threads:{}\n", .{opt_params.n_threads}); + + const xi = [_]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 }; + const yi = [_]f32{ 15.0, 25.0, 35.0, 45.0, 55.0, 65.0, 75.0, 85.0, 95.0, 105.0 }; + + const n = xi.len; + + const ctx0 = c.ggml_init(params); + defer c.ggml_free(ctx0); + + const x = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, n); + const y = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, n); + + for (0..n) |i| { + const x_data_pointer: [*]f32 = @ptrCast(@alignCast(x.*.data)); + x_data_pointer[i] = xi[i]; + const y_data_pointer: [*]f32 = @ptrCast(@alignCast(y.*.data)); + y_data_pointer[i] = yi[i]; + } + + { + const t0 = c.ggml_new_f32(ctx0, 0.0); + const t1 = c.ggml_new_f32(ctx0, 0.0); + + // initialize auto-diff parameters: + _ = c.ggml_set_param(ctx0, t0); + _ = c.ggml_set_param(ctx0, t1); + + // f = sum_i[(t0 + t1*x_i - y_i)^2]/(2n) + const f = + c.ggml_div(ctx0, + c.ggml_sum(ctx0, + c.ggml_sqr(ctx0, + c.ggml_sub(ctx0, + c.ggml_add(ctx0, + c.ggml_mul(ctx0, x, c.ggml_repeat(ctx0, t1, x)), + c.ggml_repeat(ctx0, t0, x)), + y) + ) + ), + c.ggml_new_f32(ctx0, @as(f32, 2.0)*n)); + + const res = c.ggml_opt(null, opt_params, f); + + std.debug.print("t0 = {d:.6}\n", .{c.ggml_get_f32_1d(t0, 0)}); + std.debug.print("t1 = {d:.6}\n", .{c.ggml_get_f32_1d(t1, 0)}); + + try std.testing.expect(res == c.GGML_OPT_OK); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0), 5.0, 1e-3)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 10.0, 1e-3)); + } + + { + const t0 = c.ggml_new_f32(ctx0, -1.0); + const t1 = c.ggml_new_f32(ctx0, 9.0); + + _ = c.ggml_set_param(ctx0, t0); + _ = c.ggml_set_param(ctx0, t1); + + // f = 0.5*sum_i[abs(t0 + t1*x_i - y_i)]/n + const f = + c.ggml_mul(ctx0, + c.ggml_new_f32(ctx0, @as(f32, 1.0)/(2*n)), + c.ggml_sum(ctx0, + c.ggml_abs(ctx0, + c.ggml_sub(ctx0, + c.ggml_add(ctx0, + c.ggml_mul(ctx0, x, c.ggml_repeat(ctx0, 
t1, x)), + c.ggml_repeat(ctx0, t0, x)), + y) + ) + ) + ); + + + const res = c.ggml_opt(null, opt_params, f); + + try std.testing.expect(res == c.GGML_OPT_OK); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0), 5.0, 1e-2)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 10.0, 1e-2)); + } + + { + const t0 = c.ggml_new_f32(ctx0, 5.0); + const t1 = c.ggml_new_f32(ctx0, -4.0); + + _ = c.ggml_set_param(ctx0, t0); + _ = c.ggml_set_param(ctx0, t1); + + // f = t0^2 + t1^2 + const f = + c.ggml_add(ctx0, + c.ggml_sqr(ctx0, t0), + c.ggml_sqr(ctx0, t1) + ); + + const res = c.ggml_opt(null, opt_params, f); + + try std.testing.expect(res == c.GGML_OPT_OK); + try std.testing.expect(is_close(c.ggml_get_f32_1d(f, 0), 0.0, 1e-3)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0), 0.0, 1e-3)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 0.0, 1e-3)); + } + + ///////////////////////////////////////// + + { + const t0 = c.ggml_new_f32(ctx0, -7.0); + const t1 = c.ggml_new_f32(ctx0, 8.0); + + _ = c.ggml_set_param(ctx0, t0); + _ = c.ggml_set_param(ctx0, t1); + + // f = (t0 + 2*t1 - 7)^2 + (2*t0 + t1 - 5)^2 + const f = + c.ggml_add(ctx0, + c.ggml_sqr(ctx0, + c.ggml_sub(ctx0, + c.ggml_add(ctx0, + t0, + c.ggml_mul(ctx0, t1, c.ggml_new_f32(ctx0, 2.0))), + c.ggml_new_f32(ctx0, 7.0) + ) + ), + c.ggml_sqr(ctx0, + c.ggml_sub(ctx0, + c.ggml_add(ctx0, + c.ggml_mul(ctx0, t0, c.ggml_new_f32(ctx0, 2.0)), + t1), + c.ggml_new_f32(ctx0, 5.0) + ) + ) + ); + + const res = c.ggml_opt(null, opt_params, f); + + try std.testing.expect(res == c.GGML_OPT_OK); + try std.testing.expect(is_close(c.ggml_get_f32_1d(f, 0), 0.0, 1e-3)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0), 1.0, 1e-3)); + try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 3.0, 1e-3)); + } + + _ = try std.io.getStdIn().reader().readByte(); +} diff --git a/seamless_communication/ggml/tests/test3.c b/seamless_communication/ggml/tests/test3.c new file mode 100644 index 0000000..b92d623 --- /dev/null +++ b/seamless_communication/ggml/tests/test3.c @@ -0,0 +1,95 @@ +#include "ggml/ggml.h" + +#include +#include +#include + +bool is_close(float a, float b, float epsilon) { + return fabs(a - b) < epsilon; +} + +int main(int argc, const char ** argv) { + struct ggml_init_params params = { + .mem_size = 1024*1024*1024, + .mem_buffer = NULL, + .no_alloc = false, + }; + + //struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); + struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_LBFGS); + + opt_params.n_threads = (argc > 1) ? atoi(argv[1]) : 8; + + const int NP = 1 << 12; + const int NF = 1 << 8; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * F = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, NF, NP); + struct ggml_tensor * l = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, NP); + + // regularization weight + struct ggml_tensor * lambda = ggml_new_f32(ctx0, 1e-5f); + + srand(0); + + for (int j = 0; j < NP; j++) { + const float ll = j < NP/2 ? 1.0f : -1.0f; + ((float *)l->data)[j] = ll; + + for (int i = 0; i < NF; i++) { + ((float *)F->data)[j*NF + i] = ((ll > 0 && i < NF/2 ? 1.0f : ll < 0 && i >= NF/2 ? 
1.0f : 0.0f) + ((float)rand()/(float)RAND_MAX - 0.5f)*0.1f)/(0.5f*NF); + } + } + + { + // initial guess + struct ggml_tensor * x = ggml_set_f32(ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, NF), 0.0f); + + ggml_set_param(ctx0, x); + + // f = sum[(fj*x - l)^2]/n + lambda*|x^2| + struct ggml_tensor * f = + ggml_add(ctx0, + ggml_div(ctx0, + ggml_sum(ctx0, + ggml_sqr(ctx0, + ggml_sub(ctx0, + ggml_mul_mat(ctx0, F, x), + l) + ) + ), + ggml_new_f32(ctx0, (float)NP) + ), + ggml_mul(ctx0, + ggml_sum(ctx0, ggml_sqr(ctx0, x)), + lambda) + ); + + enum ggml_opt_result res = ggml_opt(NULL, opt_params, f); + + GGML_ASSERT(res == GGML_OPT_OK); + + // print results + for (int i = 0; i < 16; i++) { + printf("x[%3d] = %g\n", i, ((float *)x->data)[i]); + } + printf("...\n"); + for (int i = NF - 16; i < NF; i++) { + printf("x[%3d] = %g\n", i, ((float *)x->data)[i]); + } + printf("\n"); + + for (int i = 0; i < NF; ++i) { + if (i < NF/2) { + GGML_ASSERT(is_close(((float *)x->data)[i], 1.0f, 1e-2f)); + } else { + GGML_ASSERT(is_close(((float *)x->data)[i], -1.0f, 1e-2f)); + } + } + } + + ggml_free(ctx0); + + return 0; +} diff --git a/seamless_communication/ggml/tests/test3.zig b/seamless_communication/ggml/tests/test3.zig new file mode 100644 index 0000000..d676961 --- /dev/null +++ b/seamless_communication/ggml/tests/test3.zig @@ -0,0 +1,102 @@ +const std = @import("std"); +const Thread = std.Thread; +const c = @cImport({ + @cInclude("stdlib.h"); + @cInclude("ggml/ggml.h"); +}); + +fn is_close(a: f32, b: f32, epsilon: f32) bool { + return std.math.fabs(a - b) < epsilon; +} + +pub fn main() !void { + const params = .{ + .mem_size = 128*1024*1024, + .mem_buffer = null, + .no_alloc = false, + }; + + var opt_params = c.ggml_opt_default_params(c.GGML_OPT_LBFGS); + + const nthreads = try Thread.getCpuCount(); + opt_params.n_threads = @intCast(nthreads); + + const NP = 1 << 12; + const NF = 1 << 8; + + const ctx0 = c.ggml_init(params); + defer c.ggml_free(ctx0); + + const F = c.ggml_new_tensor_2d(ctx0, c.GGML_TYPE_F32, NF, NP); + const l = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, NP); + + // regularization weight + const lambda = c.ggml_new_f32(ctx0, 1e-5); + + c.srand(0); + + const l_data_pointer: [*]f32 = @ptrCast(@alignCast(l.*.data)); + const f_data_pointer: [*]f32 = @ptrCast(@alignCast(F.*.data)); + for (0..NP) |j| { + const ll = if (j < NP/2) @as(f32, 1.0) else @as(f32, -1.0); + l_data_pointer[j] = ll; + + for (0..NF) |i| { + const c_rand: f32 = @floatFromInt(c.rand()); + f_data_pointer[j*NF + i] = + ((if (ll > 0 and i < NF/2) @as(f32, 1.0) else + if (ll < 0 and i >= NF/2) @as(f32, 1.0) else @as(f32, 0.0)) + + (c_rand/c.RAND_MAX - 0.5) * 0.1) / (0.5 * NF); + } + } + + { + // initial guess + const x = c.ggml_set_f32(c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, NF), 0.0); + + c.ggml_set_param(ctx0, x); + + // f = sum[(fj*x - l)^2]/n + lambda*|x^2| + const f = + c.ggml_add(ctx0, + c.ggml_div(ctx0, + c.ggml_sum(ctx0, + c.ggml_sqr(ctx0, + c.ggml_sub(ctx0, + c.ggml_mul_mat(ctx0, F, x), + l) + ) + ), + c.ggml_new_f32(ctx0, @as(f32, NP)) + ), + c.ggml_mul(ctx0, + c.ggml_sum(ctx0, c.ggml_sqr(ctx0, x)), + lambda) + ); + + const res = c.ggml_opt(null, opt_params, f); + + try std.testing.expect(res == c.GGML_OPT_OK); + + const x_data_pointer: [*]f32 = @ptrCast(@alignCast(x.*.data)); + // print results + for (0..16) |i| { + std.debug.print("x[{d:3}] = {d:.6}\n", .{i, x_data_pointer[i]}); + } + std.debug.print("...\n", .{}); + for (NF - 16..NF) |i| { + std.debug.print("x[{d:3}] = {d:.6}\n", .{i, x_data_pointer[i]}); + } + 
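+    // By construction of F above, positive examples activate the first NF/2
+    // features and negative examples the remaining NF/2, so the regularized
+    // least-squares fit should recover weights near +1 for the first half of x
+    // and near -1 for the second half, as the assertions below verify.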
std.debug.print("\n", .{}); + + for (0..NF) |i| { + if (i < NF/2) { + try std.testing.expect(is_close(x_data_pointer[i], 1.0, 1e-2)); + } else { + try std.testing.expect(is_close(x_data_pointer[i], -1.0, 1e-2)); + } + } + } + + _ = try std.io.getStdIn().reader().readByte(); +} diff --git a/seamless_communication/ggml/third_party_ggml.py b/seamless_communication/ggml/third_party_ggml.py new file mode 100644 index 0000000..56ef873 --- /dev/null +++ b/seamless_communication/ggml/third_party_ggml.py @@ -0,0 +1,8056 @@ +"""This module is the core of the ggml-python library, it exposes a low-level [ctypes](https://docs.python.org/3/library/ctypes.html)-based interface for ggml. + +Structures and functions in the `ggml.ggml` module map directly to the original ggml C library and +they operate at a fairly low level. +No additional runtime checks checks are performed nor is memory management handled automatically. +You've been warned :). + +With that in mind here are some useful things to keep in mind + +- Functions accept both ctypes types (c_int, c_bool, c_float, etc.) and Python types (int, bool, float, etc.) as parameters. +- Functions return Python types for simple values (int, bool, float, etc.) and ctypes types for complex values ([ggml_context_p][ggml.ggml_context_p], [ggml_tensor_p][ggml.ggml_tensor_p], etc.). +- Memory management is the responsibility of the user. The user must call [ggml.ggml_free][] on the context after calling [ggml.ggml_init][]. + +Example + +```python +import ggml +import ctypes + +# Allocate a new context with 16 MB of memory +params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None) +ctx = ggml.ggml_init(params=params) + +# Instantiate tensors +x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) +a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) +b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) + +# Use ggml operations to build a computational graph +x2 = ggml.ggml_mul(ctx, x, x) +f = ggml.ggml_add(ctx, ggml.ggml_mul(ctx, a, x2), b) + +gf = ggml.ggml_build_forward(f) + +# Set the input values +ggml.ggml_set_f32(x, 2.0) +ggml.ggml_set_f32(a, 3.0) +ggml.ggml_set_f32(b, 4.0) + +# Compute the graph +ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1) + +# Get the output value +output = ggml.ggml_get_f32_1d(f, 0) +assert output == 16.0 + +# Free the context +ggml.ggml_free(ctx) +``` + +""" +import ctypes +import importlib.resources +import os +import pathlib +import sys +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Type, Union + +import numpy as np +from typing_extensions import TypeAlias + +NULL: ctypes.c_void_p = None # ignore: type +GGML_MEM_ALIGN = 16 + + +# Load the library +def load_shared_library(base_path: Path, lib_base_name: str): + # Construct the paths to the possible shared library names + # Searching for the library in the current directory under the name "libggml" (default name + # for ggml) and "ggml" (default name for this repo) + lib_names: List[str] = [ + f"lib{lib_base_name}.so", + f"lib{lib_base_name}.dylib", + f"{lib_base_name}.dll", + ] + + path = None + cdll_args = dict() # type: ignore + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(base_path)) + cdll_args["winmode"] = 0 + + for lib_name in lib_names: + # Try to load the shared library, handling potential errors + path = base_path / lib_name + if not path.exists(): + continue + try: + 
return ctypes.CDLL(str(path), **cdll_args) + except Exception as e: + raise RuntimeError(f"Failed to load shared library '{path}': {e}") + + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found in {base_path}" + ) + + +base_path = Path(__file__).parent.resolve() / "build/examples/unity" +lib_base_name = "fairseq2_cpp" +lib = load_shared_library(base_path, lib_base_name) + +##################################################### +# GGML Utility Types +##################################################### + +CFloatArray: TypeAlias = "ctypes.Array[ctypes.c_float]" +CInt64Array: TypeAlias = "ctypes.Array[ctypes.c_int64]" +CIntPointer: TypeAlias = "ctypes._Pointer[ctypes.c_int]" # type: ignore +CCharPointer: TypeAlias = "ctypes._Pointer[ctypes.c_char]" # type: ignore + + +##################################################### +# source: ggml.h +# GGML API +##################################################### + + +# #define GGML_FILE_MAGIC 0x67676d6c // "ggml" +GGML_FILE_MAGIC = int("0x67676d6c", 16) +# #define GGML_FILE_VERSION 1 +GGML_FILE_VERSION = 1 +# #define GGML_QNT_VERSION 2 // bump this on quantization format changes +GGML_QNT_VERSION = 2 +# #define GGML_QNT_VERSION_FACTOR 1000 // do not change this +GGML_QNT_VERSION_FACTOR = 1000 +# #define GGML_MAX_DIMS 4 +GGML_MAX_DIMS = 4 +# #define GGML_MAX_NODES 4096 +GGML_MAX_NODES = 4096 +# #define GGML_MAX_PARAMS 256 +GGML_MAX_PARAMS = 256 +# #define GGML_MAX_CONTEXTS 64 +GGML_MAX_CONTEXTS = 64 +# #define GGML_MAX_SRC 6 +GGML_MAX_SRC = 6 +# #define GGML_MAX_NAME 64 +GGML_MAX_NAME = 64 +# #define GGML_MAX_OP_PARAMS 32 +GGML_MAX_OP_PARAMS = 32 +# #define GGML_DEFAULT_N_THREADS 4 +GGML_DEFAULT_N_THREADS = 4 + +# #if UINTPTR_MAX == 0XFFFFFFFF +# #define GGML_MEMALIGN 4 +# #else +# # define GGML_MEMALIGN 16 +# #endif +GGML_MEMALIGN = ( + 16 if ctypes.sizeof(ctypes.c_void_p) == 4 else 32 +) # FIXME: Check if this is correct + +# #define GGML_EXIT_SUCCESS 0 +GGML_EXIT_SUCCESS = 0 +# #define GGML_EXIT_ABORTED 1 +GGML_EXIT_ABORTED = 1 + +# #define GGUF_MAGIC 0x46554747 // "GGUF" +GGUF_MAGIC = int("0x46554747", 16) +# #define GGUF_VERSION 2 +GGUF_VERSION = 2 + +# #define GGUF_DEFAULT_ALIGNMENT 32 +GGUF_DEFAULT_ALIGNMENT = 32 + +# TODO: Check if this is correct +# typedef uint16_t ggml_fp16_t; +ggml_fp16_t = ctypes.c_uint16 + +CFP16Array: TypeAlias = "ctypes.Array[ggml_fp16_t]" + + +# GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); +def ggml_fp16_to_fp32(x: ggml_fp16_t) -> float: + return lib.ggml_fp16_to_fp32(x) + + +lib.ggml_fp16_to_fp32.argtypes = [ggml_fp16_t] +lib.ggml_fp16_to_fp32.restype = ctypes.c_float + + +# GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); +def ggml_fp32_to_fp16(x: ctypes.c_float) -> int: + return lib.ggml_fp32_to_fp16(x) + + +lib.ggml_fp32_to_fp16.argtypes = [ctypes.c_float] +lib.ggml_fp32_to_fp16.restype = ggml_fp16_t + + +# GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n); +def ggml_fp16_to_fp32_row( + x: CFP16Array, + y: CFloatArray, + n: Union[ctypes.c_int, int], +) -> None: + return lib.ggml_fp16_to_fp32_row(x, y, n) + + +lib.ggml_fp16_to_fp32_row.argtypes = [ + ctypes.POINTER(ggml_fp16_t), + ctypes.POINTER(ctypes.c_float), + ctypes.c_int, +] +lib.ggml_fp16_to_fp32_row.restype = None + + +# GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n); +def ggml_fp32_to_fp16_row( + x: CFloatArray, + y: CFP16Array, + n: Union[ctypes.c_int, int], +) -> None: + return lib.ggml_fp32_to_fp16_row(x, y, n) + + +lib.ggml_fp32_to_fp16_row.argtypes = [ 
+ ctypes.POINTER(ctypes.c_float), + ctypes.POINTER(ggml_fp16_t), + ctypes.c_int, +] +lib.ggml_fp32_to_fp16_row.restype = None + + +# struct ggml_context; +ggml_context_p = ctypes.c_void_p +"""Opaque pointer to a ggml_context. + +ggml_context structs are not accessed directly instead they must be created using [ggml_init](ggml.ggml_init) and freed using [ggml_free](ggml.ggml_free).""" + + +# enum ggml_type { +# GGML_TYPE_F32 = 0, +# GGML_TYPE_F16 = 1, +# GGML_TYPE_Q4_0 = 2, +# GGML_TYPE_Q4_1 = 3, +# // GGML_TYPE_Q4_2 = 4, support has been removed +# // GGML_TYPE_Q4_3 (5) support has been removed +# GGML_TYPE_Q5_0 = 6, +# GGML_TYPE_Q5_1 = 7, +# GGML_TYPE_Q8_0 = 8, +# GGML_TYPE_Q8_1 = 9, +# GGML_TYPE_Q2_K = 10, +# GGML_TYPE_Q3_K = 11, +# GGML_TYPE_Q4_K = 12, +# GGML_TYPE_Q5_K = 13, +# GGML_TYPE_Q6_K = 14, +# GGML_TYPE_Q8_K = 15, +# GGML_TYPE_I8, +# GGML_TYPE_I16, +# GGML_TYPE_I32, +# GGML_TYPE_COUNT, +# }; +GGML_TYPE_F32 = 0 +GGML_TYPE_F16 = 1 +GGML_TYPE_Q4_0 = 2 +GGML_TYPE_Q4_1 = 3 +GGML_TYPE_Q5_0 = 6 +GGML_TYPE_Q5_1 = 7 +GGML_TYPE_Q8_0 = 8 +GGML_TYPE_Q8_1 = 9 +GGML_TYPE_Q2_K = 10 +GGML_TYPE_Q3_K = 11 +GGML_TYPE_Q4_K = 12 +GGML_TYPE_Q5_K = 13 +GGML_TYPE_Q6_K = 14 +GGML_TYPE_Q8_K = 15 +GGML_TYPE_I8 = 16 +GGML_TYPE_I16 = 17 +GGML_TYPE_I32 = 18 +GGML_TYPE_COUNT = 19 + + +# enum ggml_backend { +# GGML_BACKEND_CPU = 0, +# GGML_BACKEND_GPU = 10, +# GGML_BACKEND_GPU_SPLIT = 20, +# }; +GGML_BACKEND_CPU = 0 +GGML_BACKEND_GPU = 10 +GGML_BACKEND_GPU_SPLIT = 20 + + +# // model file types +# enum ggml_ftype { +# GGML_FTYPE_UNKNOWN = -1, +# GGML_FTYPE_ALL_F32 = 0, +# GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors +# GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors +# GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors +# GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors +# GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors +# }; +GGML_FTYPE_UNKNOWN = -1 +GGML_FTYPE_ALL_F32 = 0 +GGML_FTYPE_MOSTLY_F16 = 1 +GGML_FTYPE_MOSTLY_Q4_0 = 2 +GGML_FTYPE_MOSTLY_Q4_1 = 3 +GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4 +GGML_FTYPE_MOSTLY_Q8_0 = 7 +GGML_FTYPE_MOSTLY_Q5_0 = 8 +GGML_FTYPE_MOSTLY_Q5_1 = 9 +GGML_FTYPE_MOSTLY_Q2_K = 10 +GGML_FTYPE_MOSTLY_Q3_K = 11 +GGML_FTYPE_MOSTLY_Q4_K = 12 +GGML_FTYPE_MOSTLY_Q5_K = 13 +GGML_FTYPE_MOSTLY_Q6_K = 14 + + +# // available tensor operations: +# enum ggml_op { +# GGML_OP_NONE = 0, + +# GGML_OP_DUP, +# GGML_OP_ADD, +# GGML_OP_ADD1, +# GGML_OP_ACC, +# GGML_OP_SUB, +# GGML_OP_MUL, +# GGML_OP_DIV, +# GGML_OP_SQR, +# GGML_OP_SQRT, +# GGML_OP_LOG, +# GGML_OP_SUM, +# GGML_OP_SUM_ROWS, +# GGML_OP_MEAN, +# GGML_OP_ARGMAX, +# GGML_OP_REPEAT, +# GGML_OP_REPEAT_BACK, +# GGML_OP_CONCAT, +# GGML_OP_SILU_BACK, +# GGML_OP_NORM, // normalize +# GGML_OP_RMS_NORM, +# GGML_OP_RMS_NORM_BACK, +# GGML_OP_GROUP_NORM, + +# GGML_OP_MUL_MAT, +# GGML_OP_OUT_PROD, + +# GGML_OP_SCALE, +# GGML_OP_SET, +# GGML_OP_CPY, +# GGML_OP_CONT, +# GGML_OP_RESHAPE, +# GGML_OP_VIEW, +# GGML_OP_PERMUTE, +# GGML_OP_TRANSPOSE, +# GGML_OP_GET_ROWS, +# GGML_OP_GET_ROWS_BACK, +# GGML_OP_DIAG, +# GGML_OP_DIAG_MASK_INF, +# GGML_OP_DIAG_MASK_ZERO, +# GGML_OP_SOFT_MAX, +# GGML_OP_SOFT_MAX_BACK, +# GGML_OP_ROPE, +# GGML_OP_ROPE_BACK, +# 
GGML_OP_ALIBI, +# GGML_OP_CLAMP, +# GGML_OP_CONV_1D, +# GGML_OP_CONV_2D, +# GGML_OP_CONV_TRANSPOSE_2D, +# GGML_OP_POOL_1D, +# GGML_OP_POOL_2D, + +# GGML_OP_UPSCALE, // nearest interpolate + +# GGML_OP_FLASH_ATTN, +# GGML_OP_FLASH_FF, +# GGML_OP_FLASH_ATTN_BACK, +# GGML_OP_WIN_PART, +# GGML_OP_WIN_UNPART, +# GGML_OP_GET_REL_POS, +# GGML_OP_ADD_REL_POS, + +# GGML_OP_UNARY, + +# GGML_OP_MAP_UNARY, +# GGML_OP_MAP_BINARY, + +# GGML_OP_MAP_CUSTOM1_F32, +# GGML_OP_MAP_CUSTOM2_F32, +# GGML_OP_MAP_CUSTOM3_F32, + +# GGML_OP_MAP_CUSTOM1, +# GGML_OP_MAP_CUSTOM2, +# GGML_OP_MAP_CUSTOM3, + +# GGML_OP_CROSS_ENTROPY_LOSS, +# GGML_OP_CROSS_ENTROPY_LOSS_BACK, + +# GGML_OP_COUNT, +# }; +GGML_OP_NONE = 0 +GGML_OP_DUP = 1 +GGML_OP_ADD = 2 +GGML_OP_ADD1 = 3 +GGML_OP_ACC = 4 +GGML_OP_SUB = 5 +GGML_OP_MUL = 6 +GGML_OP_DIV = 7 +GGML_OP_SQR = 8 +GGML_OP_SQRT = 9 +GGML_OP_LOG = 10 +GGML_OP_SUM = 11 +GGML_OP_SUM_ROWS = 12 +GGML_OP_MEAN = 13 +GGML_OP_ARGMAX = 14 +GGML_OP_REPEAT = 15 +GGML_OP_REPEAT_BACK = 16 +GGML_OP_CONCAT = 17 +GGML_OP_SILU_BACK = 18 +GGML_OP_NORM = 19 +GGML_OP_RMS_NORM = 20 +GGML_OP_RMS_NORM_BACK = 21 +GGML_OP_GROUP_NORM = 22 +GGML_OP_MUL_MAT = 23 +GGML_OP_OUT_PROD = 24 +GGML_OP_SCALE = 25 +GGML_OP_SET = 26 +GGML_OP_CPY = 27 +GGML_OP_CONT = 28 +GGML_OP_RESHAPE = 29 +GGML_OP_VIEW = 30 +GGML_OP_PERMUTE = 31 +GGML_OP_TRANSPOSE = 32 +GGML_OP_GET_ROWS = 33 +GGML_OP_GET_ROWS_BACK = 34 +GGML_OP_DIAG = 35 +GGML_OP_DIAG_MASK_INF = 36 +GGML_OP_DIAG_MASK_ZERO = 37 +GGML_OP_SOFT_MAX = 38 +GGML_OP_SOFT_MAX_BACK = 39 +GGML_OP_ROPE = 40 +GGML_OP_ROPE_BACK = 41 +GGML_OP_ALIBI = 42 +GGML_OP_CLAMP = 43 +GGML_OP_CONV_1D = 44 +GGML_OP_CONV_2D = 45 +GGML_OP_CONV_TRANSPOSE_2D = 46 +GGML_OP_POOL_1D = 47 +GGML_OP_POOL_2D = 48 +GGML_OP_UPSCALE = 49 +GGML_OP_FLASH_ATTN = 50 +GGML_OP_FLASH_FF = 51 +GGML_OP_FLASH_ATTN_BACK = 52 +GGML_OP_WIN_PART = 53 +GGML_OP_WIN_UNPART = 54 +GGML_OP_GET_REL_POS = 55 +GGML_OP_ADD_REL_POS = 56 +GGML_OP_UNARY = 57 +GGML_OP_MAP_UNARY = 58 +GGML_OP_MAP_BINARY = 59 +GGML_OP_MAP_CUSTOM1_F32 = 60 +GGML_OP_MAP_CUSTOM2_F32 = 61 +GGML_OP_MAP_CUSTOM3_F32 = 62 +GGML_OP_MAP_CUSTOM1 = 63 +GGML_OP_MAP_CUSTOM2 = 64 +GGML_OP_MAP_CUSTOM3 = 65 +GGML_OP_CROSS_ENTROPY_LOSS = 66 +GGML_OP_CROSS_ENTROPY_LOSS_BACK = 67 +GGML_OP_COUNT = 68 + + +# enum ggml_unary_op { +# GGML_UNARY_OP_ABS, +# GGML_UNARY_OP_SGN, +# GGML_UNARY_OP_NEG, +# GGML_UNARY_OP_STEP, +# GGML_UNARY_OP_TANH, +# GGML_UNARY_OP_ELU, +# GGML_UNARY_OP_RELU, +# GGML_UNARY_OP_GELU, +# GGML_UNARY_OP_GELU_QUICK, +# GGML_UNARY_OP_SILU, +# }; +GGML_UNARY_OP_ABS = 0 +GGML_UNARY_OP_SGN = 1 +GGML_UNARY_OP_NEG = 2 +GGML_UNARY_OP_STEP = 3 +GGML_UNARY_OP_TANH = 4 +GGML_UNARY_OP_ELU = 5 +GGML_UNARY_OP_RELU = 6 +GGML_UNARY_OP_GELU = 7 +GGML_UNARY_OP_GELU_QUICK = 8 +GGML_UNARY_OP_SILU = 9 + +# enum ggml_object_type { +# GGML_OBJECT_TENSOR, +# GGML_OBJECT_GRAPH, +# GGML_OBJECT_WORK_BUFFER +# }; +GGML_OBJECT_TENSOR = 0 +GGML_OBJECT_GRAPH = 1 +GGML_OBJECT_WORK_BUFFER = 2 + +# // ggml object +# struct ggml_object { +# size_t offs; +# size_t size; + +# struct ggml_object * next; + +# enum ggml_object_type type; + + +# char padding[4]; +# }; +class ggml_object(ctypes.Structure): + pass + + +ggml_object._fields_ = [ + ("offs", ctypes.c_size_t), + ("size", ctypes.c_size_t), + ("next", ctypes.POINTER(ggml_object)), + ("type", ctypes.c_int), + ("padding", ctypes.c_char * 4), +] + +ggml_object_p: TypeAlias = "ctypes._Pointer[ggml_object]" # type: ignore + +GGML_OBJECT_SIZE = ctypes.sizeof(ggml_object) + + +# // n-dimensional tensor +# struct ggml_tensor { +# enum ggml_type type; 
+# enum ggml_backend backend; + +# int n_dims; +# int64_t ne[GGML_MAX_DIMS]; // number of elements +# size_t nb[GGML_MAX_DIMS]; // stride in bytes: +# // nb[0] = sizeof(type) +# // nb[1] = nb[0] * ne[0] + padding +# // nb[i] = nb[i-1] * ne[i-1] + +# // compute data +# enum ggml_op op; + +# // op params - allocated as int32_t for alignment +# int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; + +# bool is_param; + +# struct ggml_tensor * grad; +# struct ggml_tensor * src[GGML_MAX_SRC]; + +# // performance +# int perf_runs; +# int64_t perf_cycles; +# int64_t perf_time_us; + +# struct ggml_tensor * view_src; +# size_t view_offs; + +# void * data; + +# char name[GGML_MAX_NAME]; + +# void * extra; // extra things e.g. for ggml-cuda.cu + + +# char padding[4]; +# }; +class ggml_tensor(ctypes.Structure): + """n-dimensional tensor + + Attributes: + type (int): ggml_type + backend (int): ggml_backend + n_dims (int): number of dimensions + ne (ctypes.Array[ctypes.c_int64]): number of elements in each dimension + nb (ctypes.Array[ctypes.c_size_t]): stride in bytes for each dimension + op (int): ggml operation + op_params (ctypes.Array[ctypes.c_int32]): `GGML_MAX_OP_PARAMS`-length array of operation parameters + is_param (bool): is this a parameter tensor + grad (ggml_tensor_p): reference to gradient tensor + src (ctypes.Array[ggml_tensor_p]): `GGML_MAX_SRC`-length array of source tensors + perf_runs (int): number of performance runs + perf_cycles (int): number of cycles + perf_time_us (int): time in microseconds + view_src (ggml_tensor_p): pointer to tensor if this tensor is a view, None if the tensor is not a view + view_offs (ctypes.c_size_t): offset into the data pointer of the view tensor + data (ctypes.c_void_p): reference to raw tensor data + name (bytes): name of tensor + extra (ctypes.c_void_p): extra data (e.g. 
for CUDA) + """ + + pass + + +ggml_tensor._fields_ = [ + ("type", ctypes.c_int), + ("backend", ctypes.c_int), + ("n_dims", ctypes.c_int), + ("ne", ctypes.c_int64 * GGML_MAX_DIMS), + ("nb", ctypes.c_size_t * GGML_MAX_DIMS), + ("op", ctypes.c_int), + ( + "op_params", + ctypes.c_int32 * (GGML_MAX_OP_PARAMS // ctypes.sizeof(ctypes.c_int32)), + ), + ("is_param", ctypes.c_bool), + ("grad", ctypes.POINTER(ggml_tensor)), + ("src", ctypes.POINTER(ggml_tensor) * GGML_MAX_SRC), + ("perf_runs", ctypes.c_int), + ("perf_cycles", ctypes.c_int64), + ("perf_time_us", ctypes.c_int64), + ("view_src", ctypes.POINTER(ggml_tensor)), + ("view_offs", ctypes.c_size_t), + ("data", ctypes.c_void_p), + ("name", ctypes.c_char * GGML_MAX_NAME), + ("extra", ctypes.c_void_p), + ("padding", ctypes.c_char * 4), +] + +GGML_TENSOR_SIZE = ctypes.sizeof(ggml_tensor) + +ggml_tensor_p: TypeAlias = "ctypes._Pointer[ggml_tensor]" # type: ignore +"""ctypes pointer to a [ggml_tensor][ggml.ggml_tensor] + +Can be dereferenced to a [ggml_tensor][ggml.ggml_tensor] object using +the `.contents` attribute.""" + +abort_callback_t = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p) + +# // the compute plan that needs to be prepared for ggml_graph_compute() +# // since https://github.com/ggerganov/ggml/issues/287 +# struct ggml_cplan { +# size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` +# uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + +# int n_threads; + +# // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes +# int n_tasks[GGML_MAX_NODES]; + + +# // abort ggml_graph_compute when true +# bool (*abort_callback)(void * data); +# void * abort_callback_data; +# }; +class ggml_cplan(ctypes.Structure): + """Compute plan for a ggml computation graph + + Attributes: + work_size (int): size of work buffer + work_data (ctypes.POINTER(ctypes.c_uint8)): work buffer + n_threads (int): number of threads to use when computing the graph using [ggml_graph_compute][ggml.ggml_graph_compute] + n_tasks (ctypes.Array[ctypes.c_int]): `n_tasks` of nodes, 1:1 mapping to cgraph nodes + abort_callback (abort_callback_t): abort callback + abort_callback_data (ctypes.c_void_p): abort callback data + """ + + _fields_ = [ + ("work_size", ctypes.c_size_t), + ("work_data", ctypes.POINTER(ctypes.c_uint8)), + ("n_threads", ctypes.c_int), + ("n_tasks", ctypes.c_int * GGML_MAX_NODES), + ( + "abort_callback", + abort_callback_t, + ), + ("abort_callback_data", ctypes.c_void_p), + ] + + +GGML_CPLAN_SIZE = ctypes.sizeof(ggml_cplan) + +ggml_cplan_p: TypeAlias = "ctypes._Pointer[ggml_cplan]" # type: ignore +"""ctypes pointer to a [ggml_cplan][ggml.ggml_cplan] + +Can be dereferenced to a [ggml_cplan][ggml.ggml_cplan] object using +the `.contents` attribute.""" + +# // next prime after GGML_MAX_NODES +# // #define GGML_GRAPH_HASHTABLE_SIZE 4099 +# // next prime after GGML_MAX_NODES * 2 (nodes + leafs) +# #define GGML_GRAPH_HASHTABLE_SIZE 8273 +GGML_GRAPH_HASHTABLE_SIZE = 8273 + +# // computation graph +# struct ggml_cgraph { +# int n_nodes; +# int n_leafs; + +# struct ggml_tensor * nodes[GGML_MAX_NODES]; +# struct ggml_tensor * grads[GGML_MAX_NODES]; +# struct ggml_tensor * leafs[GGML_MAX_NODES]; + +# void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE]; + + +# // performance +# int perf_runs; +# int64_t perf_cycles; +# int64_t perf_time_us; +# }; +class ggml_cgraph(ctypes.Structure): + """ggml computation graph + + Attributes: + n_nodes (int): number of nodes + n_leafs (int): number of leafs + 
nodes (ctypes.Array[ggml_tensor_p]): `n_nodes`-length array of compute tensors + grads (ctypes.Array[ggml_tensor_p]): `n_nodes`-length array of gradient tensors + leafs (ctypes.Array[ggml_tensor_p]): `n_leafs`-length array of parameter tensors + visited_hash_table (ctypes.Array[ctypes.c_void_p]): `GGML_GRAPH_HASHTABLE_SIZE`-length array of visited nodes + perf_runs (int): number of runs + perf_cycles (int): number of cycles + perf_time_us (int): computation time in microseconds""" + + _fields_ = [ + ("n_nodes", ctypes.c_int), + ("n_leafs", ctypes.c_int), + ("nodes", ctypes.POINTER(ggml_tensor) * GGML_MAX_NODES), + ("grads", ctypes.POINTER(ggml_tensor) * GGML_MAX_NODES), + ("leafs", ctypes.POINTER(ggml_tensor) * GGML_MAX_NODES), + ("visited_hash_table", ctypes.c_void_p * GGML_GRAPH_HASHTABLE_SIZE), + ("perf_runs", ctypes.c_int), + ("perf_cycles", ctypes.c_int64), + ("perf_time_us", ctypes.c_int64), + ] + + +ggml_cgraph_p: TypeAlias = "ctypes._Pointer[ggml_cgraph]" # type: ignore +"""ctypes pointer to a [ggml_cgraph][ggml.ggml_cgraph] + +Can be dereferenced to a [ggml_cgraph][ggml.ggml_cgraph] object using +the `.contents` attribute.""" + +# static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph); +GGML_GRAPH_SIZE = ctypes.sizeof(ggml_cgraph) + + +# struct ggml_scratch { +# size_t offs; +# size_t size; +# void * data; +# }; +class ggml_scratch(ctypes.Structure): + _fields_ = [ + ("offs", ctypes.c_size_t), + ("size", ctypes.c_size_t), + ("data", ctypes.c_void_p), + ] + + +# struct ggml_init_params { +# // memory pool +# size_t mem_size; // bytes +# void * mem_buffer; // if NULL, memory will be allocated internally +# bool no_alloc; // don't allocate memory for the tensor data +# }; +class ggml_init_params(ctypes.Structure): + """Initialization parameters for a ggml context + + **NOTE**: Reference counting does not cross into ggml, if you allocate a memory buffer + in python using ctypes Arrays or a numpy array, you must keep a reference to it until + you free the ggml context otherwise you will encounter a segmentation fault. + + Attributes: + mem_size (int): size of memory pool in bytes + mem_buffer (ctypes.c_void_p): pointer to memory pool, if None, memory will be allocated internally + no_alloc (bool): don't allocate memory for tensor data + """ + + _fields_ = [ + ("mem_size", ctypes.c_int64), + ("mem_buffer", ctypes.c_void_p), + ("no_alloc", ctypes.c_bool), + ] + + +# // compute types + +# // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. +# // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. 
+# enum ggml_task_type { +# GGML_TASK_INIT = 0, +# GGML_TASK_COMPUTE, +# GGML_TASK_FINALIZE, +# }; +GGML_TASK_INIT = 0 +GGML_TASK_COMPUTE = 1 +GGML_TASK_FINALIZE = 2 + +# struct ggml_compute_params { +# enum ggml_task_type type; + +# // ith = thread index, nth = number of threads +# int ith, nth; + + +# // work buffer for all threads +# size_t wsize; +# void * wdata; +# }; +class ggml_compute_params(ctypes.Structure): + _fields_ = [ + ("type", ctypes.c_int), + ("ith", ctypes.c_int), + ("nth", ctypes.c_int), + ("wsize", ctypes.c_size_t), + ("wdata", ctypes.c_void_p), + ] + + +ggml_compute_params_p: TypeAlias = "ctypes._Pointer[ggml_compute_params]" # type: ignore + +# // misc + + +# GGML_API void ggml_time_init(void); // call this once at the beginning of the program +def ggml_time_init(): + return lib.ggml_time_init() + + +lib.ggml_time_init.argtypes = [] +lib.ggml_time_init.restype = None + + +# GGML_API int64_t ggml_time_ms(void); +def ggml_time_ms() -> int: + return lib.ggml_time_ms() + + +lib.ggml_time_ms.argtypes = [] +lib.ggml_time_ms.restype = ctypes.c_int64 + + +# GGML_API int64_t ggml_time_us(void); +def ggml_time_us() -> int: + return lib.ggml_time_us() + + +lib.ggml_time_us.argtypes = [] +lib.ggml_time_us.restype = ctypes.c_int64 + + +# GGML_API int64_t ggml_cycles(void); +def ggml_cycles() -> int: + return lib.ggml_cycles() + + +lib.ggml_cycles.argtypes = [] +lib.ggml_cycles.restype = ctypes.c_int64 + + +# GGML_API int64_t ggml_cycles_per_ms(void); +def ggml_cycles_per_ms() -> int: + return lib.ggml_cycles_per_ms() + + +lib.ggml_cycles_per_ms.argtypes = [] +lib.ggml_cycles_per_ms.restype = ctypes.c_int64 + + +# GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems +def ggml_numa_init(): + return lib.ggml_numa_init() + + +lib.ggml_numa_init.argtypes = [] +lib.ggml_numa_init.restype = None + + +# GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node +def ggml_is_numa() -> bool: + return lib.ggml_is_numa() + + +lib.ggml_is_numa.argtypes = [] +lib.ggml_is_numa.restype = ctypes.c_bool + + +# GGML_API void ggml_print_object (const struct ggml_object * obj); +def ggml_print_object(obj: ggml_object_p): + return lib.ggml_print_object(obj) + + +lib.ggml_print_object.argtypes = [ctypes.POINTER(ggml_object)] +lib.ggml_print_object.restype = None + + +# GGML_API void ggml_print_objects(const struct ggml_context * ctx); +def ggml_print_objects(ctx: ggml_context_p): + return lib.ggml_print_objects(ctx) + + +lib.ggml_print_objects.argtypes = [ggml_context_p] +lib.ggml_print_objects.restype = None + + +# GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); +def ggml_nelements( + tensor: ggml_tensor_p, +) -> int: + """Get the number of elements in a tensor + + Parameters: + tensor: tensor + + Returns: + number of elements""" + return lib.ggml_nelements(tensor) + + +lib.ggml_nelements.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_nelements.restype = ctypes.c_int64 + + +# GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); +def ggml_nrows( + tensor: ggml_tensor_p, +) -> int: + """Get the number of rows in a tensor + + Parameters: + tensor: tensor + + Returns: + number of rows""" + return lib.ggml_nrows(tensor) + + +lib.ggml_nrows.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_nrows.restype = ctypes.c_int64 + + +# GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); +def ggml_nbytes( + tensor: ggml_tensor_p, +) -> int: + """Get the number of bytes required to store tensor data + + 
Parameters: + tensor: tensor + + Returns: + number of bytes""" + return lib.ggml_nbytes(tensor) + + +lib.ggml_nbytes.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_nbytes.restype = ctypes.c_size_t + + +# GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN +def ggml_nbytes_pad( + tensor: ggml_tensor_p, +) -> int: + """Get the number of bytes required to store tensor data, padded to GGML_MEM_ALIGN + + Parameters: + tensor: tensor + + Returns: + number of bytes""" + return lib.ggml_nbytes_pad(tensor) + + +lib.ggml_nbytes_pad.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_nbytes_pad.restype = ctypes.c_size_t + + +# GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); +def ggml_nbytes_split( + tensor: ggml_tensor_p, + nrows_split: Union[ctypes.c_int, int], +) -> int: + return lib.ggml_nbytes_split(tensor, nrows_split) + + +lib.ggml_nbytes_split.argtypes = [ctypes.POINTER(ggml_tensor), ctypes.c_int] +lib.ggml_nbytes_split.restype = ctypes.c_size_t + + +# GGML_API int ggml_blck_size (enum ggml_type type); +def ggml_blck_size(type: Union[ctypes.c_int, int]) -> int: + return lib.ggml_blck_size(type) + + +lib.ggml_blck_size.argtypes = [ctypes.c_int] +lib.ggml_blck_size.restype = ctypes.c_int + + +# GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block +def ggml_type_size(type: Union[ctypes.c_int, int]) -> int: + return lib.ggml_type_size(type) + + +lib.ggml_type_size.argtypes = [ctypes.c_int] +lib.ggml_type_size.restype = ctypes.c_size_t + + +# GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float +def ggml_type_sizef(type: Union[ctypes.c_int, int]) -> float: + return lib.ggml_type_sizef(type) + + +lib.ggml_type_sizef.argtypes = [ctypes.c_int] +lib.ggml_type_sizef.restype = ctypes.c_float + + +# GGML_API const char * ggml_type_name(enum ggml_type type); +def ggml_type_name(type: Union[ctypes.c_int, int]) -> bytes: + return lib.ggml_type_name(type) + + +lib.ggml_type_name.argtypes = [ctypes.c_int] +lib.ggml_type_name.restype = ctypes.c_char_p + + +# GGML_API const char * ggml_op_name (enum ggml_op op); +def ggml_op_name(op: Union[ctypes.c_int, int]) -> bytes: + return lib.ggml_op_name(op) + + +lib.ggml_op_name.argtypes = [ctypes.c_int] +lib.ggml_op_name.restype = ctypes.c_char_p + + +# GGML_API const char * ggml_op_symbol(enum ggml_op op); +def ggml_op_symbol(op: Union[ctypes.c_int, int]) -> bytes: + return lib.ggml_op_symbol(op) + + +lib.ggml_op_symbol.argtypes = [ctypes.c_int] +lib.ggml_op_symbol.restype = ctypes.c_char_p + + +# GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); +def ggml_element_size( + tensor: ggml_tensor_p, +) -> int: + return lib.ggml_element_size(tensor) + + +lib.ggml_element_size.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_element_size.restype = ctypes.c_size_t + + +# GGML_API bool ggml_is_quantized(enum ggml_type type); +def ggml_is_quantized(type: Union[ctypes.c_int, int]) -> bool: + return lib.ggml_is_quantized(type) + + +lib.ggml_is_quantized.argtypes = [ctypes.c_int] +lib.ggml_is_quantized.restype = ctypes.c_bool + + +# // TODO: temporary until model loading of ggml examples is refactored +# GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); +def ggml_ftype_to_ggml_type(ftype: Union[ctypes.c_int, int]) -> int: + return lib.ggml_ftype_to_ggml_type(ftype) + + +lib.ggml_ftype_to_ggml_type.argtypes = [ctypes.c_int] 
+lib.ggml_ftype_to_ggml_type.restype = ctypes.c_int + + +# GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); +def ggml_is_transposed( + tensor: ggml_tensor_p, +) -> bool: + """Check if a tensor is transposed + + Parameters: + tensor: tensor + + Returns: + True if tensor is transposed else False""" + return lib.ggml_is_transposed(tensor) + + +lib.ggml_is_transposed.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_is_transposed.restype = ctypes.c_bool + + +# GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); +def ggml_is_contiguous( + tensor: ggml_tensor_p, +) -> bool: + """Check if a tensor is contiguous + + Parameters: + tensor: tensor + + Returns: + True if tensor is contiguous else False""" + return lib.ggml_is_contiguous(tensor) + + +lib.ggml_is_contiguous.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_is_contiguous.restype = ctypes.c_bool + + +# GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); +def ggml_is_permuted( + tensor: ggml_tensor_p, +) -> bool: + """Check if a tensor is permuted + + Parameters: + tensor: tensor + + Returns: + True if tensor is permuted else False""" + return lib.ggml_is_permuted(tensor) + + +lib.ggml_is_permuted.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_is_permuted.restype = ctypes.c_bool + + +# GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); +def ggml_are_same_shape( + t0: ggml_tensor_p, + t1: ggml_tensor_p, +) -> bool: + """Check if two tensors have the same shape + + Parameters: + t0: tensor 0 + t1: tensor 1 + + Returns: + True if tensors have the same shape else False""" + return lib.ggml_are_same_shape(t0, t1) + + +lib.ggml_are_same_shape.argtypes = [ + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_are_same_shape.restype = ctypes.c_bool + + +# // use this to compute the memory overhead of a tensor +# GGML_API size_t ggml_tensor_overhead(void); +def ggml_tensor_overhead() -> int: + """Overhead required for a tensor struct in bytes + + Returns: + size of tensor struct in bytes""" + return lib.ggml_tensor_overhead() + + +lib.ggml_tensor_overhead.argtypes = [] +lib.ggml_tensor_overhead.restype = ctypes.c_size_t + +# // main + + +# GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); +def ggml_init( + params: ggml_init_params, +) -> ggml_context_p: + """Instantiate a new ggml context with params. + + You must call `ggml_free()` to free the context. + + Parameters: + params: ggml init params + + Returns: + Pointer to ggml_context""" + return lib.ggml_init(params) + + +lib.ggml_init.argtypes = [ggml_init_params] +lib.ggml_init.restype = ggml_context_p + + +# GGML_API void ggml_free(struct ggml_context * ctx); +def ggml_free(ctx: ggml_context_p): + """Free the ggml context. + + Parameters: + ctx: ggml context""" + return lib.ggml_free(ctx) + + +lib.ggml_free.argtypes = [ggml_context_p] +lib.ggml_free.restype = None + + +# GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); +def ggml_used_mem(ctx: ggml_context_p) -> int: + """Return the amount of memory used by the ggml context in bytes. 
+ + Parameters: + ctx: ggml context + + Returns: + amount of memory used in bytes""" + return lib.ggml_used_mem(ctx) + + +lib.ggml_used_mem.argtypes = [ggml_context_p] +lib.ggml_used_mem.restype = ctypes.c_size_t + + +# GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); +def ggml_set_scratch(ctx: ggml_context_p, scratch: ggml_scratch) -> int: + """Set the scratch buffer for the ggml context.""" + return lib.ggml_set_scratch(ctx, scratch) + + +lib.ggml_set_scratch.argtypes = [ggml_context_p, ggml_scratch] +lib.ggml_set_scratch.restype = ctypes.c_size_t + + +# GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx); +def ggml_get_no_alloc(ctx: ggml_context_p) -> bool: + """Return the no_alloc flag for the ggml context.""" + return lib.ggml_get_no_alloc(ctx) + + +lib.ggml_get_no_alloc.argtypes = [ggml_context_p] +lib.ggml_get_no_alloc.restype = ctypes.c_bool + + +# GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); +def ggml_set_no_alloc(ctx: ggml_context_p, no_alloc: Union[ctypes.c_bool, bool]): + """Set the no_alloc flag for the ggml context.""" + return lib.ggml_set_no_alloc(ctx, no_alloc) + + +lib.ggml_set_no_alloc.argtypes = [ggml_context_p, ctypes.c_bool] +lib.ggml_set_no_alloc.restype = None + + +# GGML_API void * ggml_get_mem_buffer (struct ggml_context * ctx); +def ggml_get_mem_buffer(ctx: ggml_context_p) -> Optional[ctypes.c_void_p]: + """Return the memory buffer for the ggml context.""" + return lib.ggml_get_mem_buffer(ctx) + + +lib.ggml_get_mem_buffer.argtypes = [ggml_context_p] +lib.ggml_get_mem_buffer.restype = ctypes.c_void_p + + +# GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx); +def ggml_get_mem_size(ctx: ggml_context_p) -> int: + """Return the size of the memory buffer for the ggml context in bytes.""" + return lib.ggml_get_mem_size(ctx) + + +lib.ggml_get_mem_size.argtypes = [ggml_context_p] +lib.ggml_get_mem_size.restype = ctypes.c_int64 + + +# GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx); +def ggml_get_max_tensor_size(ctx: ggml_context_p) -> int: + """Return the maximum size of a tensor in bytes.""" + return lib.ggml_get_max_tensor_size(ctx) + + +lib.ggml_get_max_tensor_size.argtypes = [ggml_context_p] +lib.ggml_get_max_tensor_size.restype = ctypes.c_size_t + + +# GGML_API struct ggml_tensor * ggml_new_tensor( +# struct ggml_context * ctx, +# enum ggml_type type, +# int n_dims, +# const int64_t *ne); +def ggml_new_tensor( + ctx: ggml_context_p, + type: Union[ctypes.c_int, int], + n_dims: Union[ctypes.c_int, int], + ne: CInt64Array, +) -> ggml_tensor_p: + """Create a new tensor with the given type, number of dimensions, and number of elements in each dimension. + + Parameters: + ctx: ggml context + type: ggml type + n_dims: number of dimensions + ne (ctypes.Array[ctypes.c_int64]): number of elements in each dimension (array of length n_dims) + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_new_tensor(ctx, type, n_dims, ne) + + +lib.ggml_new_tensor.argtypes = [ + ggml_context_p, + ctypes.c_int, + ctypes.c_int, + ctypes.POINTER(ctypes.c_int64), +] +lib.ggml_new_tensor.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_new_tensor_1d( +# struct ggml_context * ctx, +# enum ggml_type type, +# int64_t ne0); +def ggml_new_tensor_1d( + ctx: ggml_context_p, type: Union[ctypes.c_int, int], ne0: Union[ctypes.c_int64, int] +) -> ggml_tensor_p: + """Create a new 1-dimensional tensor with the given type and number of elements. 
+ + Parameters: + ctx: ggml context + type: ggml type + ne0: number of elements in dimension 0 + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_new_tensor_1d(ctx, type, ne0) + + +lib.ggml_new_tensor_1d.argtypes = [ggml_context_p, ctypes.c_int, ctypes.c_int64] +lib.ggml_new_tensor_1d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_new_tensor_2d( +# struct ggml_context * ctx, +# enum ggml_type type, +# int64_t ne0, +# int64_t ne1); +def ggml_new_tensor_2d( + ctx: ggml_context_p, + type: Union[ctypes.c_int, int], + ne0: Union[ctypes.c_int64, int], + ne1: Union[ctypes.c_int64, int], +) -> ggml_tensor_p: + """Create a new 2-dimensional tensor with the given type and number of elements in each dimension. + + Parameters: + ctx: ggml context + type: ggml type + ne0: number of elements in dimension 0 + ne1: number of elements in dimension 1 + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_new_tensor_2d(ctx, type, ne0, ne1) + + +lib.ggml_new_tensor_2d.argtypes = [ + ggml_context_p, + ctypes.c_int, + ctypes.c_int64, + ctypes.c_int64, +] +lib.ggml_new_tensor_2d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_new_tensor_3d( +# struct ggml_context * ctx, +# enum ggml_type type, +# int64_t ne0, +# int64_t ne1, +# int64_t ne2); +def ggml_new_tensor_3d( + ctx: ggml_context_p, + type: Union[ctypes.c_int, int], + ne0: Union[ctypes.c_int64, int], + ne1: Union[ctypes.c_int64, int], + ne2: Union[ctypes.c_int64, int], +) -> ggml_tensor_p: + """Create a new 3-dimensional tensor with the given type and number of elements in each dimension. + + Parameters: + ctx: ggml context + type: ggml type + ne0: number of elements in dimension 0 + ne1: number of elements in dimension 1 + ne2: number of elements in dimension 2 + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2) + + +lib.ggml_new_tensor_3d.argtypes = [ + ggml_context_p, + ctypes.c_int, + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, +] +lib.ggml_new_tensor_3d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_new_tensor_4d( +# struct ggml_context * ctx, +# enum ggml_type type, +# int64_t ne0, +# int64_t ne1, +# int64_t ne2, +# int64_t ne3); +def ggml_new_tensor_4d( + ctx: ggml_context_p, + type: Union[ctypes.c_int, int], + ne0: Union[ctypes.c_int64, int], + ne1: Union[ctypes.c_int64, int], + ne2: Union[ctypes.c_int64, int], + ne3: Union[ctypes.c_int64, int], +) -> ggml_tensor_p: + """Create a new 4-dimensional tensor with the given type and number of elements in each dimension. + + Parameters: + ctx: ggml context + type: ggml type + ne0: number of elements in dimension 0 + ne1: number of elements in dimension 1 + ne2: number of elements in dimension 2 + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3) + + +lib.ggml_new_tensor_4d.argtypes = [ + ggml_context_p, + ctypes.c_int, + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, +] +lib.ggml_new_tensor_4d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); +def ggml_new_i32( + ctx: ggml_context_p, value: Union[ctypes.c_int32, int] +) -> ggml_tensor_p: + """Create a 1 element tensor with the given integer value. 
+ + Parameters: + ctx: ggml context + value: integer value + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_new_i32(ctx, value) + + +lib.ggml_new_i32.argtypes = [ggml_context_p, ctypes.c_int32] +lib.ggml_new_i32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); +def ggml_new_f32( + ctx: ggml_context_p, + value: Union[ctypes.c_float, float], +) -> ggml_tensor_p: + """Create a 1 element tensor with the given float value. + + Parameters: + ctx: ggml context + value: float value + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_new_f32(ctx, value) + + +lib.ggml_new_f32.argtypes = [ggml_context_p, ctypes.c_float] +lib.ggml_new_f32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); +def ggml_dup_tensor(ctx: ggml_context_p, src: ggml_tensor_p) -> ggml_tensor_p: + """Create a new tensor with the same type and dimensions as the source tensor. + + Parameters: + ctx: ggml context + src: source tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_dup_tensor(ctx, src) + + +lib.ggml_dup_tensor.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_dup_tensor.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); +def ggml_view_tensor(ctx: ggml_context_p, src: ggml_tensor_p) -> ggml_tensor_p: + """Create a new tensor with the same type, dimensions and data as the source tensor. + + Parameters: + ctx: ggml context + src: source tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_view_tensor(ctx, src) + + +lib.ggml_view_tensor.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_view_tensor.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); +def ggml_get_tensor(ctx: ggml_context_p, name: bytes) -> ggml_tensor_p: + """Get a tensor from the ggml context by name. + + Parameters: + ctx: ggml context + name: name of tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_get_tensor(ctx, name) + + +lib.ggml_get_tensor.argtypes = [ggml_context_p, ctypes.c_char_p] +lib.ggml_get_tensor.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); +def ggml_set_zero( + tensor: ggml_tensor_p, +) -> ggml_tensor_p: + """Zero all elements in a tensor. + + Parameters: + tensor: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_set_zero(tensor) + + +lib.ggml_set_zero.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_set_zero.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); +def ggml_set_i32( + tensor: ggml_tensor_p, + value: Union[ctypes.c_int32, int], +) -> ggml_tensor_p: + """Set all elements in a tensor to the given integer value. 
+ + Parameters: + tensor: tensor + value: integer value + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_set_i32(tensor, value) + + +lib.ggml_set_i32.argtypes = [ctypes.POINTER(ggml_tensor), ctypes.c_int32] +lib.ggml_set_i32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); +def ggml_set_f32( + tensor: ggml_tensor_p, + value: Union[ctypes.c_float, float], +) -> ggml_tensor_p: + """Set all elements in a tensor to the given float value. + + Parameters: + tensor: tensor + value: float value + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_set_f32(tensor, value) + + +lib.ggml_set_f32.argtypes = [ctypes.POINTER(ggml_tensor), ctypes.c_float] +lib.ggml_set_f32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); +def ggml_get_i32_1d( + tensor: ggml_tensor_p, + i: Union[ctypes.c_int, int], +) -> int: + """Get the integer value of the i-th element in a 1-dimensional tensor. + + Parameters: + tensor: tensor + i: index of element + + Returns: + integer value of element at index i""" + return lib.ggml_get_i32_1d(tensor, i) + + +lib.ggml_get_i32_1d.argtypes = [ctypes.POINTER(ggml_tensor), ctypes.c_int] +lib.ggml_get_i32_1d.restype = ctypes.c_int32 + + +# GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); +def ggml_set_i32_1d( + tensor: ggml_tensor_p, + i: Union[ctypes.c_int, int], + value: Union[ctypes.c_int32, int], +): + """Set the integer value of the i-th element in a 1-dimensional tensor. + + Parameters: + tensor: tensor + i: index of element + value: integer value to set element to""" + return lib.ggml_set_i32_1d(tensor, i, value) + + +lib.ggml_set_i32_1d.argtypes = [ + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int32, +] +lib.ggml_set_i32_1d.restype = None + + +# GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); +def ggml_get_f32_1d( + tensor: ggml_tensor_p, + i: Union[ctypes.c_int, int], +) -> float: + """Get the float value of the i-th element in a 1-dimensional tensor. + + Parameters: + tensor: tensor + + Returns: + float value of element at index i""" + return lib.ggml_get_f32_1d(tensor, i) + + +lib.ggml_get_f32_1d.argtypes = [ctypes.POINTER(ggml_tensor), ctypes.c_int] +lib.ggml_get_f32_1d.restype = ctypes.c_float + + +# GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); +def ggml_set_f32_1d( + tensor: ggml_tensor_p, + i: Union[ctypes.c_int, int], + value: Union[ctypes.c_float, float], +): + """Set the float value of the i-th element in a 1-dimensional tensor. + + Parameters: + tensor: tensor + i: index of element + value: float value to set element to""" + return lib.ggml_set_f32_1d(tensor, i, value) + + +lib.ggml_set_f32_1d.argtypes = [ + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_float, +] +lib.ggml_set_f32_1d.restype = None + + +# GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); +def ggml_get_data( + tensor: ggml_tensor_p, +) -> Optional[ctypes.c_void_p]: + """Get the data pointer of a tensor. 
+ + Parameters: + tensor: tensor + + Returns: + Pointer to data, or None if tensor has no data""" + return lib.ggml_get_data(tensor) + + +lib.ggml_get_data.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_get_data.restype = ctypes.c_void_p + + +# GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); +def ggml_get_data_f32( + tensor: ggml_tensor_p, +) -> Optional[CFloatArray]: + """Get the data pointer of a tensor as a float array. + + Parameters: + tensor: tensor + + Returns: + (Optional[ctypes.Array[ctypes.c_float]]): array of float to data, or None if tensor has no data + """ + return lib.ggml_get_data_f32(tensor) + + +lib.ggml_get_data_f32.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_get_data_f32.restype = ctypes.POINTER(ctypes.c_float) + + +# GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); +def ggml_get_unary_op( + tensor: ggml_tensor_p, +) -> int: + """Get the unary operation of a tensor. + + Parameters: + tensor: tensor + + Returns: + unary operation""" + return lib.ggml_get_unary_op(tensor) + + +lib.ggml_get_unary_op.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_get_unary_op.restype = ctypes.c_int + + +# GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor); +def ggml_get_name( + tensor: ggml_tensor_p, +) -> bytes: + """Get the name of a tensor. + + Parameters: + tensor: tensor + + Returns: + name of tensor""" + return lib.ggml_get_name(tensor) + + +lib.ggml_get_name.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_get_name.restype = ctypes.c_char_p + + +# GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name); +def ggml_set_name( + tensor: ggml_tensor_p, + name: bytes, +) -> ggml_tensor_p: + """Set the name of a tensor. + + Parameters: + tensor: tensor + name: name to set tensor to + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_set_name(tensor, name) + + +lib.ggml_set_name.argtypes = [ctypes.POINTER(ggml_tensor), ctypes.c_char_p] +lib.ggml_set_name.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...); +def ggml_format_name( + tensor: ggml_tensor_p, + fmt: bytes, + *args: Sequence[Union[bool, int, float, str]], +) -> ggml_tensor_p: + """Format the name of a tensor using the given format c string and arguments. 
+ + Parameters: + tensor: tensor + fmt: format c string + args: arguments to format string + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_format_name(tensor, fmt, *args) + + +lib.ggml_format_name.argtypes = [ctypes.POINTER(ggml_tensor), ctypes.c_char_p] +lib.ggml_format_name.restype = ctypes.POINTER(ggml_tensor) + +# // +# // operations on tensors with backpropagation +# // + + +# GGML_API struct ggml_tensor * ggml_dup( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_dup(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + return lib.ggml_dup(ctx, a) + + +lib.ggml_dup.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_dup.restype = ctypes.POINTER(ggml_tensor) + + +# // in-place, returns view(a) +# GGML_API struct ggml_tensor * ggml_dup_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_dup_inplace(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + return lib.ggml_dup_inplace(ctx, a) + + +lib.ggml_dup_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_dup_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_add( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_add( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Add two tensors together and return the result. + + Parameters: + ctx: ggml context + a: first tensor + b: second tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_add(ctx, a, b) + + +lib.ggml_add.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_add.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_add_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_add_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Add two tensors together and store the result in the first tensor. 
+ + Parameters: + ctx: ggml context + a: first tensor + b: second tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_add_inplace(ctx, a, b) + + +lib.ggml_add_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_add_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_add1( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_add1( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_add1(ctx, a, b) + + +lib.ggml_add1.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_add1.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_add1_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_add1_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_add1_inplace(ctx, a, b) + + +lib.ggml_add1_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_add1_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_acc( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# size_t nb1, +# size_t nb2, +# size_t nb3, +# size_t offset); +def ggml_acc( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + nb1: Union[ctypes.c_size_t, int], + nb2: Union[ctypes.c_size_t, int], + nb3: Union[ctypes.c_size_t, int], + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_acc(ctx, a, b, nb1, nb2, nb3, offset) + + +lib.ggml_acc.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_size_t, + ctypes.c_size_t, + ctypes.c_size_t, + ctypes.c_size_t, +] +lib.ggml_acc.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_acc_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# size_t nb1, +# size_t nb2, +# size_t nb3, +# size_t offset); +def ggml_acc_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + nb1: Union[ctypes.c_size_t, int], + nb2: Union[ctypes.c_size_t, int], + nb3: Union[ctypes.c_size_t, int], + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_acc_inplace(ctx, a, b, nb1, nb2, nb3, offset) + + +lib.ggml_acc_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_size_t, + ctypes.c_size_t, + ctypes.c_size_t, + ctypes.c_size_t, +] +lib.ggml_acc_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_sub( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_sub( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Subtract two tensors and return the result. 
+ + Parameters: + ctx: ggml context + a: first tensor + b: second tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_sub(ctx, a, b) + + +lib.ggml_sub.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_sub.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_sub_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_sub_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Subtract two tensors and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: first tensor + b: second tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_sub_inplace(ctx, a, b) + + +lib.ggml_sub_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_sub_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_mul( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_mul( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Element-wise multiply two tensors and return the result. + + Parameters: + ctx: ggml context + a: first tensor + b: second tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_mul(ctx, a, b) + + +lib.ggml_mul.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_mul.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_mul_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_mul_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Element-wise multiply two tensors and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: first tensor + b: second tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_mul_inplace(ctx, a, b) + + +lib.ggml_mul_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_mul_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_div( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_div( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Element-wise divide two tensors and return the result. + + Parameters: + ctx: ggml context + a: first tensor + b: second tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_div(ctx, a, b) + + +lib.ggml_div.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_div.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_div_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_div_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Element-wise divide two tensors and store the result in the first tensor. 
+ + Parameters: + ctx: ggml context + a: first tensor + b: second tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_div_inplace(ctx, a, b) + + +lib.ggml_div_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_div_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_sqr( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_sqr( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Square all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_sqr(ctx, a) + + +lib.ggml_sqr.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_sqr.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_sqr_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_sqr_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Square all elements in a tensor and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_sqr_inplace(ctx, a) + + +lib.ggml_sqr_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_sqr_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_sqrt( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_sqrt( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Square root all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_sqrt(ctx, a) + + +lib.ggml_sqrt.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_sqrt.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_sqrt_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_sqrt_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Square root all elements in a tensor and store the result in the first tensor. + + Parameters: + ctx: ggml context + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_sqrt_inplace(ctx, a) + + +lib.ggml_sqrt_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_sqrt_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_log( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_log(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Take the natural logarithm of all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_log(ctx, a) + + +lib.ggml_log.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_log.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_log_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_log_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Take the natural logarithm of all elements in a tensor and store the result in the first tensor. 
+ + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_log_inplace(ctx, a) + + +lib.ggml_log_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_log_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // return scalar +# GGML_API struct ggml_tensor * ggml_sum( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_sum(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Sum all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_sum(ctx, a) + + +lib.ggml_sum.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_sum.restype = ctypes.POINTER(ggml_tensor) + + +# // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] +# GGML_API struct ggml_tensor * ggml_sum_rows( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_sum_rows(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Sum all elements in a tensor along the first axis and return the result. + + sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_sum_rows(ctx, a) + + +lib.ggml_sum_rows.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_sum_rows.restype = ctypes.POINTER(ggml_tensor) + + +# // mean along rows +# GGML_API struct ggml_tensor * ggml_mean( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_mean(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Take the mean of all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_mean(ctx, a) + + +lib.ggml_mean.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_mean.restype = ctypes.POINTER(ggml_tensor) + + +# // argmax along rows +# GGML_API struct ggml_tensor * ggml_argmax( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_argmax(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Take the argmax of all elements in a tensor and return the result. + + argmax along rows + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_argmax(ctx, a) + + +lib.ggml_argmax.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_argmax.restype = ctypes.POINTER(ggml_tensor) + + +# // if a is the same shape as b, and a is not parameter, return a +# // otherwise, return a new tensor: repeat(a) to fit in b +# GGML_API struct ggml_tensor * ggml_repeat( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_repeat( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Repeat a tensor to fit the shape of another tensor. 
+ + If a is the same shape as b, and a is not parameter, return a + + Parameters: + ctx: ggml context + a: tensor to repeat + b: tensor to fit + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_repeat(ctx, a, b) + + +lib.ggml_repeat.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_repeat.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_repeat_back( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_repeat_back( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_repeat_back(ctx, a, b) + + +lib.ggml_repeat_back.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_repeat_back.restype = ctypes.POINTER(ggml_tensor) + + +# // concat a and b on dim 2 +# // used in stable-diffusion +# GGML_API struct ggml_tensor * ggml_concat( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_concat( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Concatenate two tensors along the second axis and return the result. + + Parameters: + ctx: ggml context + a: first tensor + b: second tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_concat(ctx, a, b) + + +lib.ggml_concat.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_concat.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_abs( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_abs(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Take the absolute value of all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_abs(ctx, a) + + +lib.ggml_abs.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_abs.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_abs_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_abs_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Take the absolute value of all elements in a tensor and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_abs_inplace(ctx, a) + + +lib.ggml_abs_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_abs_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_sgn( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_sgn(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Get the sign of all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_sgn(ctx, a) + + +lib.ggml_sgn.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_sgn.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_sgn_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_sgn_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Get the sign of all elements in a tensor and store the result in the first tensor. 
+ + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_sgn_inplace(ctx, a) + + +lib.ggml_sgn_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_sgn_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_neg( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_neg(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Negate all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_neg(ctx, a) + + +lib.ggml_neg.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_neg.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_neg_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_neg_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Negate all elements in a tensor and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_neg_inplace(ctx, a) + + +lib.ggml_neg_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_neg_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_step( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_step(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + return lib.ggml_step(ctx, a) + + +lib.ggml_step.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_step.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_tanh( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_tanh(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Apply the tanh activation function to all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_tanh(ctx, a) + + +lib.ggml_tanh.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_tanh.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_tanh_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_tanh_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Apply the tanh activation function to all elements in a tensor and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_tanh_inplace(ctx, a) + + +lib.ggml_tanh_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_tanh_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_elu( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_elu(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Apply the ELU activation function to all elements in a tensor and return the result. 
+ + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_elu(ctx, a) + + +lib.ggml_elu.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_elu.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_elu_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_elu_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Apply the ELU activation function to all elements in a tensor and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_elu_inplace(ctx, a) + + +lib.ggml_elu_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_elu_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_relu( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_relu(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Apply the ReLU activation function to all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_relu(ctx, a) + + +lib.ggml_relu.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_relu.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_relu_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_relu_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Apply the ReLU activation function to all elements in a tensor and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_relu_inplace(ctx, a) + + +lib.ggml_relu_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_relu_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // TODO: double-check this computation is correct +# GGML_API struct ggml_tensor * ggml_gelu( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_gelu(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Apply the Gaussian Error Linear Unit activation function to all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_gelu(ctx, a) + + +lib.ggml_gelu.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_gelu.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_gelu_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_gelu_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Apply the Gaussian Error Linear Unit activation function to all elements in a tensor and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_gelu_inplace(ctx, a) + + +lib.ggml_gelu_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_gelu_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_gelu_quick( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_gelu_quick(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Apply the Gaussian Error Linear Unit activation function to all elements in a tensor and return the result. 
+ + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_gelu_quick(ctx, a) + + +lib.ggml_gelu_quick.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_gelu_quick.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_gelu_quick_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_gelu_quick_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Apply the Gaussian Error Linear Unit activation function to all elements in a tensor and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_gelu_quick_inplace(ctx, a) + + +lib.ggml_gelu_quick_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_gelu_quick_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_silu( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_silu(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Apply the Sigmoid Linear Unit activation function to all elements in a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_silu(ctx, a) + + +lib.ggml_silu.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_silu.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_silu_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_silu_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Apply the Sigmoid Linear Unit activation function to all elements in a tensor and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_silu_inplace(ctx, a) + + +lib.ggml_silu_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_silu_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // a - x +# // b - dy +# GGML_API struct ggml_tensor * ggml_silu_back( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_silu_back( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_silu_back(ctx, a, b) + + +lib.ggml_silu_back.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_silu_back.restype = ctypes.POINTER(ggml_tensor) + + +# // normalize along rows +# GGML_API struct ggml_tensor * ggml_norm( +# struct ggml_context * ctx, +# struct ggml_tensor * a +# float eps); +def ggml_norm( + ctx: ggml_context_p, + a: ggml_tensor_p, + eps: Union[ctypes.c_float, float], +) -> ggml_tensor_p: + """Normalize all elements in a tensor along the first axis and return the result. + + normalize along rows. 
+ + Parameters: + ctx: ggml context + a: tensor + eps: minimum value to avoid division by zero + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_norm(ctx, a, eps) + + +lib.ggml_norm.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor), ctypes.c_float] +lib.ggml_norm.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_norm_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a +# float eps); +def ggml_norm_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + eps: Union[ctypes.c_float, float], +) -> ggml_tensor_p: + """Normalize all elements in a tensor along the first axis and store the result in the first tensor. + + normalize along rows. + + Parameters: + ctx: ggml context + a: tensor + eps: minimum value to avoid division by zero + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_norm_inplace(ctx, a, eps) + + +lib.ggml_norm_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_float, +] +lib.ggml_norm_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_rms_norm( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# float eps); +def ggml_rms_norm( + ctx: ggml_context_p, + a: ggml_tensor_p, + eps: Union[ctypes.c_float, float], +) -> ggml_tensor_p: + """Compute the RMS norm of a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + eps: float + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_rms_norm(ctx, a, eps) + + +lib.ggml_rms_norm.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_float, +] +lib.ggml_rms_norm.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_rms_norm_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# float eps); +def ggml_rms_norm_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + eps: Union[ctypes.c_float, float], +) -> ggml_tensor_p: + return lib.ggml_rms_norm_inplace(ctx, a, eps) + + +lib.ggml_rms_norm_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_float, +] +lib.ggml_rms_norm_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // group normalize along ne0*ne1*n_groups +# // used in stable-diffusion +# // TODO: eps is hardcoded to 1e-6 for now +# GGML_API struct ggml_tensor * ggml_group_norm( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_groups); +def ggml_group_norm( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_groups: int, +) -> ggml_tensor_p: + """Group normalize a tensor and return the result. + + Parameters: + ctx: ggml context + a: tensor + n_groups: int + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_group_norm(ctx, a, n_groups) + + +lib.ggml_group_norm.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, +] +lib.ggml_group_norm.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_group_norm_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_groups); +def ggml_group_norm_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_groups: int, +) -> ggml_tensor_p: + """Group normalize a tensor and store the result in the first tensor. 
+ + Parameters: + ctx: ggml context + a: tensor + n_groups: int + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_group_norm_inplace(ctx, a, n_groups) + + +lib.ggml_group_norm_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, +] +lib.ggml_group_norm_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // a - x +# // b - dy +# GGML_API struct ggml_tensor * ggml_rms_norm_back( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b +# float eps); +def ggml_rms_norm_back( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + eps: Union[ctypes.c_float, float], +) -> ggml_tensor_p: + return lib.ggml_rms_norm_back(ctx, a, b, eps) + + +lib.ggml_rms_norm_back.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_float, +] +lib.ggml_rms_norm_back.restype = ctypes.POINTER(ggml_tensor) + + +# // A: m rows, n columns +# // B: p rows, n columns (i.e. we transpose it internally) +# // result is m columns, p rows +# GGML_API struct ggml_tensor * ggml_mul_mat( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_mul_mat( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Multiply two matrices and return the result. + + A: m rows, n columns + B: p rows, n columns (i.e. we transpose it internally) + result is m columns, p rows + + Parameters: + ctx: ggml context + a: tensor + b: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_mul_mat(ctx, a, b) + + +lib.ggml_mul_mat.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_mul_mat.restype = ctypes.POINTER(ggml_tensor) + + +# // A: m columns, n rows, +# // B: p columns, n rows, +# // result is m columns, p rows +# GGML_API struct ggml_tensor * ggml_out_prod( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_out_prod( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Compute the outer product of two matrices and return the result. + + A: m columns, n rows, + B: p columns, n rows, + result is m columns, p rows + + Parameters: + ctx: ggml context + a: tensor + b: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_out_prod(ctx, a, b) + + +lib.ggml_out_prod.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_out_prod.restype = ctypes.POINTER(ggml_tensor) + +# // +# // operations on tensors without backpropagation +# // + + +# GGML_API struct ggml_tensor * ggml_scale( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_scale( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Scale a tensor by another tensor and return the result. 
+ + Parameters: + ctx: ggml context + a: tensor + b: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_scale(ctx, a, b) + + +lib.ggml_scale.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_scale.restype = ctypes.POINTER(ggml_tensor) + + +# // in-place, returns view(a) +# GGML_API struct ggml_tensor * ggml_scale_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_scale_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Scale a tensor by another tensor and store the result in the first tensor. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_scale_inplace(ctx, a, b) + + +lib.ggml_scale_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_scale_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // b -> view(a,offset,nb1,nb2,3), return modified a +# GGML_API struct ggml_tensor * ggml_set( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# size_t nb1, +# size_t nb2, +# size_t nb3, +# size_t offset); +def ggml_set( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + nb1: Union[ctypes.c_size_t, int], + nb2: Union[ctypes.c_size_t, int], + nb3: Union[ctypes.c_size_t, int], + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_set(ctx, a, b, nb1, nb2, nb3, offset) + + +lib.ggml_set.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_size_t, + ctypes.c_size_t, + ctypes.c_size_t, + ctypes.c_size_t, +] +lib.ggml_set.restype = ctypes.POINTER(ggml_tensor) + + +# // b -> view(a,offset,nb1,nb2,3), return view(a) +# GGML_API struct ggml_tensor * ggml_set_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# size_t nb1, +# size_t nb2, +# size_t nb3, +# size_t offset); +def ggml_set_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + nb1: Union[ctypes.c_size_t, int], + nb2: Union[ctypes.c_size_t, int], + nb3: Union[ctypes.c_size_t, int], + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_set_inplace(ctx, a, b, nb1, nb2, nb3, offset) + + +lib.ggml_set_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_size_t, + ctypes.c_size_t, + ctypes.c_size_t, + ctypes.c_size_t, +] +lib.ggml_set_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_set_1d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# size_t offset); +def ggml_set_1d( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_set_1d(ctx, a, b, offset) + + +lib.ggml_set_1d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_size_t, +] +lib.ggml_set_1d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_set_1d_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# size_t offset); +def ggml_set_1d_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_set_1d_inplace(ctx, a, b, offset) + + +lib.ggml_set_1d_inplace.argtypes = [ + 
ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_size_t, +] +lib.ggml_set_1d_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // b -> view(a,offset,nb1,nb2,3), return modified a +# GGML_API struct ggml_tensor * ggml_set_2d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# size_t nb1, +# size_t offset); +def ggml_set_2d( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + nb1: Union[ctypes.c_size_t, int], + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_set_2d(ctx, a, b, nb1, offset) + + +lib.ggml_set_2d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_size_t, + ctypes.c_size_t, +] +lib.ggml_set_2d.restype = ctypes.POINTER(ggml_tensor) + + +# // b -> view(a,offset,nb1,nb2,3), return view(a) +# GGML_API struct ggml_tensor * ggml_set_2d_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# size_t nb1, +# size_t offset); +def ggml_set_2d_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + nb1: Union[ctypes.c_size_t, int], + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_set_2d_inplace(ctx, a, b, nb1, offset) + + +lib.ggml_set_2d_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_size_t, + ctypes.c_size_t, +] +lib.ggml_set_2d_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // a -> b, return view(b) +# GGML_API struct ggml_tensor * ggml_cpy( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_cpy( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_cpy(ctx, a, b) + + +lib.ggml_cpy.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_cpy.restype = ctypes.POINTER(ggml_tensor) + + +# // a -> b, in-place, return view(b) +# GGML_API struct ggml_tensor * ggml_cpy_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_cpy_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_cpy_inplace(ctx, a, b) + + +lib.ggml_cpy_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_cpy_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // make contiguous +# GGML_API struct ggml_tensor * ggml_cont( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_cont(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Make a tensor contiguous and return the result. + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_cont(ctx, a) + + +lib.ggml_cont.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_cont.restype = ctypes.POINTER(ggml_tensor) + + +# // make contiguous, in-place +# GGML_API struct ggml_tensor * ggml_cont_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_cont_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, +) -> ggml_tensor_p: + """Make a tensor contiguous and store the result in the first tensor. 
+ + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_cont_inplace(ctx, a) + + +lib.ggml_cont_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_cont_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // return view(a), b specifies the new shape +# // TODO: when we start computing gradient, make a copy instead of view +# GGML_API struct ggml_tensor * ggml_reshape( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_reshape( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_reshape(ctx, a, b) + + +lib.ggml_reshape.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_reshape.restype = ctypes.POINTER(ggml_tensor) + + +# // return view(a) +# // TODO: when we start computing gradient, make a copy instead of view +# GGML_API struct ggml_tensor * ggml_reshape_1d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int64_t ne0); +def ggml_reshape_1d( + ctx: ggml_context_p, + a: ggml_tensor_p, + ne0: Union[ctypes.c_int64, int], +) -> ggml_tensor_p: + return lib.ggml_reshape_1d(ctx, a, ne0) + + +lib.ggml_reshape_1d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int64, +] +lib.ggml_reshape_1d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_reshape_2d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int64_t ne0, +# int64_t ne1); +def ggml_reshape_2d( + ctx: ggml_context_p, + a: ggml_tensor_p, + ne0: Union[ctypes.c_int64, int], + ne1: Union[ctypes.c_int64, int], +) -> ggml_tensor_p: + return lib.ggml_reshape_2d(ctx, a, ne0, ne1) + + +lib.ggml_reshape_2d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int64, + ctypes.c_int64, +] +lib.ggml_reshape_2d.restype = ctypes.POINTER(ggml_tensor) + + +# // return view(a) +# // TODO: when we start computing gradient, make a copy instead of view +# GGML_API struct ggml_tensor * ggml_reshape_3d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int64_t ne0, +# int64_t ne1, +# int64_t ne2); +def ggml_reshape_3d( + ctx: ggml_context_p, + a: ggml_tensor_p, + ne0: Union[ctypes.c_int64, int], + ne1: Union[ctypes.c_int64, int], + ne2: Union[ctypes.c_int64, int], +) -> ggml_tensor_p: + return lib.ggml_reshape_3d(ctx, a, ne0, ne1, ne2) + + +lib.ggml_reshape_3d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, +] +lib.ggml_reshape_3d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_reshape_4d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int64_t ne0, +# int64_t ne1, +# int64_t ne2, +# int64_t ne3); +def ggml_reshape_4d( + ctx: ggml_context_p, + a: ggml_tensor_p, + ne0: Union[ctypes.c_int64, int], + ne1: Union[ctypes.c_int64, int], + ne2: Union[ctypes.c_int64, int], + ne3: Union[ctypes.c_int64, int], +) -> ggml_tensor_p: + return lib.ggml_reshape_4d(ctx, a, ne0, ne1, ne2, ne3) + + +lib.ggml_reshape_4d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, +] +lib.ggml_reshape_4d.restype = ctypes.POINTER(ggml_tensor) + + +# // offset in bytes +# GGML_API struct ggml_tensor * ggml_view_1d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int64_t ne0, +# size_t offset); +def ggml_view_1d( + ctx: ggml_context_p, + a: ggml_tensor_p, + ne0: 
Union[ctypes.c_int64, int], + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_view_1d(ctx, a, ne0, offset) + + +lib.ggml_view_1d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int64, + ctypes.c_size_t, +] +lib.ggml_view_1d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_view_2d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int64_t ne0, +# int64_t ne1, +# size_t nb1, // row stride in bytes +# size_t offset); +def ggml_view_2d( + ctx: ggml_context_p, + a: ggml_tensor_p, + ne0: Union[ctypes.c_int64, int], + ne1: Union[ctypes.c_int64, int], + nb1: Union[ctypes.c_size_t, int], + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_view_2d(ctx, a, ne0, ne1, nb1, offset) + + +lib.ggml_view_2d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_size_t, + ctypes.c_size_t, +] +lib.ggml_view_2d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_view_3d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int64_t ne0, +# int64_t ne1, +# int64_t ne2, +# size_t nb1, // row stride in bytes +# size_t nb2, // slice stride in bytes +# size_t offset); +def ggml_view_3d( + ctx: ggml_context_p, + a: ggml_tensor_p, + ne0: Union[ctypes.c_int64, int], + ne1: Union[ctypes.c_int64, int], + ne2: Union[ctypes.c_int64, int], + nb1: Union[ctypes.c_size_t, int], + nb2: Union[ctypes.c_size_t, int], + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_view_3d(ctx, a, ne0, ne1, ne2, nb1, nb2, offset) + + +lib.ggml_view_3d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_size_t, + ctypes.c_size_t, + ctypes.c_size_t, +] +lib.ggml_view_3d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_view_4d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int64_t ne0, +# int64_t ne1, +# int64_t ne2, +# int64_t ne3, +# size_t nb1, // row stride in bytes +# size_t nb2, // slice stride in bytes +# size_t nb3, +# size_t offset); +def ggml_view_4d( + ctx: ggml_context_p, + a: ggml_tensor_p, + ne0: Union[ctypes.c_int64, int], + ne1: Union[ctypes.c_int64, int], + ne2: Union[ctypes.c_int64, int], + ne3: Union[ctypes.c_int64, int], + nb1: Union[ctypes.c_size_t, int], + nb2: Union[ctypes.c_size_t, int], + nb3: Union[ctypes.c_size_t, int], + offset: Union[ctypes.c_size_t, int], +) -> ggml_tensor_p: + return lib.ggml_view_4d(ctx, a, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset) + + +lib.ggml_view_4d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_size_t, + ctypes.c_size_t, + ctypes.c_size_t, + ctypes.c_size_t, +] +lib.ggml_view_4d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_permute( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int axis0, +# int axis1, +# int axis2, +# int axis3); +def ggml_permute( + ctx: ggml_context_p, + a: ggml_tensor_p, + axis0: Union[ctypes.c_int, int], + axis1: Union[ctypes.c_int, int], + axis2: Union[ctypes.c_int, int], + axis3: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_permute(ctx, a, axis0, axis1, axis2, axis3) + + +lib.ggml_permute.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, +] +lib.ggml_permute.restype = 
ctypes.POINTER(ggml_tensor) + + +# // alias for ggml_permute(ctx, a, 1, 0, 2, 3) +# GGML_API struct ggml_tensor * ggml_transpose( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_transpose(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + """Transpose *the first two dimensions* of a tensor and return the result. + + alias for `ggml_permute(ctx, a, 1, 0, 2, 3)` + + Parameters: + ctx: ggml context + a: tensor + + Returns: + Pointer to ggml_tensor""" + return lib.ggml_transpose(ctx, a) + + +lib.ggml_transpose.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_transpose.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_get_rows( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_get_rows( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_get_rows(ctx, a, b) + + +lib.ggml_get_rows.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_get_rows.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_get_rows_back( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# struct ggml_tensor * c); +def ggml_get_rows_back( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + c: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_get_rows_back(ctx, a, b, c) + + +lib.ggml_get_rows_back.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_get_rows_back.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_diag( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_diag(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + return lib.ggml_diag(ctx, a) + + +lib.ggml_diag.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_diag.restype = ctypes.POINTER(ggml_tensor) + + +# // set elements above the diagonal to -INF +# GGML_API struct ggml_tensor * ggml_diag_mask_inf( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_past); +def ggml_diag_mask_inf( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_past: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_diag_mask_inf(ctx, a, n_past) + + +lib.ggml_diag_mask_inf.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, +] +lib.ggml_diag_mask_inf.restype = ctypes.POINTER(ggml_tensor) + + +# // in-place, returns view(a) +# GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_past); +def ggml_diag_mask_inf_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_past: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_diag_mask_inf_inplace(ctx, a, n_past) + + +lib.ggml_diag_mask_inf_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, +] +lib.ggml_diag_mask_inf_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // set elements above the diagonal to 0 +# GGML_API struct ggml_tensor * ggml_diag_mask_zero( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_past); +def ggml_diag_mask_zero( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_past: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_diag_mask_zero(ctx, a, n_past) + + +lib.ggml_diag_mask_zero.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, +] 
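+
+# NOTE: the diagonal-masking helpers above are usually paired with a softmax to
+# build causal self-attention. A minimal illustrative sketch (assumes a square
+# attention-score tensor `scores` already built in `ctx`; the function names are
+# the bindings declared in this module, the variable names are placeholders):
+#
+#   scores = ggml_diag_mask_inf_inplace(ctx, scores, 0)  # hide future positions
+#   probs = ggml_soft_max_inplace(ctx, scores)           # normalize each row
+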
+lib.ggml_diag_mask_zero.restype = ctypes.POINTER(ggml_tensor) + + +# // in-place, returns view(a) +# GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_past); +def ggml_diag_mask_zero_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_past: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_diag_mask_zero_inplace(ctx, a, n_past) + + +lib.ggml_diag_mask_zero_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, +] +lib.ggml_diag_mask_zero_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_soft_max( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_soft_max(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + return lib.ggml_soft_max(ctx, a) + + +lib.ggml_soft_max.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_soft_max.restype = ctypes.POINTER(ggml_tensor) + + +# // in-place, returns view(a) +# GGML_API struct ggml_tensor * ggml_soft_max_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a); +def ggml_soft_max_inplace(ctx: ggml_context_p, a: ggml_tensor_p) -> ggml_tensor_p: + return lib.ggml_soft_max_inplace(ctx, a) + + +lib.ggml_soft_max_inplace.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_soft_max_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_soft_max_back( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_soft_max_back( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_soft_max_back(ctx, a, b) + + +lib.ggml_soft_max_back.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_soft_max_back.restype = ctypes.POINTER(ggml_tensor) + + +# // in-place, returns view(a) +# GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_soft_max_back_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_soft_max_back_inplace(ctx, a, b) + + +lib.ggml_soft_max_back_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_soft_max_back_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // rotary position embedding +# // if mode & 1 == 1, skip n_past elements +# // if mode & 2 == 1, GPT-NeoX style +# // if mode & 4 == 1, ChatGLM style +# // TODO: avoid creating a new tensor every time +# GGML_API struct ggml_tensor * ggml_rope( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_past, +# int n_dims, +# int mode, +# int n_ctx); +def ggml_rope( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_past: Union[ctypes.c_int, int], + n_dims: Union[ctypes.c_int, int], + mode: Union[ctypes.c_int, int], + n_ctx: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_rope(ctx, a, n_past, n_dims, mode, n_ctx) + + +lib.ggml_rope.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, +] +lib.ggml_rope.restype = ctypes.POINTER(ggml_tensor) + + +# // in-place, returns view(a) +# GGML_API struct ggml_tensor * ggml_rope_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_past, +# int n_dims, +# int mode, +# int n_ctx); +def ggml_rope_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, 
+ n_past: Union[ctypes.c_int, int], + n_dims: Union[ctypes.c_int, int], + mode: Union[ctypes.c_int, int], + n_ctx: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_rope_inplace(ctx, a, n_past, n_dims, mode, n_ctx) + + +lib.ggml_rope_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, +] +lib.ggml_rope_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // custom RoPE +# GGML_API struct ggml_tensor * ggml_rope_custom( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_past, +# int n_dims, +# int mode, +# int n_ctx, +# float freq_base, +# float freq_scale); +def ggml_rope_custom( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_past: Union[ctypes.c_int, int], + n_dims: Union[ctypes.c_int, int], + mode: Union[ctypes.c_int, int], + n_ctx: Union[ctypes.c_int, int], + freq_base: Union[ctypes.c_float, float], + freq_scale: Union[ctypes.c_float, float], +) -> ggml_tensor_p: + return lib.ggml_rope_custom( + ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale + ) + + +lib.ggml_rope_custom.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ctypes.c_float, +] +lib.ggml_rope_custom.restype = ctypes.POINTER(ggml_tensor) + + +# // in-place, returns view(a) +# GGML_API struct ggml_tensor * ggml_rope_custom_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_past, +# int n_dims, +# int mode, +# int n_ctx, +# float freq_base, +# float freq_scale); +def ggml_rope_custom_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_past: Union[ctypes.c_int, int], + n_dims: Union[ctypes.c_int, int], + mode: Union[ctypes.c_int, int], + n_ctx: Union[ctypes.c_int, int], + freq_base: Union[ctypes.c_float, float], + freq_scale: Union[ctypes.c_float, float], +) -> ggml_tensor_p: + return lib.ggml_rope_custom_inplace( + ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale + ) + + +lib.ggml_rope_custom_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ctypes.c_float, +] +lib.ggml_rope_custom_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // xPos RoPE, in-place, returns view(a) +# GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_past, +# int n_dims, +# float base, +# bool down); +def ggml_rope_xpos_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_past: Union[ctypes.c_int, int], + n_dims: Union[ctypes.c_int, int], + base: Union[ctypes.c_float, float], + down: Union[ctypes.c_bool, bool], +) -> ggml_tensor_p: + return lib.ggml_rope_xpos_inplace(ctx, a, n_past, n_dims, base, down) + + +lib.ggml_rope_xpos_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ctypes.c_bool, +] +lib.ggml_rope_xpos_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // rotary position embedding backward, i.e compute dx from dy +# // a - dy +# GGML_API struct ggml_tensor * ggml_rope_back( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_past, +# int n_dims, +# int mode, +# int n_ctx, +# float freq_base, +# float freq_scale, +# float xpos_base, +# bool xpos_down); +def ggml_rope_back( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_past: Union[ctypes.c_int, int], + n_dims: Union[ctypes.c_int, int], + mode: Union[ctypes.c_int, int], + n_ctx: 
Union[ctypes.c_int, int], + freq_base: Union[ctypes.c_float, float], + freq_scale: Union[ctypes.c_float, float], + xpos_base: Union[ctypes.c_float, float], + xpos_down: Union[ctypes.c_bool, bool], +) -> ggml_tensor_p: + return lib.ggml_rope_back( + ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, xpos_base, xpos_down + ) + + +lib.ggml_rope_back.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.c_bool, +] +lib.ggml_rope_back.restype = ctypes.POINTER(ggml_tensor) + + +# // alibi position embedding +# // in-place, returns view(a) +# struct ggml_tensor * ggml_alibi( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int n_past, +# int n_head, +# float bias_max); +def ggml_alibi( + ctx: ggml_context_p, + a: ggml_tensor_p, + n_past: Union[ctypes.c_int, int], + n_head: Union[ctypes.c_int, int], + bias_max: Union[ctypes.c_float, float], +) -> ggml_tensor_p: + return lib.ggml_alibi(ctx, a, n_past, n_head, bias_max) + + +lib.ggml_alibi.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, +] +lib.ggml_alibi.restype = ctypes.POINTER(ggml_tensor) + + +# // clamp +# // in-place, returns view(a) +# struct ggml_tensor * ggml_clamp( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# float min, +# float max); +def ggml_clamp( + ctx: ggml_context_p, + a: ggml_tensor_p, + min: Union[ctypes.c_float, float], + max: Union[ctypes.c_float, float], +) -> ggml_tensor_p: + return lib.ggml_clamp(ctx, a, min, max) + + +lib.ggml_clamp.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_float, + ctypes.c_float, +] +lib.ggml_clamp.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_conv_1d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# int s0, // stride +# int p0, // padding +# int d0); // dilation +def ggml_conv_1d( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + s0: Union[ctypes.c_int, int], + p0: Union[ctypes.c_int, int], + d0: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + """Convolution 1D + + Parameters: + a: input tensor + b: filter tensor + s0: stride + p0: padding + d0: dilation + + Returns: + output tensor""" + return lib.ggml_conv_1d(ctx, a, b, s0, p0, d0) + + +lib.ggml_conv_1d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, +] +lib.ggml_conv_1d.restype = ctypes.POINTER(ggml_tensor) + + +# // conv_1d with padding = half +# // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) +# GGML_API struct ggml_tensor* ggml_conv_1d_ph( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# int s, +# int d); +def ggml_conv_1d_ph( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + s: Union[ctypes.c_int, int], + d: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + """Convolution 1D with padding = half + + Parameters: + a: input tensor + b: filter tensor + s: stride + d: dilation + + Returns: + output tensor""" + return lib.ggml_conv_1d_ph(ctx, a, b, s, d) + + +lib.ggml_conv_1d_ph.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, +] +lib.ggml_conv_1d_ph.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_conv_2d( +# struct ggml_context * ctx, +# struct ggml_tensor 
* a, +# struct ggml_tensor * b, +# int s0, +# int s1, +# int p0, +# int p1, +# int d0, +# int d1); +def ggml_conv_2d( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + s0: Union[ctypes.c_int, int], + s1: Union[ctypes.c_int, int], + p0: Union[ctypes.c_int, int], + p1: Union[ctypes.c_int, int], + d0: Union[ctypes.c_int, int], + d1: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + """Convolution 2D + + Parameters: + a: input tensor + b: filter tensor + s0: stride + s1: stride + p0: padding + p1: padding + d0: dilation + d1: dilation + + Returns: + output tensor""" + return lib.ggml_conv_2d(ctx, a, b, s0, s1, p0, p1, d0, d1) + + +lib.ggml_conv_2d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, +] +lib.ggml_conv_2d.restype = ctypes.POINTER(ggml_tensor) + + +# // kernel size is a->ne[0] x a->ne[1] +# // stride is equal to kernel size +# // padding is zero +# // example: +# // a: 16 16 3 768 +# // b: 1024 1024 3 1 +# // res: 64 64 768 1 +# // used in sam +# GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_conv_2d_sk_p0( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Convolution 2D + + Parameters: + a: input tensor + b: filter tensor + + Returns: + output tensor""" + return lib.ggml_conv_2d_sk_p0(ctx, a, b) + + +lib.ggml_conv_2d_sk_p0.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_conv_2d_sk_p0.restype = ctypes.POINTER(ggml_tensor) + + +# // kernel size is a->ne[0] x a->ne[1] +# // stride is 1 +# // padding is half +# // example: +# // a: 3 3 256 256 +# // b: 64 64 256 1 +# // res: 64 64 256 1 +# // used in sam +# GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_conv_2d_s1_ph( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + """Convolution 2D with stride = 1 and padding = half + + Parameters: + a: input tensor + b: filter tensor + + Returns: + output tensor""" + return lib.ggml_conv_2d_s1_ph(ctx, a, b) + + +lib.ggml_conv_2d_s1_ph.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_conv_2d_s1_ph.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# int stride); +def ggml_conv_transpose_2d_p0( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + stride: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + """Convolution Transpose 2D with padding = zero + + Parameters: + a: input tensor + b: filter tensor + stride: stride + + Returns: + output tensor""" + return lib.ggml_conv_transpose_2d_p0(ctx, a, b, stride) + + +lib.ggml_conv_transpose_2d_p0.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_int, +] +lib.ggml_conv_transpose_2d_p0.restype = ctypes.POINTER(ggml_tensor) + +# enum ggml_op_pool { +# GGML_OP_POOL_MAX, +# GGML_OP_POOL_AVG, +# GGML_OP_POOL_COUNT, +# }; +GGML_OP_POOL_MAX = 0 +GGML_OP_POOL_AVG = 1 +GGML_OP_POOL_COUNT = 2 + + +# GGML_API struct ggml_tensor * ggml_pool_1d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# enum ggml_op_pool op, +# int k0, // 
kernel size +# int s0, // stride +# int p0); // padding +def ggml_pool_1d( + ctx: ggml_context_p, + a: ggml_tensor_p, + op: Union[ctypes.c_int, int], + k0: Union[ctypes.c_int, int], + s0: Union[ctypes.c_int, int], + p0: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + """1D Pooling + + Parameters: + a: input tensor + op: pooling operation + k0: kernel size + s0: stride + p0: padding + + Returns: + output tensor""" + return lib.ggml_pool_1d(ctx, a, op, k0, s0, p0) + + +lib.ggml_pool_1d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, +] +lib.ggml_pool_1d.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_pool_2d( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# enum ggml_op_pool op, +# int k0, +# int k1, +# int s0, +# int s1, +# int p0, +# int p1); +def ggml_pool_2d( + ctx: ggml_context_p, + a: ggml_tensor_p, + op: Union[ctypes.c_int, int], + k0: Union[ctypes.c_int, int], + k1: Union[ctypes.c_int, int], + s0: Union[ctypes.c_int, int], + s1: Union[ctypes.c_int, int], + p0: Union[ctypes.c_int, int], + p1: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + """2D Pooling + + Parameters: + a: input tensor + op: pooling operation + k0: kernel size + k1: kernel size + s0: stride + s1: stride + p0: padding + p1: padding + + Returns: + output tensor""" + return lib.ggml_pool_2d(ctx, a, op, k0, k1, s0, s1, p0, p1) + + +lib.ggml_pool_2d.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, +] +lib.ggml_pool_2d.restype = ctypes.POINTER(ggml_tensor) + + +# // nearest interpolate +# // used in stable-diffusion +# GGML_API struct ggml_tensor * ggml_upscale( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int scale_factor); +def ggml_upscale( + ctx: ggml_context_p, + a: ggml_tensor_p, + scale_factor: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + """Upscale + + Parameters: + a: input tensor + scale_factor: scale factor + + Returns: + output tensor""" + return lib.ggml_upscale(ctx, a, scale_factor) + + +lib.ggml_upscale.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, +] +lib.ggml_upscale.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_flash_attn( +# struct ggml_context * ctx, +# struct ggml_tensor * q, +# struct ggml_tensor * k, +# struct ggml_tensor * v, +# bool masked); +def ggml_flash_attn( + ctx: ggml_context_p, + q: ggml_tensor_p, + k: ggml_tensor_p, + v: ggml_tensor_p, + masked: Union[ctypes.c_bool, bool], +) -> ggml_tensor_p: + return lib.ggml_flash_attn(ctx, q, k, v, masked) + + +lib.ggml_flash_attn.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_bool, +] +lib.ggml_flash_attn.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_flash_attn_back( +# struct ggml_context * ctx, +# struct ggml_tensor * q, +# struct ggml_tensor * k, +# struct ggml_tensor * v, +# struct ggml_tensor * d, +# bool masked); +def ggml_flash_attn_back( + ctx: ggml_context_p, + q: ggml_tensor_p, + k: ggml_tensor_p, + v: ggml_tensor_p, + d: ggml_tensor_p, + masked: Union[ctypes.c_bool, bool], +) -> ggml_tensor_p: + return lib.ggml_flash_attn_back(ctx, q, k, v, d, masked) + + +lib.ggml_flash_attn_back.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + 
ctypes.POINTER(ggml_tensor), + ctypes.c_bool, +] +lib.ggml_flash_attn_back.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_flash_ff( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b0, +# struct ggml_tensor * b1, +# struct ggml_tensor * c0, +# struct ggml_tensor * c1); +def ggml_flash_ff( + ctx: ggml_context_p, + a: ggml_tensor_p, + b0: ggml_tensor_p, + b1: ggml_tensor_p, + c0: ggml_tensor_p, + c1: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_flash_ff(ctx, a, b0, b1, c0, c1) + + +lib.ggml_flash_ff.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_flash_ff.restype = ctypes.POINTER(ggml_tensor) + + +# // partition into non-overlapping windows with padding if needed +# // example: +# // a: 768 64 64 1 +# // w: 14 +# // res: 768 14 14 25 +# // used in sam +# GGML_API struct ggml_tensor * ggml_win_part( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int w); +def ggml_win_part( + ctx: ggml_context_p, + a: ggml_tensor_p, + w: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_win_part(ctx, a, w) + + +lib.ggml_win_part.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, +] +lib.ggml_win_part.restype = ctypes.POINTER(ggml_tensor) + + +# // reverse of ggml_win_part +# // used in sam +# GGML_API struct ggml_tensor * ggml_win_unpart( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int w0, +# int h0, +# int w); +def ggml_win_unpart( + ctx: ggml_context_p, + a: ggml_tensor_p, + w0: Union[ctypes.c_int, int], + h0: Union[ctypes.c_int, int], + w: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_win_unpart(ctx, a, w0, h0, w) + + +lib.ggml_win_unpart.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, +] +lib.ggml_win_unpart.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_unary( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# enum ggml_unary_op op); +def ggml_unary( + ctx: ggml_context_p, + a: ggml_tensor_p, + op: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_unary(ctx, a, op) + + +lib.ggml_unary.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, +] +lib.ggml_unary.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_unary_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# enum ggml_unary_op op); +def ggml_unary_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + op: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_unary_inplace(ctx, a, op) + + +lib.ggml_unary_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, +] +lib.ggml_unary_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# // used in sam +# GGML_API struct ggml_tensor * ggml_get_rel_pos( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# int qh, +# int kh); +def ggml_get_rel_pos( + ctx: ggml_context_p, + a: ggml_tensor_p, + qh: Union[ctypes.c_int, int], + kh: Union[ctypes.c_int, int], +) -> ggml_tensor_p: + return lib.ggml_get_rel_pos(ctx, a, qh, kh) + + +lib.ggml_get_rel_pos.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, +] +lib.ggml_get_rel_pos.restype = ctypes.POINTER(ggml_tensor) + + +# // used in sam +# GGML_API struct ggml_tensor * ggml_add_rel_pos( 
+# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * pw, +# struct ggml_tensor * ph); +def ggml_add_rel_pos( + ctx: ggml_context_p, + a: ggml_tensor_p, + pw: ggml_tensor_p, + ph: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_add_rel_pos(ctx, a, pw, ph) + + +lib.ggml_add_rel_pos.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_add_rel_pos.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * pw, +# struct ggml_tensor * ph); +def ggml_add_rel_pos_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + pw: ggml_tensor_p, + ph: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_add_rel_pos_inplace(ctx, a, pw, ph) + + +lib.ggml_add_rel_pos_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_add_rel_pos_inplace.restype = ctypes.POINTER(ggml_tensor) + +# // custom operators (DEPRECATED) + +# typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); +ggml_unary_op_f32_t = ctypes.CFUNCTYPE( + None, ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_float) +) + +# typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); +ggml_binary_op_f32_t = ctypes.CFUNCTYPE( + None, + ctypes.c_int, + ctypes.POINTER(ctypes.c_float), + ctypes.POINTER(ctypes.c_float), + ctypes.POINTER(ctypes.c_float), +) + +# typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); +ggml_custom1_op_f32_t = ctypes.CFUNCTYPE( + None, ctypes.POINTER(ggml_tensor), ctypes.POINTER(ggml_tensor) +) +"""Unary operator function type""" + +# typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); +ggml_custom2_op_f32_t = ctypes.CFUNCTYPE( + None, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +) +"""Binary operator function type""" + +# typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); +ggml_custom3_op_f32_t = ctypes.CFUNCTYPE( + None, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +) +"""Ternary operator function type""" + + +# GGML_API struct ggml_tensor * ggml_map_unary_f32( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# ggml_unary_op_f32_t fun); +def ggml_map_unary_f32( + ctx: ggml_context_p, + a: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore +) -> ggml_tensor_p: + return lib.ggml_map_unary_f32(ctx, a, fun) + + +lib.ggml_map_unary_f32.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ggml_unary_op_f32_t, +] +lib.ggml_map_unary_f32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# ggml_unary_op_f32_t fun); +def ggml_map_unary_inplace_f32( + ctx: ggml_context_p, + a: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore +) -> ggml_tensor_p: + return lib.ggml_map_unary_inplace_f32(ctx, a, fun) + + +lib.ggml_map_unary_inplace_f32.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ggml_unary_op_f32_t, +] +lib.ggml_map_unary_inplace_f32.restype = 
ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_binary_f32( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# ggml_binary_op_f32_t fun); +def ggml_map_binary_f32( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore +) -> ggml_tensor_p: + return lib.ggml_map_binary_f32(ctx, a, b, fun) + + +lib.ggml_map_binary_f32.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ggml_binary_op_f32_t, +] +lib.ggml_map_binary_f32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# ggml_binary_op_f32_t fun); +def ggml_map_binary_inplace_f32( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore +) -> ggml_tensor_p: + return lib.ggml_map_binary_inplace_f32(ctx, a, b, fun) + + +lib.ggml_map_binary_inplace_f32.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ggml_binary_op_f32_t, +] +lib.ggml_map_binary_inplace_f32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_custom1_f32( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# ggml_custom1_op_f32_t fun); +def ggml_map_custom1_f32( + ctx: ggml_context_p, + a: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore +) -> ggml_tensor_p: + """Custom unary operator on a tensor. + + Example: + ```python + import ggml + + @ggml.ggml_custom1_op_f32_t + def custom_op(b: ggml.tensor_p, a: ggml.tensor_p): + # do something with a and copy to b + return + + ... + + b = ggml.ggml_map_custom1_f32(ctx, a, custom_op) + ``` + + Parameters: + a: input tensor + fun (ggml.ggml_custom1_op_f32_t): function to apply to each element + + Returns: + output tensor""" + return lib.ggml_map_custom1_f32(ctx, a, fun) + + +lib.ggml_map_custom1_f32.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ggml_custom1_op_f32_t, +] +lib.ggml_map_custom1_f32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# ggml_custom1_op_f32_t fun); +def ggml_map_custom1_inplace_f32( + ctx: ggml_context_p, + a: ggml_tensor_p, + fun: "ctypes._CFuncPtr", # type: ignore +) -> ggml_tensor_p: + """Custom unary operator on a tensor inplace. + + Parameters: + a: input tensor + fun (ggml.ggml_custom1_op_f32_t): function to apply to each element + + Returns: + output tensor""" + return lib.ggml_map_custom1_inplace_f32(ctx, a, fun) + + +lib.ggml_map_custom1_inplace_f32.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ggml_custom1_op_f32_t, +] +lib.ggml_map_custom1_inplace_f32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_custom2_f32( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# ggml_custom2_op_f32_t fun); +def ggml_map_custom2_f32( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore +) -> ggml_tensor_p: + """Custom binary operator on two tensors. 
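+
+    Example (sketch mirroring the `ggml_map_custom1_f32` example above; the
+    callback receives the destination tensor followed by the two inputs):
+        ```python
+        import ggml
+
+        @ggml.ggml_custom2_op_f32_t
+        def custom_op(dst: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p):
+            # combine a and b element-wise and write the result into dst
+            return
+
+        ...
+
+        c = ggml.ggml_map_custom2_f32(ctx, a, b, custom_op)
+        ```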
+ + Parameters: + a: input tensor + b: input tensor + fun (ggml.ggml_custom2_op_f32_t): function to apply to each element + + Returns: + output tensor""" + return lib.ggml_map_custom2_f32(ctx, a, b, fun) + + +lib.ggml_map_custom2_f32.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ggml_custom2_op_f32_t, +] +lib.ggml_map_custom2_f32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# ggml_custom2_op_f32_t fun); +def ggml_map_custom2_inplace_f32( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore +) -> ggml_tensor_p: + """Custom binary operator on two tensors inplace. + + Parameters: + a: input tensor + b: input tensor + fun (ggml.ggml_custom2_op_f32_t): function to apply to each element + + Returns: + output tensor""" + return lib.ggml_map_custom2_inplace_f32(ctx, a, b, fun) + + +lib.ggml_map_custom2_inplace_f32.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ggml_custom2_op_f32_t, +] +lib.ggml_map_custom2_inplace_f32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_custom3_f32( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# struct ggml_tensor * c, +# ggml_custom3_op_f32_t fun); +def ggml_map_custom3_f32( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + c: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore +) -> ggml_tensor_p: + """Custom ternary operator on three tensors. + + Parameters: + a: input tensor + b: input tensor + c: input tensor + fun (ggml.ggml_custom3_op_f32_t): function to apply to each element + + Returns: + output tensor""" + return lib.ggml_map_custom3_f32(ctx, a, b, c, fun) + + +lib.ggml_map_custom3_f32.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ggml_custom3_op_f32_t, +] +lib.ggml_map_custom3_f32.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# struct ggml_tensor * c, +# ggml_custom3_op_f32_t fun); +def ggml_map_custom3_inplace_f32( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + c: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore +) -> ggml_tensor_p: + """Custom ternary operator on three tensors inplace. 
+ + Parameters: + a: input tensor + b: input tensor + c: input tensor + fun (ggml.ggml_custom3_op_f32_t): function to apply to each element + + Returns: + output tensor""" + return lib.ggml_map_custom3_inplace_f32(ctx, a, b, c, fun) + + +lib.ggml_map_custom3_inplace_f32.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ggml_custom3_op_f32_t, +] +lib.ggml_map_custom3_inplace_f32.restype = ctypes.POINTER(ggml_tensor) + +# // custom operators v2 + +# typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); +ggml_custom1_op_t = ctypes.CFUNCTYPE( + None, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_void_p, +) +"""Custom unary operator on a tensor.""" + +# typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); +ggml_custom2_op_t = ctypes.CFUNCTYPE( + None, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_void_p, +) +"""Custom binary operator on two tensors.""" + +# typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); +ggml_custom3_op_t = ctypes.CFUNCTYPE( + None, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_int, + ctypes.c_int, + ctypes.c_void_p, +) +"""Custom ternary operator on three tensors.""" + +# #define GGML_N_TASKS_MAX -1 +GGML_N_TASKS_MAX = -1 + + +# GGML_API struct ggml_tensor * ggml_map_custom1( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# ggml_custom1_op_t fun, +# int n_tasks, +# void * userdata); +def ggml_map_custom1( + ctx: ggml_context_p, + a: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore + n_tasks: Union[ctypes.c_int, int], + userdata: Optional[ctypes.c_void_p], +) -> ggml_tensor_p: + return lib.ggml_map_custom1(ctx, a, fun, n_tasks, userdata) + + +lib.ggml_map_custom1.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ggml_custom1_op_t, + ctypes.c_int, + ctypes.c_void_p, +] +lib.ggml_map_custom1.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_custom1_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# ggml_custom1_op_t fun, +# int n_tasks, +# void * userdata); +def ggml_map_custom1_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore + n_tasks: Union[ctypes.c_int, int], + userdata: Optional[ctypes.c_void_p], +) -> ggml_tensor_p: + return lib.ggml_map_custom1_inplace(ctx, a, fun, n_tasks, userdata) + + +lib.ggml_map_custom1_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ggml_custom1_op_t, + ctypes.c_int, + ctypes.c_void_p, +] +lib.ggml_map_custom1_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_custom2( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# ggml_custom2_op_t fun, +# int n_tasks, +# void * userdata); +def ggml_map_custom2( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore + n_tasks: Union[ctypes.c_int, int], + userdata: Optional[ctypes.c_void_p], +) -> 
ggml_tensor_p: + return lib.ggml_map_custom2(ctx, a, b, fun, n_tasks, userdata) + + +lib.ggml_map_custom2.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ggml_custom2_op_t, + ctypes.c_int, + ctypes.c_void_p, +] +lib.ggml_map_custom2.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_custom2_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# ggml_custom2_op_t fun, +# int n_tasks, +# void * userdata); +def ggml_map_custom2_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore + n_tasks: Union[ctypes.c_int, int], + userdata: Optional[ctypes.c_void_p], +) -> ggml_tensor_p: + return lib.ggml_map_custom2_inplace(ctx, a, b, fun, n_tasks, userdata) + + +lib.ggml_map_custom2_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ggml_custom2_op_t, + ctypes.c_int, + ctypes.c_void_p, +] +lib.ggml_map_custom2_inplace.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_custom3( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# struct ggml_tensor * c, +# ggml_custom3_op_t fun, +# int n_tasks, +# void * userdata); +def ggml_map_custom3( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + c: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore + n_tasks: Union[ctypes.c_int, int], + userdata: Optional[ctypes.c_void_p], +) -> ggml_tensor_p: + return lib.ggml_map_custom3(ctx, a, b, c, fun, n_tasks, userdata) + + +lib.ggml_map_custom3.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ggml_custom3_op_t, + ctypes.c_int, + ctypes.c_void_p, +] +lib.ggml_map_custom3.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_map_custom3_inplace( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# struct ggml_tensor * c, +# ggml_custom3_op_t fun, +# int n_tasks, +# void * userdata); +def ggml_map_custom3_inplace( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, + c: ggml_tensor_p, + fun: "ctypes._FuncPointer", # type: ignore + n_tasks: Union[ctypes.c_int, int], + userdata: Optional[ctypes.c_void_p], +) -> ggml_tensor_p: + return lib.ggml_map_custom3_inplace(ctx, a, b, c, fun, n_tasks, userdata) + + +lib.ggml_map_custom3_inplace.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ggml_custom3_op_t, + ctypes.c_int, + ctypes.c_void_p, +] +lib.ggml_map_custom3_inplace.restype = ctypes.POINTER(ggml_tensor) + +# // loss function + + +# GGML_API struct ggml_tensor * ggml_cross_entropy_loss( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b); +def ggml_cross_entropy_loss( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_cross_entropy_loss(ctx, a, b) + + +lib.ggml_cross_entropy_loss.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_cross_entropy_loss.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( +# struct ggml_context * ctx, +# struct ggml_tensor * a, +# struct ggml_tensor * b, +# struct ggml_tensor * c); +def ggml_cross_entropy_loss_back( + ctx: ggml_context_p, + a: ggml_tensor_p, + b: 
ggml_tensor_p, + c: ggml_tensor_p, +) -> ggml_tensor_p: + return lib.ggml_cross_entropy_loss_back(ctx, a, b, c) + + +lib.ggml_cross_entropy_loss_back.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_cross_entropy_loss_back.restype = ctypes.POINTER(ggml_tensor) + +# // +# // automatic differentiation +# // + + +# GGML_API void ggml_set_param( +# struct ggml_context * ctx, +# struct ggml_tensor * tensor); +def ggml_set_param(ctx: ggml_context_p, tensor: ggml_tensor_p): + return lib.ggml_set_param(ctx, tensor) + + +lib.ggml_set_param.argtypes = [ggml_context_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_set_param.restype = None + + +# GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); +def ggml_build_forward_expand( + cgraph: ggml_cgraph_p, + tensor: ggml_tensor_p, +): + """Add a tensor to the forward computation graph. This is used to + compute and save the value of the tensor. + + Parameters: + cgraph: The graph. + tensor: The tensor.""" + return lib.ggml_build_forward_expand(cgraph, tensor) + + +lib.ggml_build_forward_expand.argtypes = [ + ctypes.POINTER(ggml_cgraph), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_build_forward_expand.restype = None + + +# GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); +def ggml_build_backward_expand( + ctx: ggml_context_p, + gf: ggml_cgraph_p, + gb: ggml_cgraph_p, + keep: Union[ctypes.c_bool, bool], +): + """Add a tensor to the backward computation graph. This is used to + compute the gradient of the tensor. + + Parameters: + ctx: The context. + gf: The forward graph. + gb: The backward graph. + keep: Whether to keep the tensor.""" + return lib.ggml_build_backward_expand(ctx, gf, gb, keep) + + +lib.ggml_build_backward_expand.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_cgraph), + ctypes.POINTER(ggml_cgraph), + ctypes.c_bool, +] +lib.ggml_build_backward_expand.restype = None + + +# GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); +def ggml_build_forward( + tensor: ggml_tensor_p, +) -> ggml_cgraph: + """Build the forward computation graph. + + Parameters: + tensor: The tensor. + + Returns: + The graph.""" + return lib.ggml_build_forward(tensor) + + +lib.ggml_build_forward.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_build_forward.restype = ggml_cgraph + + +# GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); +def ggml_build_backward( + ctx: ggml_context_p, + gf: ggml_cgraph_p, + keep: Union[ctypes.c_bool, bool], +) -> ggml_cgraph: + return lib.ggml_build_backward(ctx, gf, keep) + + +lib.ggml_build_backward.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_cgraph), + ctypes.c_bool, +] +lib.ggml_build_backward.restype = ggml_cgraph + + +# // graph allocation in a context +# GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); +def ggml_new_graph( + ctx: ggml_context_p, +) -> ggml_cgraph: + """Create a new graph. + + Parameters: + ctx: The context. 
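+
+    Example (illustrative sketch; the 16 MiB context size is an arbitrary
+    assumption for the sketch, not a requirement of the API):
+        ```python
+        import ggml
+
+        params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024, mem_buffer=None)
+        ctx = ggml.ggml_init(params)
+        graph = ggml.ggml_new_graph(ctx)
+        ```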
+ + Returns: + The graph.""" + return lib.ggml_new_graph(ctx) + + +lib.ggml_new_graph.argtypes = [ggml_context_p] +lib.ggml_new_graph.restype = ggml_cgraph + + +# GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor); +def ggml_build_forward_ctx( + ctx: ggml_context_p, + tensor: ggml_tensor_p, +) -> ggml_cgraph: + """Build the forward computation graph in a context. + + Parameters: + ctx: The context. + tensor: The tensor. + + Returns: + The graph.""" + return lib.ggml_build_forward_ctx(ctx, tensor) + + +lib.ggml_build_forward_ctx.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_tensor), +] +lib.ggml_build_forward_ctx.restype = ggml_cgraph + + +# GGML_API size_t ggml_graph_overhead(void); +def ggml_graph_overhead() -> int: + """Get the overhead of the graph.""" + return lib.ggml_graph_overhead() + + +lib.ggml_graph_overhead.argtypes = [] +lib.ggml_graph_overhead.restype = ctypes.c_size_t + + +# // ggml_graph_plan() has to be called before ggml_graph_compute() +# // when plan.work_size > 0, caller must allocate memory for plan.work_data +# GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); +def ggml_graph_plan( + cgraph: ggml_cgraph_p, + n_threads: Union[ctypes.c_int, int] = GGML_DEFAULT_N_THREADS, +) -> ggml_cplan: + """Plan the computation graph. + + Parameters: + cgraph: The graph. + n_threads: The number of threads to use. + + Returns: + The plan.""" + return lib.ggml_graph_plan(cgraph, n_threads) + + +lib.ggml_graph_plan.argtypes = [ + ctypes.POINTER(ggml_cgraph), + ctypes.c_int, +] +lib.ggml_graph_plan.restype = ggml_cplan + + +# GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); +def ggml_graph_compute( + cgraph: ggml_cgraph_p, + cplan: ggml_cplan_p, +) -> int: + """Compute the graph. + + Parameters: + cgraph: The graph. + cplan: The plan.""" + return lib.ggml_graph_compute(cgraph, cplan) + + +lib.ggml_graph_compute.argtypes = [ + ctypes.POINTER(ggml_cgraph), + ctypes.POINTER(ggml_cplan), +] +lib.ggml_graph_compute.restype = ctypes.c_int + + +# GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); +def ggml_graph_reset( + cgraph: ggml_cgraph_p, +): + """Reset the graph. + + Parameters: + cgraph: The graph.""" + return lib.ggml_graph_reset(cgraph) + + +# // same as ggml_graph_compute() but the work data is allocated as a part of the context +# // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data +# GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); +def ggml_graph_compute_with_ctx( + ctx: ggml_context_p, + cgraph: ggml_cgraph_p, + n_threads: Union[ctypes.c_int, int], +): + """Compute the graph with a context. + + Parameters: + ctx: The context. + cgraph: The graph. + n_threads: The number of threads to use.""" + return lib.ggml_graph_compute_with_ctx(ctx, cgraph, n_threads) + + +lib.ggml_graph_compute_with_ctx.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_cgraph), + ctypes.c_int, +] +lib.ggml_graph_compute_with_ctx.restype = None + + +# GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); +def ggml_graph_get_tensor( + cgraph: ggml_cgraph_p, + name: bytes, +) -> ggml_tensor_p: + """Get a tensor from the graph by name. + + Parameters: + cgraph: The graph. + name: The name of the tensor. 
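+
+    Example (illustrative sketch; assumes a context `ctx` and an output tensor
+    `result` built in it that was given the name b"result" earlier, e.g. with
+    `ggml_set_name`, and computes single-threaded):
+        ```python
+        import ctypes
+        import ggml
+
+        # ggml_build_forward returns the graph by value, so pass a pointer below
+        gf = ggml.ggml_build_forward(result)
+        ggml.ggml_graph_compute_with_ctx(ctx, ctypes.pointer(gf), 1)
+        out = ggml.ggml_graph_get_tensor(ctypes.pointer(gf), b"result")
+        ```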
+ + Returns: + The tensor.""" + return lib.ggml_graph_get_tensor(cgraph, name) + + +lib.ggml_graph_get_tensor.argtypes = [ + ctypes.POINTER(ggml_cgraph), + ctypes.c_char_p, +] +lib.ggml_graph_get_tensor.restype = ctypes.POINTER(ggml_tensor) + + +# GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); +def ggml_graph_export( + cgraph: ggml_cgraph_p, + fname: bytes, +): + return lib.ggml_graph_export(cgraph, fname) + + +lib.ggml_graph_export.argtypes = [ + ctypes.POINTER(ggml_cgraph), + ctypes.c_char_p, +] +lib.ggml_graph_export.restype = None + + +# GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); +def ggml_graph_import( + fname: bytes, + ctx_data: "ctypes._Pointer[ggml_context_p]", # type: ignore + ctx_eval: "ctypes._Pointer[ggml_context_p]", # type: ignore +) -> ggml_cgraph: + return lib.ggml_graph_import(fname, ctx_data, ctx_eval) + + +lib.ggml_graph_import.argtypes = [ + ctypes.c_char_p, + ctypes.POINTER(ggml_context_p), + ctypes.POINTER(ggml_context_p), +] +lib.ggml_graph_import.restype = ggml_cgraph + + +# // print info and performance information for the graph +# GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); +def ggml_graph_print( + cgraph: ggml_cgraph_p, +): + return lib.ggml_graph_print(cgraph) + + +lib.ggml_graph_print.argtypes = [ctypes.POINTER(ggml_cgraph)] +lib.ggml_graph_print.restype = None + + +# // dump the graph into a file using the dot format +# GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); +def ggml_graph_dump_dot( + gb: ggml_cgraph_p, + gf: ggml_cgraph_p, + filename: bytes, +): + return lib.ggml_graph_dump_dot(gb, gf, filename) + + +lib.ggml_graph_dump_dot.argtypes = [ + ctypes.POINTER(ggml_cgraph), + ctypes.POINTER(ggml_cgraph), + ctypes.c_char_p, +] +lib.ggml_graph_dump_dot.restype = None + + +# // +# // optimization +# // + +# // optimization methods +# enum ggml_opt_type { +# GGML_OPT_ADAM, +# GGML_OPT_LBFGS, +# }; +GGML_OPT_ADAM = 0 +GGML_OPT_LBFGS = 1 + +# // linesearch methods +# enum ggml_linesearch { +# GGML_LINESEARCH_DEFAULT = 1, + +# GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, +# GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, +# GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, +# }; +GGML_LINESEARCH_DEFAULT = 1 +GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0 +GGML_LINESEARCH_BACKTRACKING_WOLFE = 1 +GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2 + +# // optimization return values +# enum ggml_opt_result { +# GGML_OPT_OK = 0, +# GGML_OPT_DID_NOT_CONVERGE, +# GGML_OPT_NO_CONTEXT, +# GGML_OPT_INVALID_WOLFE, +# GGML_OPT_FAIL, + +# GGML_LINESEARCH_FAIL = -128, +# GGML_LINESEARCH_MINIMUM_STEP, +# GGML_LINESEARCH_MAXIMUM_STEP, +# GGML_LINESEARCH_MAXIMUM_ITERATIONS, +# GGML_LINESEARCH_INVALID_PARAMETERS, +# }; +GGML_OPT_OK = 0 +GGML_OPT_DID_NOT_CONVERGE = 1 +GGML_OPT_NO_CONTEXT = 2 +GGML_OPT_INVALID_WOLFE = 3 +GGML_OPT_FAIL = 4 +GGML_LINESEARCH_FAIL = -128 +GGML_LINESEARCH_MINIMUM_STEP = -127 +GGML_LINESEARCH_MAXIMUM_STEP = -126 +GGML_LINESEARCH_MAXIMUM_ITERATIONS = -125 +GGML_LINESEARCH_INVALID_PARAMETERS = -124 + +# typedef void (*ggml_opt_callback)(void * data, float * sched); +ggml_opt_callback = ctypes.CFUNCTYPE( + None, + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_float), +) + +# // optimization parameters +# // +# // see ggml.c (ggml_opt_default_params) for default values +# // +# struct ggml_opt_params { +# enum ggml_opt_type type; + +# int n_threads; + +# // 
delta-based convergence test +# // +# // if past == 0 - disabled +# // if past > 0: +# // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) +# // +# int past; +# float delta; + +# // maximum number of iterations without improvement +# // +# // if 0 - disabled +# // if > 0: +# // assume convergence if no cost improvement in this number of iterations +# // +# int max_no_improvement; + +# bool print_forward_graph; +# bool print_backward_graph; + +# // ADAM parameters +# struct { +# int n_iter; + +# float sched; // schedule multiplier (fixed, decay or warmup) +# float decay; // weight decay for AdamW, use 0.0f to disable +# int decay_min_ndim; // minimum number of tensor dimension to apply weight decay +# float alpha; // learning rate +# float beta1; +# float beta2; +# float eps; // epsilon for numerical stability +# float eps_f; // epsilon for convergence test +# float eps_g; // epsilon for convergence test +# float gclip; // gradient clipping +# } adam; + +# // LBFGS parameters +# struct { +# int m; // number of corrections to approximate the inv. Hessian +# int n_iter; +# int max_linesearch; + +# float eps; // convergence tolerance +# float ftol; // line search tolerance +# float wolfe; +# float min_step; +# float max_step; + +# enum ggml_linesearch linesearch; +# } lbfgs; +# }; + + +class ggml_opt_params_adam(ctypes.Structure): + _fields_ = [ + ("n_iter", ctypes.c_int), + ("sched", ctypes.c_float), + ("decay", ctypes.c_float), + ("decay_min_ndim", ctypes.c_int), + ("alpha", ctypes.c_float), + ("beta1", ctypes.c_float), + ("beta2", ctypes.c_float), + ("eps", ctypes.c_float), + ("eps_f", ctypes.c_float), + ("eps_g", ctypes.c_float), + ("gclip", ctypes.c_float), + ] + + +class ggml_opt_params_lbfgs(ctypes.Structure): + _fields_ = [ + ("m", ctypes.c_int), + ("n_iter", ctypes.c_int), + ("max_linesearch", ctypes.c_int), + ("eps", ctypes.c_float), + ("ftol", ctypes.c_float), + ("wolfe", ctypes.c_float), + ("min_step", ctypes.c_float), + ("max_step", ctypes.c_float), + ("linesearch", ctypes.c_int), + ] + + +class ggml_opt_params(ctypes.Structure): + _fields_ = [ + ("type", ctypes.c_int), + ("n_threads", ctypes.c_int), + ("past", ctypes.c_int), + ("delta", ctypes.c_float), + ("max_no_improvement", ctypes.c_int), + ("print_forward_graph", ctypes.c_bool), + ("print_backward_graph", ctypes.c_bool), + ("adam", ggml_opt_params_adam), + ("lbfgs", ggml_opt_params_lbfgs), + ] + + +# struct ggml_opt_context { +# struct ggml_context * ctx; +# struct ggml_opt_params params; + +# int iter; +# int64_t nx; // number of parameter elements + +# bool just_initialized; + +# float loss_before; +# float loss_after; + +# struct { +# struct ggml_tensor * m; // first moment +# struct ggml_tensor * v; // second moment +# struct ggml_tensor * pf; // past function values +# float fx_best; +# float fx_prev; +# int n_no_improvement; +# } adam; + +# struct { +# struct ggml_tensor * x; // current parameters +# struct ggml_tensor * xp; // previous parameters +# struct ggml_tensor * g; // current gradient +# struct ggml_tensor * gp; // previous gradient +# struct ggml_tensor * d; // search direction +# struct ggml_tensor * pf; // past function values +# struct ggml_tensor * lmal; // the L-BFGS memory alpha +# struct ggml_tensor * lmys; // the L-BFGS memory ys +# struct ggml_tensor * lms; // the L-BFGS memory s +# struct ggml_tensor * lmy; // the L-BFGS memory y +# float fx_best; +# float step; +# int j; +# int k; +# int end; +# int n_no_improvement; +# } lbfgs; +# }; + + +class ggml_opt_context_adam(ctypes.Structure): + 
_fields_ = [ + ("m", ctypes.POINTER(ggml_tensor)), + ("v", ctypes.POINTER(ggml_tensor)), + ("pf", ctypes.POINTER(ggml_tensor)), + ("fx_best", ctypes.c_float), + ("fx_prev", ctypes.c_float), + ("n_no_improvement", ctypes.c_int), + ] + + +class ggml_opt_context_lbfgs(ctypes.Structure): + _fields_ = [ + ("x", ctypes.POINTER(ggml_tensor)), + ("xp", ctypes.POINTER(ggml_tensor)), + ("g", ctypes.POINTER(ggml_tensor)), + ("gp", ctypes.POINTER(ggml_tensor)), + ("d", ctypes.POINTER(ggml_tensor)), + ("pf", ctypes.POINTER(ggml_tensor)), + ("lmal", ctypes.POINTER(ggml_tensor)), + ("lmys", ctypes.POINTER(ggml_tensor)), + ("lms", ctypes.POINTER(ggml_tensor)), + ("lmy", ctypes.POINTER(ggml_tensor)), + ("fx_best", ctypes.c_float), + ("step", ctypes.c_float), + ("j", ctypes.c_int), + ("k", ctypes.c_int), + ("end", ctypes.c_int), + ("n_no_improvement", ctypes.c_int), + ] + + +class ggml_opt_context(ctypes.Structure): + _fields_ = [ + ("ctx", ggml_context_p), + ("params", ggml_opt_params), + ("iter", ctypes.c_int), + ("nx", ctypes.c_int64), + ("just_initialized", ctypes.c_bool), + ("loss_before", ctypes.c_float), + ("loss_after", ctypes.c_float), + ("adam", ggml_opt_context_adam), + ("lbfgs", ggml_opt_context_lbfgs), + ] + + +ggml_opt_context_p = ctypes.POINTER(ggml_opt_context) + + +# GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); +def ggml_opt_default_params(type: Union[ctypes.c_int, bool]) -> ggml_opt_params: + return lib.ggml_opt_default_params(type) + + +lib.ggml_opt_default_params.argtypes = [ctypes.c_int] +lib.ggml_opt_default_params.restype = ggml_opt_params + + +# // optimize the function defined by the tensor f +# GGML_API enum ggml_opt_result ggml_opt( +# struct ggml_context * ctx, +# struct ggml_opt_params params, +# struct ggml_tensor * f); +def ggml_opt( + ctx: ggml_context_p, + params: ggml_opt_params, + f: ggml_tensor_p, +) -> int: + return lib.ggml_opt(ctx, params, f) + + +lib.ggml_opt.argtypes = [ggml_context_p, ggml_opt_params, ctypes.POINTER(ggml_tensor)] +lib.ggml_opt.restype = ctypes.c_int + + +# // initialize optimizer context +# GGML_API void ggml_opt_init( +# struct ggml_context * ctx, +# struct ggml_opt_context * opt, +# struct ggml_opt_params params, +# int64_t nx); +def ggml_opt_init( + ctx: ggml_context_p, + opt: "ctypes._Pointer[ggml_opt_context]", # type: ignore + params: ggml_opt_params, + nx: Union[ctypes.c_int64, int], +): + return lib.ggml_opt_init(ctx, opt, params, nx) + + +lib.ggml_opt_init.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_opt_context), + ggml_opt_params, + ctypes.c_int64, +] +lib.ggml_opt_init.restype = None + + +# // continue optimizing the function defined by the tensor f +# GGML_API enum ggml_opt_result ggml_opt_resume( +# struct ggml_context * ctx, +# struct ggml_opt_context * opt, +# struct ggml_tensor * f); +def ggml_opt_resume( + ctx: ggml_context_p, + opt: "ctypes._Pointer[ggml_opt_context]", # type: ignore + f: ggml_tensor_p, +) -> int: + return lib.ggml_opt_resume(ctx, opt, f) + + +lib.ggml_opt_resume.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_opt_context), + ctypes.POINTER(ggml_tensor), +] +lib.ggml_opt_resume.restype = ctypes.c_int + +# // continue optimizing the function defined by the tensor f +# GGML_API enum ggml_opt_result ggml_opt_resume_g( +# struct ggml_context * ctx, +# struct ggml_opt_context * opt, +# struct ggml_tensor * f, +# struct ggml_cgraph * gf, +# struct ggml_cgraph * gb, +# ggml_opt_callback callback, +# void * callback_data); + + +# // continue optimizing the function defined 
by the tensor f +# GGML_API enum ggml_opt_result ggml_opt_resume_g( +# struct ggml_context * ctx, +# struct ggml_opt_context * opt, +# struct ggml_tensor * f, +# struct ggml_cgraph * gf, +# struct ggml_cgraph * gb); +def ggml_opt_resume_g( + ctx: ggml_context_p, + opt: "ctypes._Pointer[ggml_opt_context]", # type: ignore + f: ggml_tensor_p, + gf: ggml_cgraph_p, + gb: ggml_cgraph_p, + callback: ggml_opt_callback = None, + callback_data: ctypes.c_void_p = None, +) -> int: + return lib.ggml_opt_resume_g(ctx, opt, f, gf, gb, callback, callback_data) + + +lib.ggml_opt_resume_g.argtypes = [ + ggml_context_p, + ctypes.POINTER(ggml_opt_context), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_cgraph), + ctypes.POINTER(ggml_cgraph), + ggml_opt_callback, + ctypes.c_void_p, +] +lib.ggml_opt_resume_g.restype = ctypes.c_int + +# // +# // quantization +# // + + +# GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); +def ggml_quantize_q4_0( + src: CFloatArray, + dst: ctypes.c_void_p, + n: Union[ctypes.c_int, int], + k: Union[ctypes.c_int, int], + hist: CInt64Array, +) -> int: + return lib.ggml_quantize_q4_0(src, dst, n, k, hist) + + +lib.ggml_quantize_q4_0.argtypes = [ + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.POINTER(ctypes.c_int64), +] +lib.ggml_quantize_q4_0.restype = ctypes.c_size_t + + +# GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); +def ggml_quantize_q4_1( + src: CFloatArray, + dst: ctypes.c_void_p, + n: Union[ctypes.c_int, int], + k: Union[ctypes.c_int, int], + hist: CInt64Array, +) -> int: + return lib.ggml_quantize_q4_1(src, dst, n, k, hist) + + +lib.ggml_quantize_q4_1.argtypes = [ + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.POINTER(ctypes.c_int64), +] +lib.ggml_quantize_q4_1.restype = ctypes.c_size_t + + +# GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); +def ggml_quantize_q5_0( + src: CFloatArray, + dst: ctypes.c_void_p, + n: Union[ctypes.c_int, int], + k: Union[ctypes.c_int, int], + hist: CInt64Array, +) -> int: + return lib.ggml_quantize_q5_0(src, dst, n, k, hist) + + +lib.ggml_quantize_q5_0.argtypes = [ + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.POINTER(ctypes.c_int64), +] +lib.ggml_quantize_q5_0.restype = ctypes.c_size_t + + +# GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); +def ggml_quantize_q5_1( + src: CFloatArray, + dst: ctypes.c_void_p, + n: Union[ctypes.c_int, int], + k: Union[ctypes.c_int, int], + hist: CInt64Array, +) -> int: + return lib.ggml_quantize_q5_1(src, dst, n, k, hist) + + +lib.ggml_quantize_q5_1.argtypes = [ + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.POINTER(ctypes.c_int64), +] +lib.ggml_quantize_q5_1.restype = ctypes.c_size_t + + +# GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); +def ggml_quantize_q8_0( + src: CFloatArray, + dst: ctypes.c_void_p, + n: Union[ctypes.c_int, int], + k: Union[ctypes.c_int, int], + hist: CInt64Array, +) -> int: + return lib.ggml_quantize_q8_0(src, dst, n, k, hist) + + +lib.ggml_quantize_q8_0.argtypes = [ + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.POINTER(ctypes.c_int64), +] +lib.ggml_quantize_q8_0.restype = ctypes.c_size_t + + +# GGML_API size_t 
ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); +def ggml_quantize_chunk( + type: Union[ctypes.c_int, int], + src: CFloatArray, + dst: ctypes.c_void_p, + start: Union[ctypes.c_int, int], + n: Union[ctypes.c_int, int], + hist: CInt64Array, +) -> int: + return lib.ggml_quantize_chunk(type, src, dst, start, n, hist) + + +lib.ggml_quantize_chunk.argtypes = [ + ctypes.c_int, + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.POINTER(ctypes.c_int64), +] +lib.ggml_quantize_chunk.restype = ctypes.c_size_t + +# // +# // gguf +# // + +# enum gguf_type { +# GGUF_TYPE_UINT8 = 0, +# GGUF_TYPE_INT8 = 1, +# GGUF_TYPE_UINT16 = 2, +# GGUF_TYPE_INT16 = 3, +# GGUF_TYPE_UINT32 = 4, +# GGUF_TYPE_INT32 = 5, +# GGUF_TYPE_FLOAT32 = 6, +# GGUF_TYPE_BOOL = 7, +# GGUF_TYPE_STRING = 8, +# GGUF_TYPE_ARRAY = 9, +# GGUF_TYPE_UINT64 = 10, +# GGUF_TYPE_INT64 = 11, +# GGUF_TYPE_FLOAT64 = 12, +# GGUF_TYPE_COUNT, // marks the end of the enum +# }; +GGUF_TYPE_UINT8 = 0 +GGUF_TYPE_INT8 = 1 +GGUF_TYPE_UINT16 = 2 +GGUF_TYPE_INT16 = 3 +GGUF_TYPE_UINT32 = 4 +GGUF_TYPE_INT32 = 5 +GGUF_TYPE_FLOAT32 = 6 +GGUF_TYPE_BOOL = 7 +GGUF_TYPE_STRING = 8 +GGUF_TYPE_ARRAY = 9 +GGUF_TYPE_COUNT = 10 + +# struct gguf_context; +gguf_context_p = ctypes.c_void_p + +# struct gguf_init_params { +# bool no_alloc; + + +# // if not NULL, create a ggml_context and allocate the tensor data in it +# struct ggml_context ** ctx; +# }; +class gguf_init_params(ctypes.Structure): + _fields_ = [ + ("no_alloc", ctypes.c_bool), + ("ctx", ctypes.POINTER(ggml_context_p)), + ] + + +# GGML_API struct gguf_context * gguf_init_empty(void); +def gguf_init_empty() -> gguf_context_p: + return lib.gguf_init_empty() + + +lib.gguf_init_empty.argtypes = [] +lib.gguf_init_empty.restype = gguf_context_p + + +# GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); +def gguf_init_from_file( + fname: bytes, + params: gguf_init_params, +) -> gguf_context_p: + return lib.gguf_init_from_file(fname, params) + + +lib.gguf_init_from_file.argtypes = [ + ctypes.c_char_p, + gguf_init_params, +] +lib.gguf_init_from_file.restype = gguf_context_p + +# //GGML_API struct gguf_context * gguf_init_from_buffer(..); + + +# GGML_API void gguf_free(struct gguf_context * ctx); +def gguf_free( + ctx: gguf_context_p, +): + return lib.gguf_free(ctx) + + +lib.gguf_free.argtypes = [ + gguf_context_p, +] +lib.gguf_free.restype = None + + +# GGML_API const char * gguf_type_name(enum gguf_type type); +def gguf_type_name( + type: Union[ctypes.c_int, int], +) -> bytes: + return lib.gguf_type_name(type) + + +lib.gguf_type_name.argtypes = [ + ctypes.c_int, +] +lib.gguf_type_name.restype = ctypes.c_char_p + + +# GGML_API int gguf_get_version (const struct gguf_context * ctx); +def gguf_get_version( + ctx: gguf_context_p, +) -> int: + return lib.gguf_get_version(ctx) + + +lib.gguf_get_version.argtypes = [ + gguf_context_p, +] +lib.gguf_get_version.restype = ctypes.c_int + + +# GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); +def gguf_get_alignment( + ctx: gguf_context_p, +) -> int: + return lib.gguf_get_alignment(ctx) + + +lib.gguf_get_alignment.argtypes = [ + gguf_context_p, +] +lib.gguf_get_alignment.restype = ctypes.c_size_t + + +# GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); +def gguf_get_data_offset( + ctx: gguf_context_p, +) -> int: + return lib.gguf_get_data_offset(ctx) + + +lib.gguf_get_data_offset.argtypes = [ + 
gguf_context_p, +] +lib.gguf_get_data_offset.restype = ctypes.c_size_t + + +# GGML_API void * gguf_get_data (const struct gguf_context * ctx); +def gguf_get_data( + ctx: gguf_context_p, +) -> ctypes.c_void_p: + return lib.gguf_get_data(ctx) + + +lib.gguf_get_data.argtypes = [ + gguf_context_p, +] +lib.gguf_get_data.restype = ctypes.c_void_p + + +# GGML_API int gguf_get_n_kv(const struct gguf_context * ctx); +def gguf_get_n_kv( + ctx: gguf_context_p, +) -> int: + return lib.gguf_get_n_kv(ctx) + + +lib.gguf_get_n_kv.argtypes = [ + gguf_context_p, +] +lib.gguf_get_n_kv.restype = ctypes.c_int + + +# GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key); +def gguf_find_key( + ctx: gguf_context_p, + key: bytes, +) -> int: + return lib.gguf_find_key(ctx, key) + + +lib.gguf_find_key.argtypes = [ + gguf_context_p, + ctypes.c_char_p, +] +lib.gguf_find_key.restype = ctypes.c_int + + +# GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i); +def gguf_get_key( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> bytes: + return lib.gguf_get_key(ctx, i) + + +lib.gguf_get_key.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_key.restype = ctypes.c_char_p + + +# GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i); +def gguf_get_kv_type( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_kv_type(ctx, i) + + +lib.gguf_get_kv_type.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_kv_type.restype = ctypes.c_int + + +# GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i); +def gguf_get_arr_type( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_arr_type(ctx, i) + + +lib.gguf_get_arr_type.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_arr_type.restype = ctypes.c_int + + +# // results are undefined if the wrong type is used for the key +# GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i); +def gguf_get_val_u8( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_val_u8(ctx, i) + + +lib.gguf_get_val_u8.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_val_u8.restype = ctypes.c_uint8 + + +# GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i); +def gguf_get_val_i8( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_val_i8(ctx, i) + + +lib.gguf_get_val_i8.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_val_i8.restype = ctypes.c_int8 + + +# GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i); +def gguf_get_val_u16( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_val_u16(ctx, i) + + +lib.gguf_get_val_u16.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_val_u16.restype = ctypes.c_uint16 + + +# GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i); +def gguf_get_val_i16( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_val_i16(ctx, i) + + +lib.gguf_get_val_i16.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_val_i16.restype = ctypes.c_int16 + + +# GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i); +def gguf_get_val_u32( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_val_u32(ctx, i) + + +lib.gguf_get_val_u32.argtypes = [ + gguf_context_p, 
+ ctypes.c_int, +] +lib.gguf_get_val_u32.restype = ctypes.c_uint32 + + +# GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i); +def gguf_get_val_i32( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_val_i32(ctx, i) + + +lib.gguf_get_val_i32.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_val_i32.restype = ctypes.c_int32 + + +# GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i); +def gguf_get_val_f32( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> float: + return lib.gguf_get_val_f32(ctx, i) + + +lib.gguf_get_val_f32.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_val_f32.restype = ctypes.c_float + + +# GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i); +def gguf_get_val_u64( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_val_u64(ctx, i) + + +lib.gguf_get_val_u64.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_val_u64.restype = ctypes.c_uint64 + + +# GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i); +def gguf_get_val_i64( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_val_i64(ctx, i) + + +lib.gguf_get_val_i64.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_val_i64.restype = ctypes.c_int64 + + +# GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i); +def gguf_get_val_f64( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> float: + return lib.gguf_get_val_f64(ctx, i) + + +lib.gguf_get_val_f64.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_val_f64.restype = ctypes.c_double + + +# GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i); +def gguf_get_val_bool( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> bool: + return lib.gguf_get_val_bool(ctx, i) + + +lib.gguf_get_val_bool.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_val_bool.restype = ctypes.c_bool + + +# GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i); +def gguf_get_val_str( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> bytes: + return lib.gguf_get_val_str(ctx, i) + + +lib.gguf_get_val_str.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_val_str.restype = ctypes.c_char_p + + +# GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i); +def gguf_get_arr_n( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_arr_n(ctx, i) + + +lib.gguf_get_arr_n.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_arr_n.restype = ctypes.c_int + + +# GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i); +def gguf_get_arr_data( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> ctypes.c_void_p: + return lib.gguf_get_arr_data(ctx, i) + + +lib.gguf_get_arr_data.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_arr_data.restype = ctypes.c_void_p + + +# GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); +def gguf_get_arr_str( + ctx: gguf_context_p, + key_id: Union[ctypes.c_int, int], + i: Union[ctypes.c_int, int], +) -> bytes: + return lib.gguf_get_arr_str(ctx, key_id, i) + + +lib.gguf_get_arr_str.argtypes = [ + gguf_context_p, + ctypes.c_int, + ctypes.c_int, +] +lib.gguf_get_arr_str.restype = ctypes.c_char_p + + +# GGML_API int gguf_get_n_tensors (const struct gguf_context * 
ctx); +def gguf_get_n_tensors( + ctx: gguf_context_p, +) -> int: + return lib.gguf_get_n_tensors(ctx) + + +lib.gguf_get_n_tensors.argtypes = [ + gguf_context_p, +] +lib.gguf_get_n_tensors.restype = ctypes.c_int + + +# GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); +def gguf_find_tensor( + ctx: gguf_context_p, + name: bytes, +) -> int: + return lib.gguf_find_tensor(ctx, name) + + +lib.gguf_find_tensor.argtypes = [ + gguf_context_p, + ctypes.c_char_p, +] +lib.gguf_find_tensor.restype = ctypes.c_int + + +# GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); +def gguf_get_tensor_offset( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> int: + return lib.gguf_get_tensor_offset(ctx, i) + + +lib.gguf_get_tensor_offset.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_tensor_offset.restype = ctypes.c_size_t + + +# GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); +def gguf_get_tensor_name( + ctx: gguf_context_p, + i: Union[ctypes.c_int, int], +) -> bytes: + return lib.gguf_get_tensor_name(ctx, i) + + +lib.gguf_get_tensor_name.argtypes = [ + gguf_context_p, + ctypes.c_int, +] +lib.gguf_get_tensor_name.restype = ctypes.c_char_p + + +# // overrides existing values or adds a new one +# GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); +def gguf_set_val_u8( + ctx: gguf_context_p, + key: bytes, + val: Union[ctypes.c_uint8, int], +): + return lib.gguf_set_val_u8(ctx, key, val) + + +lib.gguf_set_val_u8.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_uint8, +] +lib.gguf_set_val_u8.restype = None + + +# GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); +def gguf_set_val_i8( + ctx: gguf_context_p, + key: bytes, + val: Union[ctypes.c_int8, int], +): + return lib.gguf_set_val_i8(ctx, key, val) + + +lib.gguf_set_val_i8.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_int8, +] +lib.gguf_set_val_i8.restype = None + + +# GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); +def gguf_set_val_u16( + ctx: gguf_context_p, + key: bytes, + val: Union[ctypes.c_uint16, int], +): + return lib.gguf_set_val_u16(ctx, key, val) + + +lib.gguf_set_val_u16.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_uint16, +] +lib.gguf_set_val_u16.restype = None + + +# GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); +def gguf_set_val_i16( + ctx: gguf_context_p, + key: bytes, + val: Union[ctypes.c_int16, int], +): + return lib.gguf_set_val_i16(ctx, key, val) + + +lib.gguf_set_val_i16.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_int16, +] +lib.gguf_set_val_i16.restype = None + + +# GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); +def gguf_set_val_u32( + ctx: gguf_context_p, + key: bytes, + val: Union[ctypes.c_uint32, int], +): + return lib.gguf_set_val_u32(ctx, key, val) + + +lib.gguf_set_val_u32.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_uint32, +] +lib.gguf_set_val_u32.restype = None + + +# GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); +def gguf_set_val_i32( + ctx: gguf_context_p, + key: bytes, + val: Union[ctypes.c_int32, int], +): + return lib.gguf_set_val_i32(ctx, key, val) + + +lib.gguf_set_val_i32.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_int32, +] +lib.gguf_set_val_i32.restype = None + 
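# Usage sketch: reading GGUF metadata with the bindings defined above. The file
# name "model.gguf" and the helper name "_example_dump_gguf_metadata" are
# hypothetical placeholders; only the wrapped calls themselves
# (gguf_init_from_file, gguf_get_n_kv, gguf_get_key, gguf_get_val_str,
# gguf_get_n_tensors, gguf_get_tensor_name, gguf_free) come from this module.
def _example_dump_gguf_metadata(fname: bytes = b"model.gguf") -> None:
    # no_alloc=True and ctx=None: parse only the header/KV data, without
    # creating a ggml context or allocating tensor data.
    params = gguf_init_params(no_alloc=True, ctx=None)
    gctx = gguf_init_from_file(fname, params)
    if not gctx:
        raise RuntimeError("failed to open GGUF file")
    try:
        # Walk the key/value pairs stored in the header and print string values.
        for i in range(gguf_get_n_kv(gctx)):
            key = gguf_get_key(gctx, i)
            if gguf_get_kv_type(gctx, i) == GGUF_TYPE_STRING:
                print(key.decode(), "=", gguf_get_val_str(gctx, i).decode())
        # List the tensors described by the file.
        for i in range(gguf_get_n_tensors(gctx)):
            print("tensor:", gguf_get_tensor_name(gctx, i).decode())
    finally:
        gguf_free(gctx)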
+ +# GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); +def gguf_set_val_f32( + ctx: gguf_context_p, + key: bytes, + val: Union[ctypes.c_float, float], +): + return lib.gguf_set_val_f32(ctx, key, val) + + +lib.gguf_set_val_f32.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_float, +] +lib.gguf_set_val_f32.restype = None + + +# GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); +def gguf_set_val_u64( + ctx: gguf_context_p, + key: bytes, + val: Union[ctypes.c_uint64, int], +): + return lib.gguf_set_val_u64(ctx, key, val) + + +lib.gguf_set_val_u64.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_uint64, +] +lib.gguf_set_val_u64.restype = None + + +# GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); +def gguf_set_val_i64( + ctx: gguf_context_p, + key: bytes, + val: Union[ctypes.c_int64, int], +): + return lib.gguf_set_val_i64(ctx, key, val) + + +lib.gguf_set_val_i64.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_int64, +] +lib.gguf_set_val_i64.restype = None + + +# GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); +def gguf_set_val_f64( + ctx: gguf_context_p, + key: bytes, + val: Union[ctypes.c_double, float], +): + return lib.gguf_set_val_f64(ctx, key, val) + + +lib.gguf_set_val_f64.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_double, +] +lib.gguf_set_val_f64.restype = None + + +# GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); +def gguf_set_val_bool( + ctx: gguf_context_p, + key: bytes, + val: Union[ctypes.c_bool, bool], +): + return lib.gguf_set_val_bool(ctx, key, val) + + +lib.gguf_set_val_bool.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_bool, +] +lib.gguf_set_val_bool.restype = None + + +# GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); +def gguf_set_val_str( + ctx: gguf_context_p, + key: bytes, + val: bytes, +): + return lib.gguf_set_val_str(ctx, key, val) + + +lib.gguf_set_val_str.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_char_p, +] +lib.gguf_set_val_str.restype = None + + +# GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n); +def gguf_set_arr_data( + ctx: gguf_context_p, + key: bytes, + type: Union[ctypes.c_int, int], + data: ctypes.c_void_p, + n: Union[ctypes.c_int, int], +): + return lib.gguf_set_arr_data(ctx, key, type, data, n) + + +lib.gguf_set_arr_data.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_int, +] +lib.gguf_set_arr_data.restype = None + + +# GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n); +def gguf_set_arr_str( + ctx: gguf_context_p, + key: bytes, + data: CCharPointer, + n: Union[ctypes.c_int, int], +): + return lib.gguf_set_arr_str(ctx, key, data, n) + + +lib.gguf_set_arr_str.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.POINTER(ctypes.c_char_p), + ctypes.c_int, +] +lib.gguf_set_arr_str.restype = None + + +# // set or add KV pairs from another context +# GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src); +def gguf_set_kv( + ctx: gguf_context_p, + src: gguf_context_p, +): + return lib.gguf_set_kv(ctx, src) + + +lib.gguf_set_kv.argtypes = [ + gguf_context_p, + gguf_context_p, +] +lib.gguf_set_kv.restype = None + + +# // 
manage tensor info +# GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); +def gguf_add_tensor( + ctx: gguf_context_p, + tensor: ggml_tensor_p, +): + return lib.gguf_add_tensor(ctx, tensor) + + +lib.gguf_add_tensor.argtypes = [ + gguf_context_p, + ctypes.POINTER(ggml_tensor), +] +lib.gguf_add_tensor.restype = None + + +# GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); +def gguf_set_tensor_type( + ctx: gguf_context_p, + name: bytes, + type: Union[ctypes.c_int, int], +): + return lib.gguf_set_tensor_type(ctx, name, type) + + +lib.gguf_set_tensor_type.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_int, +] +lib.gguf_set_tensor_type.restype = None + + +# GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size); +def gguf_set_tensor_data( + ctx: gguf_context_p, + name: bytes, + data: ctypes.c_void_p, + size: Union[ctypes.c_size_t, int], +): + return lib.gguf_set_tensor_data(ctx, name, data, size) + + +lib.gguf_set_tensor_data.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_void_p, + ctypes.c_size_t, +] +lib.gguf_set_tensor_data.restype = None + +# // writing gguf files can be done in 2 ways: +# // +# // - write the entire gguf_context to a binary file in a single pass: +# // +# // gguf_write_to_file(ctx, fname); +# // +# // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: +# // +# // FILE * f = fopen(fname, "wb"); +# // fseek(f, gguf_get_meta_size(ctx), SEEK_SET); +# // fwrite(f, ...); +# // void * data = gguf_meta_get_meta_data(ctx); +# // fseek(f, 0, SEEK_SET); +# // fwrite(f, data, gguf_get_meta_size(ctx)); +# // free(data); +# // fclose(f); +# // + + +# // write the entire context to a binary file +# GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); +def gguf_write_to_file( + ctx: gguf_context_p, + fname: bytes, + only_meta: Union[ctypes.c_bool, bool], +): + return lib.gguf_write_to_file(ctx, fname, only_meta) + + +lib.gguf_write_to_file.argtypes = [ + gguf_context_p, + ctypes.c_char_p, + ctypes.c_bool, +] +lib.gguf_write_to_file.restype = None + + +# // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding +# GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); +def gguf_get_meta_size( + ctx: gguf_context_p, +) -> int: + return lib.gguf_get_meta_size(ctx) + + +lib.gguf_get_meta_size.argtypes = [ + gguf_context_p, +] +lib.gguf_get_meta_size.restype = ctypes.c_size_t + + +# GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); +def gguf_get_meta_data( + ctx: gguf_context_p, + data: ctypes.c_void_p, +): + return lib.gguf_get_meta_data(ctx, data) + + +lib.gguf_get_meta_data.argtypes = [ + gguf_context_p, + ctypes.c_void_p, +] +lib.gguf_get_meta_data.restype = None + + +# // +# // system info +# // + + +# GGML_API int ggml_cpu_has_avx (void); +def ggml_cpu_has_avx() -> int: + return lib.ggml_cpu_has_avx() + + +lib.ggml_cpu_has_avx.argtypes = [] +lib.ggml_cpu_has_avx.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_avx2 (void); +def ggml_cpu_has_avx2() -> int: + return lib.ggml_cpu_has_avx2() + + +lib.ggml_cpu_has_avx2.argtypes = [] +lib.ggml_cpu_has_avx2.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_avx512 (void); +def ggml_cpu_has_avx512() -> int: + return lib.ggml_cpu_has_avx512() + + 
+lib.ggml_cpu_has_avx512.argtypes = [] +lib.ggml_cpu_has_avx512.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_avx512_vbmi(void); +def ggml_cpu_has_avx512_vbmi() -> int: + return lib.ggml_cpu_has_avx512_vbmi() + + +lib.ggml_cpu_has_avx512_vbmi.argtypes = [] +lib.ggml_cpu_has_avx512_vbmi.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_avx512_vnni(void); +def ggml_cpu_has_avx512_vnni() -> int: + return lib.ggml_cpu_has_avx512_vnni() + + +lib.ggml_cpu_has_avx512_vnni.argtypes = [] +lib.ggml_cpu_has_avx512_vnni.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_fma (void); +def ggml_cpu_has_fma() -> int: + return lib.ggml_cpu_has_fma() + + +lib.ggml_cpu_has_fma.argtypes = [] +lib.ggml_cpu_has_fma.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_neon (void); +def ggml_cpu_has_neon() -> int: + return lib.ggml_cpu_has_neon() + + +lib.ggml_cpu_has_neon.argtypes = [] +lib.ggml_cpu_has_neon.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_arm_fma (void); +def ggml_cpu_has_arm_fma() -> int: + return lib.ggml_cpu_has_arm_fma() + + +lib.ggml_cpu_has_arm_fma.argtypes = [] +lib.ggml_cpu_has_arm_fma.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_f16c (void); +def ggml_cpu_has_f16c() -> int: + return lib.ggml_cpu_has_f16c() + + +lib.ggml_cpu_has_f16c.argtypes = [] +lib.ggml_cpu_has_f16c.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_fp16_va (void); +def ggml_cpu_has_fp16_va() -> int: + return lib.ggml_cpu_has_fp16_va() + + +lib.ggml_cpu_has_fp16_va.argtypes = [] +lib.ggml_cpu_has_fp16_va.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_wasm_simd (void); +def ggml_cpu_has_wasm_simd() -> int: + return lib.ggml_cpu_has_wasm_simd() + + +lib.ggml_cpu_has_wasm_simd.argtypes = [] +lib.ggml_cpu_has_wasm_simd.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_blas (void); +def ggml_cpu_has_blas() -> int: + return lib.ggml_cpu_has_blas() + + +lib.ggml_cpu_has_blas.argtypes = [] +lib.ggml_cpu_has_blas.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_cublas (void); +def ggml_cpu_has_cublas() -> int: + return lib.ggml_cpu_has_cublas() + + +lib.ggml_cpu_has_cublas.argtypes = [] +lib.ggml_cpu_has_cublas.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_clblast (void); +def ggml_cpu_has_clblast() -> int: + return lib.ggml_cpu_has_clblast() + + +lib.ggml_cpu_has_clblast.argtypes = [] +lib.ggml_cpu_has_clblast.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_gpublas (void); +def ggml_cpu_has_gpublas() -> int: + return lib.ggml_cpu_has_gpublas() + + +lib.ggml_cpu_has_gpublas.argtypes = [] +lib.ggml_cpu_has_gpublas.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_sse3 (void); +def ggml_cpu_has_sse3() -> int: + return lib.ggml_cpu_has_sse3() + + +lib.ggml_cpu_has_sse3.argtypes = [] +lib.ggml_cpu_has_sse3.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_ssse3 (void); +def ggml_cpu_has_ssse3() -> int: + return lib.ggml_cpu_has_ssse3() + + +lib.ggml_cpu_has_ssse3.argtypes = [] +lib.ggml_cpu_has_ssse3.restype = ctypes.c_int + + +# GGML_API int ggml_cpu_has_vsx (void); +def ggml_cpu_has_vsx() -> int: + return lib.ggml_cpu_has_vsx() + + +lib.ggml_cpu_has_vsx.argtypes = [] +lib.ggml_cpu_has_vsx.restype = ctypes.c_int + + +# // +# // Internal types and functions exposed for tests and benchmarks +# // + +# typedef void (*ggml_to_float_t)(const void * x, float * y, int k); +ggml_to_float_t = ctypes.CFUNCTYPE( + None, ctypes.c_void_p, ctypes.POINTER(ctypes.c_float), ctypes.c_int +) + +# typedef void (*ggml_from_float_t)(const float * x, 
void * y, int k); +ggml_from_float_t = ctypes.CFUNCTYPE( + None, ctypes.POINTER(ctypes.c_float), ctypes.c_void_p, ctypes.c_int +) + +# typedef void (*ggml_vec_dot_t)(const int n, float * s, const void * x, const void * y); +ggml_vec_dot_t = ctypes.CFUNCTYPE( + None, ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.c_void_p, ctypes.c_void_p +) + + +# typedef struct { +# const char * type_name; +# int blck_size; +# size_t type_size; +# bool is_quantized; +# ggml_to_float_t to_float; +# ggml_from_float_t from_float; +# ggml_from_float_t from_float_reference; +# ggml_vec_dot_t vec_dot; +# enum ggml_type vec_dot_type; +# } ggml_type_traits_t; +class ggml_type_traits_t(ctypes.Structure): + _fields_ = [ + ("type_name", ctypes.c_char_p), + ("blck_size", ctypes.c_int), + ("type_size", ctypes.c_size_t), + ("is_quantized", ctypes.c_bool), + ("to_float", ggml_to_float_t), + ("from_float", ggml_from_float_t), + ("from_float_reference", ggml_from_float_t), + ("vec_dot", ggml_vec_dot_t), + ("vec_dot_type", ctypes.c_int), + ] + + +# ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); +def ggml_internal_get_type_traits(type: Union[ctypes.c_int, int]) -> ggml_type_traits_t: + return lib.ggml_internal_get_type_traits(type) + + +lib.ggml_internal_get_type_traits.argtypes = [ctypes.c_int] +lib.ggml_internal_get_type_traits.restype = ggml_type_traits_t + + +##################################################### +# GGML ALLOC API +# source: ggml-alloc.h +##################################################### + +ggml_allocr_p = ctypes.c_void_p + + +# GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); +def ggml_allocr_new( + data: ctypes.c_void_p, + size: Union[ctypes.c_size_t, int], + alignment: Union[ctypes.c_size_t, int], +) -> ggml_allocr_p: + return lib.ggml_allocr_new(data, size, alignment) + + +lib.ggml_allocr_new.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t] +lib.ggml_allocr_new.restype = ggml_allocr_p + + +# GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); +def ggml_allocr_new_measure( + alignment: Union[ctypes.c_size_t, int], +) -> ggml_allocr_p: + return lib.ggml_allocr_new_measure(alignment) + + +lib.ggml_allocr_new_measure.argtypes = [ctypes.c_size_t] +lib.ggml_allocr_new_measure.restype = ggml_allocr_p + + +# // tell the allocator to parse nodes following the order described in the list +# // you should call this if your graph are optimized to execute out-of-order +# GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n); +def ggml_allocr_set_parse_seq( + alloc: ggml_allocr_p, + list: CIntPointer, + n: Union[ctypes.c_int, int], +): + return lib.ggml_allocr_set_parse_seq(alloc, list, n) + + +lib.ggml_allocr_set_parse_seq.argtypes = [ + ggml_allocr_p, + ctypes.POINTER(ctypes.c_int), + ctypes.c_int, +] +lib.ggml_allocr_set_parse_seq.restype = None + + +# GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); +def ggml_allocr_free( + alloc: ggml_allocr_p, +): + return lib.ggml_allocr_free(alloc) + + +lib.ggml_allocr_free.argtypes = [ggml_allocr_p] +lib.ggml_allocr_free.restype = None + + +# GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); +def ggml_allocr_is_measure( + alloc: ggml_allocr_p, +) -> bool: + return lib.ggml_allocr_is_measure(alloc) + + +lib.ggml_allocr_is_measure.argtypes = [ggml_allocr_p] +lib.ggml_allocr_is_measure.restype = ctypes.c_bool + + +# GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); +def ggml_allocr_reset( + 
alloc: ggml_allocr_p, +): + return lib.ggml_allocr_reset(alloc) + + +lib.ggml_allocr_reset.argtypes = [ggml_allocr_p] +lib.ggml_allocr_reset.restype = None + + +# GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); +def ggml_allocr_alloc( + alloc: ggml_allocr_p, + tensor: ggml_tensor_p, +): + return lib.ggml_allocr_alloc(alloc, tensor) + + +lib.ggml_allocr_alloc.argtypes = [ggml_allocr_p, ctypes.POINTER(ggml_tensor)] +lib.ggml_allocr_alloc.restype = None + + +# GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); +def ggml_allocr_alloc_graph( + alloc: ggml_allocr_p, + graph: ggml_cgraph_p, +) -> int: + return lib.ggml_allocr_alloc_graph(alloc, graph) + + +lib.ggml_allocr_alloc_graph.argtypes = [ggml_allocr_p, ctypes.POINTER(ggml_cgraph)] +lib.ggml_allocr_alloc_graph.restype = ctypes.c_size_t + + +##################################################### +# GGML CUDA API +# source: ggml-cuda.h +##################################################### + + +GGML_USE_CUBLAS = hasattr(lib, "ggml_init_cublas") + + +GGML_CUDA_MAX_DEVICES = 16 + + +# GGML_API void ggml_init_cublas(void); +def ggml_init_cublas(): + return lib.ggml_init_cublas() + + +if GGML_USE_CUBLAS: + lib.ggml_init_cublas.argtypes = [] + lib.ggml_init_cublas.restype = None + + +# void * ggml_cuda_host_malloc(size_t size); +def ggml_cuda_host_malloc( + size: Union[ctypes.c_size_t, int], +) -> Optional[ctypes.c_void_p]: + return lib.ggml_cuda_host_malloc(size) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_host_malloc.argtypes = [ctypes.c_size_t] + lib.ggml_cuda_host_malloc.restype = ctypes.c_void_p + + +# void ggml_cuda_host_free(void * ptr); +def ggml_cuda_host_free( + ptr: ctypes.c_void_p, +): + return lib.ggml_cuda_host_free(ptr) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_host_free.argtypes = [ctypes.c_void_p] + lib.ggml_cuda_host_free.restype = None + + +# GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +def ggml_cuda_can_mul_mat( + src0: ggml_tensor_p, + src1: ggml_tensor_p, + dst: ggml_tensor_p, +) -> bool: + return lib.ggml_cuda_can_mul_mat(src0, src1, dst) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_can_mul_mat.argtypes = [ + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cuda_can_mul_mat.restype = ctypes.c_bool + + +# GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split); +def ggml_cuda_set_tensor_split( + tensor_split: CFloatArray, +): + return lib.ggml_cuda_set_tensor_split(tensor_split) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_set_tensor_split.argtypes = [ctypes.POINTER(ctypes.c_float)] + lib.ggml_cuda_set_tensor_split.restype = None + + +# void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); +def ggml_cuda_transform_tensor( + data: ctypes.c_void_p, + tensor: ggml_tensor_p, +): + return lib.ggml_cuda_transform_tensor(data, tensor) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_transform_tensor.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cuda_transform_tensor.restype = None + + +# void ggml_cuda_free_data(struct ggml_tensor * tensor); +def ggml_cuda_free_data( + tensor: ggml_tensor_p, +): + return lib.ggml_cuda_free_data(tensor) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_free_data.argtypes = [ + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cuda_free_data.restype = None + + +# void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); +def 
ggml_cuda_assign_buffers( + tensor: ggml_tensor_p, +): + return lib.ggml_cuda_assign_buffers(tensor) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_assign_buffers.argtypes = [ + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cuda_assign_buffers.restype = None + + +# void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); +def ggml_cuda_assign_buffers_no_scratch( + tensor: ggml_tensor_p, +): + return lib.ggml_cuda_assign_buffers_no_scratch(tensor) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_assign_buffers_no_scratch.argtypes = [ + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cuda_assign_buffers_no_scratch.restype = None + + +# GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); +def ggml_cuda_assign_buffers_force_inplace( + tensor: ggml_tensor_p, +): + return lib.ggml_cuda_assign_buffers_force_inplace(tensor) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_assign_buffers_force_inplace.argtypes = [ + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cuda_assign_buffers_force_inplace.restype = None + + +# GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor); +def ggml_cuda_assign_buffers_no_alloc( + tensor: ggml_tensor_p, +): + return lib.ggml_cuda_assign_buffers_no_alloc(tensor) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_assign_buffers_no_alloc.argtypes = [ + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cuda_assign_buffers_no_alloc.restype = None + + +# GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); +def ggml_cuda_assign_scratch_offset( + tensor: ggml_tensor_p, + offset: Union[ctypes.c_size_t, int], +): + return lib.ggml_cuda_assign_scratch_offset(tensor, offset) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_assign_scratch_offset.argtypes = [ + ctypes.POINTER(ggml_tensor), + ctypes.c_size_t, + ] + lib.ggml_cuda_assign_scratch_offset.restype = None + + +# void ggml_cuda_set_main_device(int main_device); +def ggml_cuda_set_main_device( + main_device: Union[ctypes.c_int, int], +): + return lib.ggml_cuda_set_main_device(main_device) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_set_main_device.argtypes = [ + ctypes.c_int, + ] + lib.ggml_cuda_set_main_device.restype = None + + +# GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); +def ggml_cuda_set_mul_mat_q( + mul_mat_q: Union[ctypes.c_bool, bool], +): + return lib.ggml_cuda_set_mul_mat_q(mul_mat_q) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_set_mul_mat_q.argtypes = [ + ctypes.c_bool, + ] + lib.ggml_cuda_set_mul_mat_q.restype = None + + +# void ggml_cuda_set_scratch_size(size_t scratch_size); +def ggml_cuda_set_scratch_size( + scratch_size: Union[ctypes.c_size_t, int], +): + return lib.ggml_cuda_set_scratch_size(scratch_size) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_set_scratch_size.argtypes = [ + ctypes.c_size_t, + ] + lib.ggml_cuda_set_scratch_size.restype = None + + +# void ggml_cuda_free_scratch(void); +def ggml_cuda_free_scratch(): + return lib.ggml_cuda_free_scratch() + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_free_scratch.argtypes = [] + lib.ggml_cuda_free_scratch.restype = None + + +# GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); +def ggml_cuda_compute_forward( + params: ggml_compute_params_p, + tensor: ggml_tensor_p, +) -> bool: + return lib.ggml_cuda_compute_forward(params, tensor) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_compute_forward.argtypes = [ + ctypes.POINTER(ggml_compute_params), + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cuda_compute_forward.restype = ctypes.c_bool + + +# GGML_API int 
ggml_cuda_get_device_count(void); +def ggml_cuda_get_device_count() -> int: + return lib.ggml_cuda_get_device_count() + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_get_device_count.argtypes = [] + lib.ggml_cuda_get_device_count.restype = ctypes.c_int + + +# GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size); +def ggml_cuda_get_device_description( + device: Union[ctypes.c_int, int], + description: bytes, + description_size: Union[ctypes.c_size_t, int], +): + return lib.ggml_cuda_get_device_description(device, description, description_size) + + +if GGML_USE_CUBLAS: + lib.ggml_cuda_get_device_description.argtypes = [ + ctypes.c_int, + ctypes.c_char_p, + ctypes.c_size_t, + ] + lib.ggml_cuda_get_device_description.restype = None + +##################################################### +# GGML METAL API +# source: ggml-metal.h +##################################################### + + +GGML_USE_METAL = hasattr(lib, "ggml_metal_init") + + +# // max memory buffers that can be mapped to the device +# #define GGML_METAL_MAX_BUFFERS 16 +GGML_METAL_MAX_BUFFERS = 16 +# #define GGML_METAL_MAX_COMMAND_BUFFERS 32 +GGML_METAL_MAX_COMMAND_BUFFERS = 32 + +# struct ggml_metal_context; +ggml_metal_context_p = ctypes.c_void_p + + +# struct ggml_metal_context * ggml_metal_init(int n_cb); +def ggml_metal_init( + n_cb: Union[ctypes.c_int, int], +) -> ggml_metal_context_p: + return lib.ggml_metal_init(n_cb) + + +if GGML_USE_METAL: + lib.ggml_metal_init.argtypes = [ctypes.c_int] + lib.ggml_metal_init.restype = ggml_metal_context_p + + +# void ggml_metal_free(struct ggml_metal_context * ctx); +def ggml_metal_free( + ctx: ggml_metal_context_p, +): + return lib.ggml_metal_free(ctx) + + +if GGML_USE_METAL: + lib.ggml_metal_free.argtypes = [ggml_metal_context_p] + lib.ggml_metal_free.restype = None + + +# // set the number of command buffers to use +# void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); +def ggml_metal_set_n_cb( + ctx: ggml_metal_context_p, + n_cb: Union[ctypes.c_int, int], +): + return lib.ggml_metal_set_n_cb(ctx, n_cb) + + +if GGML_USE_METAL: + lib.ggml_metal_set_n_cb.argtypes = [ggml_metal_context_p, ctypes.c_int] + lib.ggml_metal_set_n_cb.restype = None + + +# // creates a mapping between a host memory buffer and a device memory buffer +# // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute +# // - the mapping is used during computation to determine the arguments of the compute kernels +# // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal +# // - max_size specifies the maximum size of a tensor and is used to create shared views such +# // that it is guaranteed that the tensor will fit in at least one of the views +# // +# bool ggml_metal_add_buffer( +# struct ggml_metal_context * ctx, +# const char * name, +# void * data, +# size_t size, +# size_t max_size); +def ggml_metal_add_buffer( + ctx: ggml_metal_context_p, + name: bytes, + data: ctypes.c_void_p, + size: Union[ctypes.c_size_t, int], + max_size: Union[ctypes.c_size_t, int], +) -> bool: + return lib.ggml_metal_add_buffer(ctx, name, data, size, max_size) + + +if GGML_USE_METAL: + lib.ggml_metal_add_buffer.argtypes = [ + ggml_metal_context_p, + ctypes.c_char_p, + ctypes.c_void_p, + ctypes.c_size_t, + ctypes.c_size_t, + ] + lib.ggml_metal_add_buffer.restype = ctypes.c_bool + + +# // set data from host memory into the device +# void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor 
* t); +def ggml_metal_set_tensor( + ctx: ggml_metal_context_p, + t: ggml_tensor_p, +): + return lib.ggml_metal_set_tensor(ctx, t) + + +if GGML_USE_METAL: + lib.ggml_metal_set_tensor.argtypes = [ + ggml_metal_context_p, + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_metal_set_tensor.restype = None + + +# // get data from the device into host memory +# void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); +def ggml_metal_get_tensor( + ctx: ggml_metal_context_p, + t: ggml_tensor_p, +): + return lib.ggml_metal_get_tensor(ctx, t) + + +if GGML_USE_METAL: + lib.ggml_metal_get_tensor.argtypes = [ + ggml_metal_context_p, + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_metal_get_tensor.restype = None + + +# // try to find operations that can be run concurrently in the graph +# // you should run it again if the topology of your graph changes +# void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem); +def ggml_metal_graph_find_concurrency( + ctx: ggml_metal_context_p, + gf: ggml_cgraph_p, + check_mem: Union[ctypes.c_bool, bool], +): + return lib.ggml_metal_graph_find_concurrency(ctx, gf, check_mem) + + +if GGML_USE_METAL: + lib.ggml_metal_graph_find_concurrency.argtypes = [ + ggml_metal_context_p, + ctypes.POINTER(ggml_cgraph), + ctypes.c_bool, + ] + lib.ggml_metal_graph_find_concurrency.restype = None + + +# // if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized +# int ggml_metal_if_optimized(struct ggml_metal_context * ctx); +def ggml_metal_if_optimized( + ctx: ggml_metal_context_p, +) -> int: + return lib.ggml_metal_if_optimized(ctx) + + +if GGML_USE_METAL: + lib.ggml_metal_if_optimized.argtypes = [ + ggml_metal_context_p, + ] + lib.ggml_metal_if_optimized.restype = ctypes.c_int + + +# // output the concur_list for ggml_alloc +# int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); +def ggml_metal_get_concur_list( + ctx: ggml_metal_context_p, +) -> CIntPointer: + return lib.ggml_metal_get_concur_list(ctx) + + +if GGML_USE_METAL: + lib.ggml_metal_get_concur_list.argtypes = [ + ggml_metal_context_p, + ] + lib.ggml_metal_get_concur_list.restype = ctypes.POINTER(ctypes.c_int) + + +# // same as ggml_graph_compute but uses Metal +# // creates gf->n_threads command buffers in parallel +# void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); +def ggml_metal_graph_compute( + ctx: ggml_metal_context_p, + gf: ggml_cgraph_p, +): + return lib.ggml_metal_graph_compute(ctx, gf) + + +if GGML_USE_METAL: + lib.ggml_metal_graph_compute.argtypes = [ + ggml_metal_context_p, + ctypes.POINTER(ggml_cgraph), + ] + lib.ggml_metal_graph_compute.restype = None + + +##################################################### +# GGML OPENCL API +# source: ggml-opencl.h +##################################################### + + +GGML_USE_CLBLAST = hasattr(lib, "ggml_cl_init") + + +# void ggml_cl_init(void); +def ggml_cl_init(): + return lib.ggml_cl_init() + + +if GGML_USE_CLBLAST: + lib.ggml_cl_init.argtypes = [] + lib.ggml_cl_init.restype = None + + +# void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +def ggml_cl_mul( + src0: ggml_tensor_p, + src1: ggml_tensor_p, + dst: ggml_tensor_p, +): + return lib.ggml_cl_mul(src0, src1, dst) + + +if GGML_USE_CLBLAST: + lib.ggml_cl_mul.argtypes = [ + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ] + 
lib.ggml_cl_mul.restype = None + + +# bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +def ggml_cl_can_mul_mat( + src0: ggml_tensor_p, + src1: ggml_tensor_p, + dst: ggml_tensor_p, +) -> bool: + return lib.ggml_cl_can_mul_mat(src0, src1, dst) + + +if GGML_USE_CLBLAST: + lib.ggml_cl_can_mul_mat.argtypes = [ + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cl_can_mul_mat.restype = ctypes.c_bool + + +# size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +def ggml_cl_mul_mat_get_wsize( + src0: ggml_tensor_p, + src1: ggml_tensor_p, + dst: ggml_tensor_p, +) -> int: + return lib.ggml_cl_mul_mat_get_wsize(src0, src1, dst) + + +if GGML_USE_CLBLAST: + lib.ggml_cl_mul_mat_get_wsize.argtypes = [ + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cl_mul_mat_get_wsize.restype = ctypes.c_size_t + + +# void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); +def ggml_cl_mul_mat( + src0: ggml_tensor_p, + src1: ggml_tensor_p, + dst: ggml_tensor_p, + wdata: ctypes.c_void_p, + wsize: Union[ctypes.c_size_t, int], +): + return lib.ggml_cl_mul_mat(src0, src1, dst, wdata, wsize) + + +if GGML_USE_CLBLAST: + lib.ggml_cl_mul_mat.argtypes = [ + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.POINTER(ggml_tensor), + ctypes.c_void_p, + ctypes.c_size_t, + ] + lib.ggml_cl_mul_mat.restype = None + + +# void * ggml_cl_host_malloc(size_t size); +def ggml_cl_host_malloc( + size: Union[ctypes.c_size_t, int], +) -> Optional[ctypes.c_void_p]: + return lib.ggml_cl_host_malloc(size) + + +if GGML_USE_CLBLAST: + lib.ggml_cl_host_malloc.argtypes = [ + ctypes.c_size_t, + ] + lib.ggml_cl_host_malloc.restype = ctypes.c_void_p + + +# void ggml_cl_host_free(void * ptr); +def ggml_cl_host_free( + ptr: ctypes.c_void_p, +): + return lib.ggml_cl_host_free(ptr) + + +if GGML_USE_CLBLAST: + lib.ggml_cl_host_free.argtypes = [ + ctypes.c_void_p, + ] + lib.ggml_cl_host_free.restype = None + + +# void ggml_cl_free_data(const struct ggml_tensor* tensor); +def ggml_cl_free_data( + tensor: ggml_tensor_p, +): + return lib.ggml_cl_free_data(tensor) + + +if GGML_USE_CLBLAST: + lib.ggml_cl_free_data.argtypes = [ + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cl_free_data.restype = None + + +# void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); +def ggml_cl_transform_tensor( + data: ctypes.c_void_p, + tensor: ggml_tensor_p, +): + return lib.ggml_cl_transform_tensor(data, tensor) + + +if GGML_USE_CLBLAST: + lib.ggml_cl_transform_tensor.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ggml_tensor), + ] + lib.ggml_cl_transform_tensor.restype = None diff --git a/seamless_communication/pyproject.toml b/seamless_communication/pyproject.toml new file mode 100644 index 0000000..fd789e4 --- /dev/null +++ b/seamless_communication/pyproject.toml @@ -0,0 +1,37 @@ +[build-system] +requires = ["packaging~=23.1", "setuptools~=67.8", "wheel~=0.40"] +build-backend = "setuptools.build_meta" + +[tool.flake8] +extend_ignore = ["E", "Y"] # Black +per-file-ignores = [ + "__init__.py:F401", +] + +[tool.isort] +profile = "black" + +[tool.mypy] +disable_error_code = "type-abstract,typeddict-unknown-key" +disallow_untyped_calls = false +disallow_untyped_decorators = false +ignore_missing_imports = true 
+python_version = 3.8 +show_error_codes = true +show_error_context = true +strict = true +warn_unused_configs = false +warn_unused_ignores = false + +[tool.pytest.ini_options] +minversion = "7.1" +testpaths = ["tests"] +filterwarnings = [ + "ignore:Deprecated call to `pkg_resources", + "ignore:Please use `line_search_wolfe", + "ignore:Please use `spmatrix", + "ignore:TypedStorage is deprecated", + "ignore:distutils Version classes are deprecated", + "ignore:pkg_resources is deprecated", + "ignore:torch.nn.utils.weight_norm is deprecated in favor of", +] diff --git a/seamless_communication/seamlessM4T.png b/seamless_communication/seamlessM4T.png new file mode 100644 index 0000000..cd33a9e Binary files /dev/null and b/seamless_communication/seamlessM4T.png differ diff --git a/seamless_communication/setup.py b/seamless_communication/setup.py new file mode 100644 index 0000000..fdf9c5c --- /dev/null +++ b/seamless_communication/setup.py @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from setuptools import find_packages, setup + +setup( + name="seamless_communication", + version="1.0.0", + packages=find_packages(where="src"), + package_dir={"": "src"}, + package_data={"": ["py.typed", "cards/*.yaml"]}, + description="SeamlessM4T -- Massively Multilingual & Multimodal Machine Translation Model", + long_description=open("README.md", encoding="utf-8").read(), + long_description_content_type="text/markdown", + readme="README.md", + python_requires=">=3.8", + author="Fundamental AI Research (FAIR) at Meta", + url="https://github.com/facebookresearch/seamless_communication", + license="Creative Commons", + install_requires=[ + "datasets", + "fairseq2==0.2.*", + "fire", + "librosa", + "openai-whisper", + "simuleval~=1.1.3", + "soundfile", + "scipy", + "torchaudio", + "tqdm", + ], + entry_points={ + "console_scripts": [ + "m4t_evaluate=seamless_communication.cli.m4t.evaluate.evaluate:main", + "m4t_predict=seamless_communication.cli.m4t.predict.predict:main", + "m4t_finetune=seamless_communication.cli.m4t.finetune.finetune:main", + "m4t_prepare_dataset=seamless_communication.cli.m4t.finetune.dataset:main", + "m4t_audio_to_units=seamless_communication.cli.m4t.audio_to_units.audio_to_units:main", + "expressivity_evaluate=seamless_communication.cli.expressivity.evaluate.evaluate:main", + "expressivity_predict=seamless_communication.cli.expressivity.predict.predict:main", + "streaming_evaluate=seamless_communication.cli.streaming.evaluate:main", + ], + }, + include_package_data=True, +) diff --git a/seamless_communication/src/__init__.py b/seamless_communication/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seamless_communication/src/seamless_communication/__init__.py b/seamless_communication/src/seamless_communication/__init__.py new file mode 100644 index 0000000..911ce54 --- /dev/null +++ b/seamless_communication/src/seamless_communication/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+ +from pathlib import Path + +from fairseq2.assets import FileAssetMetadataProvider, asset_store + +__version__ = "0.1.0" + + +def _update_asset_store() -> None: + cards_dir = Path(__file__).parent.joinpath("cards") + + asset_store.metadata_providers.append(FileAssetMetadataProvider(cards_dir)) + + +_update_asset_store() diff --git a/seamless_communication/src/seamless_communication/cards/conformer_shaw.yaml b/seamless_communication/src/seamless_communication/cards/conformer_shaw.yaml new file mode 100644 index 0000000..81c6136 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/conformer_shaw.yaml @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: conformer_shaw +model_type: wav2vec2 +model_arch: conformer_shaw_600m +checkpoint: "https://huggingface.co/facebook/conformer-shaw/resolve/main/conformer_shaw.pt" diff --git a/seamless_communication/src/seamless_communication/cards/expresso.yaml b/seamless_communication/src/seamless_communication/cards/expresso.yaml new file mode 100644 index 0000000..0ec3802 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/expresso.yaml @@ -0,0 +1,8 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: expresso +uri: "https://dl.fbaipublicfiles.com/textless_nlp/expresso/data/expresso.tar" \ No newline at end of file diff --git a/seamless_communication/src/seamless_communication/cards/mexpresso_text.yaml b/seamless_communication/src/seamless_communication/cards/mexpresso_text.yaml new file mode 100644 index 0000000..07aca16 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/mexpresso_text.yaml @@ -0,0 +1,8 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: mexpresso_text +uri: "https://dl.fbaipublicfiles.com/seamless/datasets/mexpresso_text/mexpresso_text.tar" \ No newline at end of file diff --git a/seamless_communication/src/seamless_communication/cards/mintox.yaml b/seamless_communication/src/seamless_communication/cards/mintox.yaml new file mode 100644 index 0000000..414ec87 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/mintox.yaml @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: mintox +model_name: MinTox +etox_dataset: https://dl.fbaipublicfiles.com/nllb/NLLB-200_TWL/nllb-200_twl.zip +etox_lang_variants: + - kas_Arab + - kas_Deva + - knc_Arab + - knc_Latn + - min_Arab + - min_Latn + - zho_Hans + - zho_Hant + +sp_model: https://huggingface.co/facebook/seamless-m4t-medium/resolve/main/tokenizer.model + +# For some languages, we use the SentencePiece model. 
+sp_langs: + - asm + - ben + - cmn + - guj + - mya + - hin + - gom + - ibo + - jpn + - kan + - khm + - kor + - lao + - mai + - mal + - mar + - mni + - npi + - oan + - ory + - pan + - rwr + - sat + - tam + - tel + - tha + - wuu + - yue diff --git a/seamless_communication/src/seamless_communication/cards/nar_t2u_aligner.yaml b/seamless_communication/src/seamless_communication/cards/nar_t2u_aligner.yaml new file mode 100644 index 0000000..58c625c --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/nar_t2u_aligner.yaml @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: nar_t2u_aligner +char_tokenizer: "https://huggingface.co/facebook/seamless-streaming/resolve/main/spm_char_lang38_tc.model" +model_type: unity2_aligner +model_arch: nar_t2u_aligner +checkpoint: "https://dl.fbaipublicfiles.com/seamless/models/unity2_aligner.pt" +num_units: 10000 +unit_langs: + - arb + - ben + - cat + - ces + - cmn + - cym + - dan + - deu + - eng + - est + - fin + - fra + - hin + - ind + - ita + - jpn + - kan + - kor + - mlt + - nld + - pes + - pol + - por + - ron + - rus + - slk + - spa + - swe + - swh + - tam + - tel + - tgl + - tha + - tur + - ukr + - urd + - uzn + - vie diff --git a/seamless_communication/src/seamless_communication/cards/seamlessM4T_large.yaml b/seamless_communication/src/seamless_communication/cards/seamlessM4T_large.yaml new file mode 100644 index 0000000..b2a50d4 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/seamlessM4T_large.yaml @@ -0,0 +1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: seamlessM4T_large +base: unity_nllb-100 +model_arch: base +checkpoint: "https://huggingface.co/facebook/seamless-m4t-large/resolve/main/multitask_unity_large.pt" +num_units: 10000 +unit_langs: + - arb + - ben + - cat + - ces + - cmn + - cym + - dan + - deu + - eng + - est + - fin + - fra + - hin + - ind + - ita + - jpn + - kan + - kor + - mlt + - nld + - pes + - pol + - por + - ron + - rus + - slk + - spa + - swe + - swh + - tam + - tel + - tgl + - tha + - tur + - ukr + - urd + - uzn + - vie diff --git a/seamless_communication/src/seamless_communication/cards/seamlessM4T_medium.yaml b/seamless_communication/src/seamless_communication/cards/seamlessM4T_medium.yaml new file mode 100644 index 0000000..dfddb8e --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/seamlessM4T_medium.yaml @@ -0,0 +1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. 
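The YAML files in cards/ are fairseq2 asset cards; importing seamless_communication appends that directory to asset_store via _update_asset_store(), after which any card above can be resolved by name. A small sketch, reusing the retrieve_card and field(...).as_uri() accessors that appear later in this diff (treating the checkpoint field as a URI is an assumption here):

from fairseq2.assets import asset_store

import seamless_communication  # noqa: F401  -- the import side effect registers the cards directory

card = asset_store.retrieve_card("seamlessM4T_large")
checkpoint_url = card.field("checkpoint").as_uri()  # the HuggingFace URL declared in the card above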
+ +name: seamlessM4T_medium +base: unity_nllb-200 +model_arch: medium +checkpoint: "https://huggingface.co/facebook/seamless-m4t-medium/resolve/main/multitask_unity_medium.pt" +num_units: 10000 +unit_langs: + - arb + - ben + - cat + - ces + - cmn + - cym + - dan + - deu + - eng + - est + - fin + - fra + - hin + - ind + - ita + - jpn + - kan + - kor + - mlt + - nld + - pes + - pol + - por + - ron + - rus + - slk + - spa + - swe + - swh + - tam + - tel + - tgl + - tha + - tur + - ukr + - urd + - uzn + - vie diff --git a/seamless_communication/src/seamless_communication/cards/seamlessM4T_v2_large.yaml b/seamless_communication/src/seamless_communication/cards/seamlessM4T_v2_large.yaml new file mode 100644 index 0000000..dd773b0 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/seamlessM4T_v2_large.yaml @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: seamlessM4T_v2_large +base: unity_nllb-100 +model_arch: base_v2 +char_tokenizer: "https://huggingface.co/facebook/seamless-m4t-v2-large/resolve/main/spm_char_lang38_tc.model" +checkpoint: "https://huggingface.co/facebook/seamless-m4t-v2-large/resolve/main/seamlessM4T_v2_large.pt" +num_units: 10000 +unit_langs: + - arb + - ben + - cat + - ces + - cmn + - cym + - dan + - deu + - eng + - est + - fin + - fra + - hin + - ind + - ita + - jpn + - kan + - kor + - mlt + - nld + - pes + - pol + - por + - ron + - rus + - slk + - spa + - swe + - swh + - tam + - tel + - tgl + - tha + - tur + - ukr + - urd + - uzn + - vie diff --git a/seamless_communication/src/seamless_communication/cards/seamless_expressivity.yaml b/seamless_communication/src/seamless_communication/cards/seamless_expressivity.yaml new file mode 100644 index 0000000..531a1d4 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/seamless_expressivity.yaml @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: seamless_expressivity +base: unity_nllb-100 +model_arch: expressivity_v2 +char_tokenizer: "https://huggingface.co/facebook/seamless-streaming/resolve/main/spm_char_lang38_tc.model" +checkpoint: "https://github.com/facebookresearch/seamless_communication;gated=true" +num_units: 10000 +unit_langs: + - arb + - ben + - cat + - ces + - cmn + - cym + - dan + - deu + - eng + - est + - fin + - fra + - hin + - ind + - ita + - jpn + - kan + - kor + - mlt + - nld + - pes + - pol + - por + - ron + - rus + - slk + - spa + - swe + - swh + - tam + - tel + - tgl + - tha + - tur + - ukr + - urd + - uzn + - vie diff --git a/seamless_communication/src/seamless_communication/cards/seamless_streaming_monotonic_decoder.yaml b/seamless_communication/src/seamless_communication/cards/seamless_streaming_monotonic_decoder.yaml new file mode 100644 index 0000000..246ab22 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/seamless_streaming_monotonic_decoder.yaml @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+ +name: seamless_streaming_monotonic_decoder +model_type: monotonic_decoder +model_arch: dense_1b +checkpoint: "https://huggingface.co/facebook/seamless-streaming/resolve/main/seamless_streaming_monotonic_decoder.pt" diff --git a/seamless_communication/src/seamless_communication/cards/seamless_streaming_unity.yaml b/seamless_communication/src/seamless_communication/cards/seamless_streaming_unity.yaml new file mode 100644 index 0000000..ad3475a --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/seamless_streaming_unity.yaml @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: seamless_streaming_unity +base: unity_nllb-100 +model_arch: base_v2 +char_tokenizer: "https://huggingface.co/facebook/seamless-streaming/resolve/main/spm_char_lang38_tc.model" +checkpoint: "https://huggingface.co/facebook/seamless-streaming/resolve/main/seamless_streaming_unity.pt" +num_units: 10000 +unit_langs: + - arb + - ben + - cat + - ces + - cmn + - cym + - dan + - deu + - eng + - est + - fin + - fra + - hin + - ind + - ita + - jpn + - kan + - kor + - mlt + - nld + - pes + - pol + - por + - ron + - rus + - slk + - spa + - swe + - swh + - tam + - tel + - tgl + - tha + - tur + - ukr + - urd + - uzn + - vie diff --git a/seamless_communication/src/seamless_communication/cards/unity_nllb-100.yaml b/seamless_communication/src/seamless_communication/cards/unity_nllb-100.yaml new file mode 100644 index 0000000..0d2befc --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/unity_nllb-100.yaml @@ -0,0 +1,109 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: unity_nllb-100 +model_type: unity +tokenizer: "https://huggingface.co/facebook/seamless-m4t-large/resolve/main/tokenizer.model" +default_lang: eng +langs: + - afr + - amh + - arb + - ary + - arz + - asm + - azj + - bel + - ben + - bos + - bul + - cat + - ceb + - ces + - ckb + - cmn + - cmn_Hant + - cym + - dan + - deu + - ell + - eng + - est + - eus + - fin + - fra + - fuv + - gaz + - gle + - glg + - guj + - heb + - hin + - hrv + - hun + - hye + - ibo + - ind + - isl + - ita + - jav + - jpn + - kan + - kat + - kaz + - khk + - khm + - kir + - kor + - lao + - lit + - lug + - luo + - lvs + - mai + - mal + - mar + - mkd + - mlt + - mni + - mya + - nld + - nno + - nob + - npi + - nya + - ory + - pan + - pbt + - pes + - pol + - por + - ron + - rus + - sat + - slk + - slv + - sna + - snd + - som + - spa + - srp + - swe + - swh + - tam + - tel + - tgk + - tgl + - tha + - tur + - ukr + - urd + - uzn + - vie + - yor + - yue + - zsm + - zul diff --git a/seamless_communication/src/seamless_communication/cards/unity_nllb-200.yaml b/seamless_communication/src/seamless_communication/cards/unity_nllb-200.yaml new file mode 100644 index 0000000..623e5fe --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/unity_nllb-200.yaml @@ -0,0 +1,213 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. 
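The card names above (seamlessM4T_medium, seamlessM4T_v2_large, seamless_streaming_unity, and so on) are what the inference helpers resolve at load time. A hedged construction sketch that reuses the Translator signature shown in evaluate.py further down in this diff; pairing the model with vocoder_36langs is purely illustrative (evaluate.py itself passes None):

import torch

from seamless_communication.inference import Translator

translator = Translator(
    "seamlessM4T_medium",                     # any model card registered above
    vocoder_name_or_card="vocoder_36langs",   # assumed pairing, for illustration only
    device=torch.device("cpu"),
    dtype=torch.float32,
)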
+ +name: unity_nllb-200 +model_type: unity +tokenizer: "https://huggingface.co/facebook/seamless-m4t-medium/resolve/main/tokenizer.model" +default_lang: eng +langs: + - ace + - ace_Latn + - acm + - acq + - aeb + - afr + - ajp + - aka + - amh + - apc + - arb + - ars + - ary + - arz + - asm + - ast + - awa + - ayr + - azb + - azj + - bak + - bam + - ban + - bel + - bem + - ben + - bho + - bjn + - bjn_Latn + - bod + - bos + - bug + - bul + - cat + - ceb + - ces + - cjk + - ckb + - crh + - cym + - dan + - deu + - dik + - dyu + - dzo + - ell + - eng + - epo + - est + - eus + - ewe + - fao + - pes + - fij + - fin + - fon + - fra + - fur + - fuv + - gla + - gle + - glg + - grn + - guj + - hat + - hau + - heb + - hin + - hne + - hrv + - hun + - hye + - ibo + - ilo + - ind + - isl + - ita + - jav + - jpn + - kab + - kac + - kam + - kan + - kas + - kas_Deva + - kat + - knc + - knc_Latn + - kaz + - kbp + - kea + - khm + - kik + - kin + - kir + - kmb + - kon + - kor + - kmr + - lao + - lvs + - lij + - lim + - lin + - lit + - lmo + - ltg + - ltz + - lua + - lug + - luo + - lus + - mag + - mai + - mal + - mar + - min + - mkd + - plt + - mlt + - mni + - khk + - mos + - mri + - zsm + - mya + - nld + - nno + - nob + - npi + - nso + - nus + - nya + - oci + - gaz + - ory + - pag + - pan + - pap + - pol + - por + - prs + - pbt + - quy + - ron + - run + - rus + - sag + - san + - sat + - scn + - shn + - sin + - slk + - slv + - smo + - sna + - snd + - som + - sot + - spa + - als + - srd + - srp + - ssw + - sun + - swe + - swh + - szl + - tam + - tat + - tel + - tgk + - tgl + - tha + - tir + - taq + - taq_Tfng + - tpi + - tsn + - tso + - tuk + - tum + - tur + - twi + - tzm + - uig + - ukr + - umb + - urd + - uzn + - vec + - vie + - war + - wol + - xho + - ydd + - yor + - yue + - cmn + - cmn_Hant + - zul diff --git a/seamless_communication/src/seamless_communication/cards/vocoder_36langs.yaml b/seamless_communication/src/seamless_communication/cards/vocoder_36langs.yaml new file mode 100644 index 0000000..91b952a --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/vocoder_36langs.yaml @@ -0,0 +1,198 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+ +name: vocoder_36langs +model_type: vocoder_code_hifigan +model_arch: base +checkpoint: "https://huggingface.co/facebook/seamless-m4t-vocoder/resolve/main/vocoder_36langs.pt" +model_config: { + "lang_spkr_idx_map": { + "multilingual": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + }, + "multispkr": { + "arb": [ + 0 + ], + "ben": [ + 2, + 1 + ], + "cat": [ + 3 + ], + "ces": [ + 4 + ], + "cmn": [ + 5 + ], + "cym": [ + 6 + ], + "dan": [ + 7, + 8 + ], + "deu": [ + 9 + ], + "eng": [ + 10 + ], + "est": [ + 11, + 12, + 13 + ], + "fin": [ + 14 + ], + "fra": [ + 15 + ], + "hin": [ + 16 + ], + "ind": [ + 17, + 24, + 18, + 20, + 19, + 21, + 23, + 27, + 26, + 22, + 25 + ], + "ita": [ + 29, + 28 + ], + "jpn": [ + 30 + ], + "kor": [ + 31 + ], + "mlt": [ + 32, + 33, + 34 + ], + "nld": [ + 35 + ], + "pes": [ + 36 + ], + "pol": [ + 37 + ], + "por": [ + 38 + ], + "ron": [ + 39 + ], + "rus": [ + 40 + ], + "slk": [ + 41 + ], + "spa": [ + 42 + ], + "swe": [ + 43, + 45, + 44 + ], + "swh": [ + 46, + 48, + 47 + ], + "tel": [ + 49 + ], + "tgl": [ + 50 + ], + "tha": [ + 51, + 54, + 55, + 52, + 53 + ], + "tur": [ + 58, + 57, + 56 + ], + "ukr": [ + 59 + ], + "urd": [ + 60, + 61, + 62 + ], + "uzn": [ + 63, + 64, + 65 + ], + "vie": [ + 66, + 67, + 70, + 71, + 68, + 69 + ] + } + } +} diff --git a/seamless_communication/src/seamless_communication/cards/vocoder_pretssel.yaml b/seamless_communication/src/seamless_communication/cards/vocoder_pretssel.yaml new file mode 100644 index 0000000..52ccbcd --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/vocoder_pretssel.yaml @@ -0,0 +1,182 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. 
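The vocoder_36langs card above keeps a two-level lang_spkr_idx_map in model_config: "multilingual" maps each language code to a single index, while "multispkr" lists every speaker id available for that language. A minimal lookup sketch (the literal below is an excerpt of the card; picking the first speaker is an arbitrary illustrative choice):

lang_spkr_idx_map = {
    "multilingual": {"fra": 11},
    "multispkr": {"fra": [15]},
}  # excerpt of the structure defined in the card above

lang = "fra"
lang_idx = lang_spkr_idx_map["multilingual"][lang]   # 11
spkr_idx = lang_spkr_idx_map["multispkr"][lang][0]   # 15, the first known speaker for French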
+ +name: vocoder_pretssel +model_type: vocoder_pretssel +model_arch: 24khz +checkpoint: "https://github.com/facebookresearch/seamless_communication;gated=true" +sample_rate: 24000 +model_config: + langs: + - cmn + - deu + - eng + - fra + - ita + - spa + gcmvn_stats: + mean: + - 9.023406257490224 + - 9.406622923058864 + - 10.554165334059368 + - 11.475190058682356 + - 12.179117104099705 + - 12.603782921407062 + - 12.769632747861747 + - 12.714276772934083 + - 12.747612172560233 + - 12.750373688097946 + - 12.948050207790237 + - 13.121829398704277 + - 13.40130828476734 + - 13.58028050886195 + - 13.601835409305883 + - 13.608734047373218 + - 13.538274892335826 + - 13.391518457210937 + - 13.382843811359622 + - 13.0524299456858 + - 12.785193828396269 + - 12.876608812372632 + - 12.59571918874957 + - 12.674484745567813 + - 12.57325195345546 + - 12.651938120109422 + - 12.556821722150424 + - 12.639338348530158 + - 12.610449431411217 + - 12.639992872912376 + - 12.697503827987052 + - 12.754788270377214 + - 12.837605043617405 + - 12.964379088501497 + - 13.11997048142582 + - 13.267395589173432 + - 13.384668687260483 + - 13.495000208959356 + - 13.606835320307384 + - 13.578073476073252 + - 13.689796531497368 + - 13.643079802391588 + - 13.7340755472615 + - 13.735199777666043 + - 13.79347692248429 + - 13.875183654243305 + - 13.967272256671393 + - 14.058507936754117 + - 14.114704594203507 + - 14.156211337193277 + - 14.14747081594401 + - 14.173917097974343 + - 14.22330474758318 + - 14.251272943225572 + - 14.230904505178053 + - 14.226937644205396 + - 14.222223350670225 + - 14.211638354996317 + - 14.208930098405544 + - 14.19476983404041 + - 14.2195925729048 + - 14.16490878238837 + - 14.115436751205117 + - 14.039442767347872 + - 13.976934063901625 + - 13.917068116556464 + - 13.856293662219073 + - 13.773769842100085 + - 13.706245521082796 + - 13.685052933361192 + - 13.68570131643094 + - 13.714811890011152 + - 13.751451253935347 + - 13.772212258132148 + - 13.76013448427468 + - 13.702368406557508 + - 13.600406368803617 + - 13.369574889658164 + - 12.998399608309988 + - 12.443732902848723 + std: + - 3.729248515707457 + - 4.001623098079929 + - 4.570009061358065 + - 4.811572361201577 + - 5.010239923828185 + - 5.152145212706857 + - 5.223885876119451 + - 5.224443623432338 + - 5.161790275239061 + - 5.098988232815804 + - 5.090890035509122 + - 5.130345212529546 + - 5.165849688173366 + - 5.164761699263693 + - 5.131177988219367 + - 5.085522051815558 + - 5.035829108165894 + - 4.987478975310455 + - 4.932652442855969 + - 4.8650037198748075 + - 4.799238163232527 + - 4.727086345775988 + - 4.646858066575789 + - 4.5733249959652715 + - 4.51685060334288 + - 4.467449073425149 + - 4.4296881304192075 + - 4.4028775449713775 + - 4.397905653025904 + - 4.3862594566308015 + - 4.366485847923521 + - 4.344483498393771 + - 4.324692736391383 + - 4.310481738978154 + - 4.3053492473916 + - 4.3035205126659655 + - 4.2987898577000605 + - 4.287403454800855 + - 4.27087296372773 + - 4.25387490294079 + - 4.233513102251301 + - 4.212047255068752 + - 4.1810370158214445 + - 4.186014591107853 + - 4.194806047136222 + - 4.2183377208747075 + - 4.249293562464735 + - 4.268847210561774 + - 4.270455756367186 + - 4.25811368227528 + - 4.245975115347766 + - 4.23058010369271 + - 4.203075111087773 + - 4.20123812057283 + - 4.187143614375688 + - 4.172633823274146 + - 4.162541203161947 + - 4.156022884601996 + - 4.1618428838805706 + - 4.157259439238067 + - 4.139859013016601 + - 4.150685014911159 + - 4.152025499126372 + - 4.165010788120131 + - 4.15179422331336 + - 4.137041631098819 
+ - 4.10861757770052 + - 4.119916019361405 + - 4.131749366642117 + - 4.119438578634397 + - 4.100095269698108 + - 4.073900009963118 + - 4.0580796715728855 + - 4.050916705279105 + - 4.037976834115189 + - 4.023757063156459 + - 3.9987849927993353 + - 3.989251079820668 + - 3.9464430977885256 + - 3.8673932921278995 diff --git a/seamless_communication/src/seamless_communication/cards/vocoder_pretssel_16khz.yaml b/seamless_communication/src/seamless_communication/cards/vocoder_pretssel_16khz.yaml new file mode 100644 index 0000000..e47cb4a --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/vocoder_pretssel_16khz.yaml @@ -0,0 +1,182 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: vocoder_pretssel_16khz +model_type: vocoder_pretssel +model_arch: 16khz +checkpoint: "https://github.com/facebookresearch/seamless_communication;gated=true" +sample_rate: 16000 +model_config: + langs: + - cmn + - deu + - eng + - fra + - ita + - spa + gcmvn_stats: + mean: + - 9.023406257490224 + - 9.406622923058864 + - 10.554165334059368 + - 11.475190058682356 + - 12.179117104099705 + - 12.603782921407062 + - 12.769632747861747 + - 12.714276772934083 + - 12.747612172560233 + - 12.750373688097946 + - 12.948050207790237 + - 13.121829398704277 + - 13.40130828476734 + - 13.58028050886195 + - 13.601835409305883 + - 13.608734047373218 + - 13.538274892335826 + - 13.391518457210937 + - 13.382843811359622 + - 13.0524299456858 + - 12.785193828396269 + - 12.876608812372632 + - 12.59571918874957 + - 12.674484745567813 + - 12.57325195345546 + - 12.651938120109422 + - 12.556821722150424 + - 12.639338348530158 + - 12.610449431411217 + - 12.639992872912376 + - 12.697503827987052 + - 12.754788270377214 + - 12.837605043617405 + - 12.964379088501497 + - 13.11997048142582 + - 13.267395589173432 + - 13.384668687260483 + - 13.495000208959356 + - 13.606835320307384 + - 13.578073476073252 + - 13.689796531497368 + - 13.643079802391588 + - 13.7340755472615 + - 13.735199777666043 + - 13.79347692248429 + - 13.875183654243305 + - 13.967272256671393 + - 14.058507936754117 + - 14.114704594203507 + - 14.156211337193277 + - 14.14747081594401 + - 14.173917097974343 + - 14.22330474758318 + - 14.251272943225572 + - 14.230904505178053 + - 14.226937644205396 + - 14.222223350670225 + - 14.211638354996317 + - 14.208930098405544 + - 14.19476983404041 + - 14.2195925729048 + - 14.16490878238837 + - 14.115436751205117 + - 14.039442767347872 + - 13.976934063901625 + - 13.917068116556464 + - 13.856293662219073 + - 13.773769842100085 + - 13.706245521082796 + - 13.685052933361192 + - 13.68570131643094 + - 13.714811890011152 + - 13.751451253935347 + - 13.772212258132148 + - 13.76013448427468 + - 13.702368406557508 + - 13.600406368803617 + - 13.369574889658164 + - 12.998399608309988 + - 12.443732902848723 + std: + - 3.729248515707457 + - 4.001623098079929 + - 4.570009061358065 + - 4.811572361201577 + - 5.010239923828185 + - 5.152145212706857 + - 5.223885876119451 + - 5.224443623432338 + - 5.161790275239061 + - 5.098988232815804 + - 5.090890035509122 + - 5.130345212529546 + - 5.165849688173366 + - 5.164761699263693 + - 5.131177988219367 + - 5.085522051815558 + - 5.035829108165894 + - 4.987478975310455 + - 4.932652442855969 + - 4.8650037198748075 + - 4.799238163232527 + - 4.727086345775988 + - 4.646858066575789 + - 4.5733249959652715 + - 4.51685060334288 + - 
4.467449073425149 + - 4.4296881304192075 + - 4.4028775449713775 + - 4.397905653025904 + - 4.3862594566308015 + - 4.366485847923521 + - 4.344483498393771 + - 4.324692736391383 + - 4.310481738978154 + - 4.3053492473916 + - 4.3035205126659655 + - 4.2987898577000605 + - 4.287403454800855 + - 4.27087296372773 + - 4.25387490294079 + - 4.233513102251301 + - 4.212047255068752 + - 4.1810370158214445 + - 4.186014591107853 + - 4.194806047136222 + - 4.2183377208747075 + - 4.249293562464735 + - 4.268847210561774 + - 4.270455756367186 + - 4.25811368227528 + - 4.245975115347766 + - 4.23058010369271 + - 4.203075111087773 + - 4.20123812057283 + - 4.187143614375688 + - 4.172633823274146 + - 4.162541203161947 + - 4.156022884601996 + - 4.1618428838805706 + - 4.157259439238067 + - 4.139859013016601 + - 4.150685014911159 + - 4.152025499126372 + - 4.165010788120131 + - 4.15179422331336 + - 4.137041631098819 + - 4.10861757770052 + - 4.119916019361405 + - 4.131749366642117 + - 4.119438578634397 + - 4.100095269698108 + - 4.073900009963118 + - 4.0580796715728855 + - 4.050916705279105 + - 4.037976834115189 + - 4.023757063156459 + - 3.9987849927993353 + - 3.989251079820668 + - 3.9464430977885256 + - 3.8673932921278995 diff --git a/seamless_communication/src/seamless_communication/cards/vocoder_v2.yaml b/seamless_communication/src/seamless_communication/cards/vocoder_v2.yaml new file mode 100644 index 0000000..7ee16a9 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/vocoder_v2.yaml @@ -0,0 +1,201 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: vocoder_v2 +model_type: vocoder_code_hifigan +model_arch: base +checkpoint: "https://dl.fbaipublicfiles.com/seamless/models/vocoder_v2.pt" +model_config: { + "lang_spkr_idx_map": { + "multilingual": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + }, + "multispkr": { + "arb": [ + 0 + ], + "ben": [ + 1 + ], + "cat": [ + 2 + ], + "ces": [ + 3 + ], + "cmn": [ + 4, + 5 + ], + "cym": [ + 6 + ], + "dan": [ + 7, + 8 + ], + "deu": [ + 9 + ], + "eng": [ + 10 + ], + "est": [ + 11, + 12, + 13 + ], + "fin": [ + 14 + ], + "fra": [ + 15 + ], + "hin": [ + 16 + ], + "ind": [ + 17, + 24, + 18, + 20, + 19, + 21, + 23, + 27, + 26, + 22, + 25 + ], + "ita": [ + 29, + 28 + ], + "jpn": [ + 30 + ], + "kor": [ + 31 + ], + "mlt": [ + 32, + 33, + 34 + ], + "nld": [ + 35, + 37, + 36 + ], + "pes": [ + 38 + ], + "pol": [ + 39 + ], + "por": [ + 40 + ], + "ron": [ + 41 + ], + "rus": [ + 43, + 42 + ], + "slk": [ + 44 + ], + "spa": [ + 45 + ], + "swe": [ + 46, + 48, + 47 + ], + "swh": [ + 49, + 51, + 50 + ], + "tel": [ + 52 + ], + "tgl": [ + 53 + ], + "tha": [ + 54, + 57, + 58, + 55, + 56 + ], + "tur": [ + 61, + 60, + 59 + ], + "ukr": [ + 62 + ], + "urd": [ + 63, + 64, + 65 + ], + "uzn": [ + 66, + 67, + 68 + ], + "vie": [ + 69, + 70, + 73, + 74, + 71, + 72 + ] + } + } +} diff --git a/seamless_communication/src/seamless_communication/cards/xlsr2_1b_v2.yaml 
b/seamless_communication/src/seamless_communication/cards/xlsr2_1b_v2.yaml new file mode 100644 index 0000000..b50b41f --- /dev/null +++ b/seamless_communication/src/seamless_communication/cards/xlsr2_1b_v2.yaml @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. + +name: xlsr2_1b_v2 +model_type: wav2vec2 +model_arch: xlsr2_1b_v2 +checkpoint: "https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/xlsr2_1b_v2.pt" diff --git a/seamless_communication/src/seamless_communication/cli/__init__.py b/seamless_communication/src/seamless_communication/cli/__init__.py new file mode 100644 index 0000000..15d8859 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/src/seamless_communication/cli/eval_utils/__init__.py b/seamless_communication/src/seamless_communication/cli/eval_utils/__init__.py new file mode 100644 index 0000000..22cedc4 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/eval_utils/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + + +from seamless_communication.cli.eval_utils.compute_metrics import ( + compute_quality_metrics as compute_quality_metrics, +) +from seamless_communication.cli.eval_utils.compute_metrics import ( + get_tokenizer as get_tokenizer, +) +from seamless_communication.cli.eval_utils.lang_mapping import ( + LANG2_LANG3 as LANG2_LANG3, +) +from seamless_communication.cli.eval_utils.lang_mapping import ( + LANG3_LANG2 as LANG3_LANG2, +) diff --git a/seamless_communication/src/seamless_communication/cli/eval_utils/compute_metrics.py b/seamless_communication/src/seamless_communication/cli/eval_utils/compute_metrics.py new file mode 100644 index 0000000..3108d7b --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/eval_utils/compute_metrics.py @@ -0,0 +1,371 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
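The "X as X" imports in the eval_utils __init__ above are the explicit re-export form: with mypy's strict mode enabled in pyproject.toml (and F401 ignored for __init__.py), the redundant alias marks each name as part of the package's public interface, so callers can import them directly:

from seamless_communication.cli.eval_utils import (
    LANG3_LANG2,
    compute_quality_metrics,
    get_tokenizer,
)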
+ +import json +import logging +from pathlib import Path +from typing import Optional, Tuple, Union + +import pandas as pd +import whisper +from fairseq2.typing import Device +from jiwer import cer, wer +from sacrebleu.metrics.base import Score, Signature +from sacrebleu.metrics.bleu import BLEU +from sacrebleu.metrics.chrf import CHRF +from seamless_communication.cli.eval_utils.lang_mapping import LANG3_LANG2 +from tqdm import tqdm +from whisper import Whisper +from whisper.normalizers import BasicTextNormalizer, EnglishTextNormalizer + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +def init_whisper_model( + device: Device, + whisper_model_name: str = "large", +) -> Whisper: + return whisper.load_model(name=whisper_model_name, device=device) + + +def transcribe_series( + audio_paths_series: pd.Series, + asr_model: Whisper, + audio_lang: str, + beam_size: int = 1, + temperature: float = 0.0, +) -> pd.Series: + """Transcribes each audio filepath from series and returns series of transcriptions + Args: + audio_paths_series (pd.Series): each line contains path to audio file. + asr_model: ASR model to do the transcribing process e.g. Whisper + audio_lang (str): what language is used in the given audio, used by ASR model + beam_size (int): whisper beam size. Defaults to 1 + temperature (float): whisper temperature. Defaults to 0.0 to avoid fallback decoding (see details below). + Returns: + pd.Series: Series where each line has a transcription of corresponding audio from audio_paths_series + Whisper model implements decoding with fallback: https://github.com/openai/whisper/blob/main/whisper/transcribe.py#L147 + The core idea is that decoding at each time step might happen multiple times if at least one criterion to "fall back" i.e. + start over is fired. Number of fallback iterations is determined by the schedule of temperature values: + https://github.com/openai/whisper/blob/main/whisper/transcribe.py#L41 + By default this schedule is active and temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0) i.e. even with beam_size 5 it might fell back and + turn on sampling by using temperature > 0, in this case the beam search is not used in the fall back iteration. + Explicit setting of temperature=0.0 overwrites the schedule and fall back decoding has only one for loop iteration i.e. no fall backs. + This allows us to do reproducible evaluation without sample variations. Beware that this might introduce the repetition loops in + the transcriptions and lead to worse ASR-BLEU score in the end. + """ + + if len(audio_lang) == 3: + # to make it work with whisper + audio_lang = LANG3_LANG2[audio_lang] + + transcriptions = {} + + for idx, audio_path in tqdm( + audio_paths_series.items(), + desc=f"Transcribing {audio_paths_series.name} column", + total=len(audio_paths_series), + ): + hypo = asr_model.transcribe( + audio_path, + temperature=temperature, + beam_size=beam_size, + language=audio_lang, + )["text"].strip() + transcriptions[idx] = hypo + + transcriptions_series = pd.Series(transcriptions) + transcriptions_series.name = f"{audio_paths_series.name}_transcribed" + + return transcriptions_series + + +def whisper_normalize_series( + transcription_series: pd.Series, text_lang: str +) -> pd.Series: + """Normalizes the text series using whisper noramlizer. English has a specific one in whisper package. 
+ Args: + transcription_series (pd.Series): Each line contains arbitrary text written in text_lang + text_lang (str): Language of the text in series + Returns: + pd.Series: Series with normalized text + """ + if text_lang == "eng": + normalizer = EnglishTextNormalizer() + else: + normalizer = BasicTextNormalizer() + + norm_transcriptions = {} + + for idx, text in transcription_series.items(): + norm_transcriptions[idx] = normalizer(text) + + norm_transcriptions_series = pd.Series(norm_transcriptions) + norm_transcriptions_series.name = transcription_series.name + + return norm_transcriptions_series + + +def compute_asr_bleu( + audio_paths_series: pd.Series, + ref_text_series: pd.Series, + lang: str, + asr_model: Whisper, + whisper_normalize_text: bool = True, + beam_size: int = 1, + temperature: float = 0.0, + return_transcriptions: bool = True, +) -> Tuple[Score, Signature, pd.DataFrame]: + """Wraps functions above to compute corpus-level ASR-BLEU + ASR decoding hyper-parameters are hard coded to ensure reproducibility across evaluations + Args: + audio_paths_series (pd.Series): each line contains path to audio + ref_text_series (pd.Series): each line contains the text reference to compare audio with + lang (str): the language of both audio and ref_text + asr_model: whisper ASR model + whisper_normalize_text (bool): normalize both text hypotheses and reference if True. Defaults to True. + beam_size (int): beam_size for whisper generation + temperature (float): Temperature sampling value for whisper generation + return_transcriptions (bool) + """ + + audio_transcriptions = transcribe_series( + audio_paths_series, + asr_model, + audio_lang=lang, + beam_size=beam_size, + temperature=temperature, + ) + asr_bleu, asr_bleu_signature = compute_corpus_metric_score( + audio_transcriptions, ref_text_series, lang, whisper_normalize_text + ) + asr_bleu_signature.info["whisper_asr_beam_size"] = beam_size + asr_bleu_signature.info["whisper_asr_temperature"] = temperature + asr_bleu_signature.info["whisper_asr_language"] = lang + + transcript_df = None + if return_transcriptions: + transcript_df = pd.concat( + [ + audio_paths_series, + audio_transcriptions, + ref_text_series, + ], + axis=1, + keys=["audio", "transcript", "reference"], + ) + return asr_bleu, asr_bleu_signature, transcript_df + + +def get_tokenizer(lang: str, metric: str = "bleu") -> str: + """Get tokenizer for language + Args: + lang (str): Three letter code of the language + metric (str): Metric being computed. Valid values are "bleu" and "asr" + """ + lang_tok_map = { + "cmn": "char", + "jpn": "char", + "tha": "char", + "lao": "char", + "mya": "char", + } + default = ( + "13a" if metric == "bleu" else "word" + ) # 13a is the default tokenizer for bleu and wer for asr + tok = lang_tok_map.get(lang, default) + return tok + + +def compute_asr_error_rate( + hyp_text_series: pd.Series, + ref_text_series: pd.Series, + lang: str, + whisper_normalize_text: bool = True, +) -> Tuple[float, str]: + """Wraps normalization functions and computes ASR WER/CER score + Args: + hyp_text_series (pd.Series): each line contains s2t model prediction or first pass prediction + ref_text_series (pd.Series): _description_ + lang (str): _description_ + whisper_normalize_text (bool, optional): normalize both text hypotheses and reference if True. Defaults to True. 
+ Returns: + (MetricScore, MetricScoreSignature) + """ + if whisper_normalize_text: + hyp_text_series = whisper_normalize_series(hyp_text_series, lang) + ref_text_series = whisper_normalize_series(ref_text_series, lang) + + tokenizer_name = get_tokenizer(lang, metric="error_rate") + metric_name = wer if tokenizer_name == "word" else cer + metric_score = metric_name(hyp_text_series.to_list(), ref_text_series.to_list()) + return metric_score, f"{metric_name.__name__} is {metric_score}" + + +def compute_corpus_metric_score( + hyp_text_series: pd.Series, + ref_text_series: pd.Series, + lang: str, + whisper_normalize_text: bool = True, + metric: str = "bleu", +) -> Tuple[Score, Signature]: + """Wraps normalization functions and compute corpus-level BLEU/chrF++ score + Args: + hyp_text_series (pd.Series): each line contains s2t model prediction or first pass prediction + ref_text_series (pd.Series): _description_ + lang (str): _description_ + whisper_normalize_text (bool, optional): normalize both text hypotheses and reference if True. Defaults to True. + Returns: + (MetricScore, MetricScoreSignature) + """ + if whisper_normalize_text: + hyp_text_series = whisper_normalize_series(hyp_text_series, lang) + ref_text_series = whisper_normalize_series(ref_text_series, lang) + + tokenizer_name = get_tokenizer(lang) + corpus_metric_score_metric: Union[BLEU, CHRF] + if metric == "bleu": + corpus_metric_score_metric = BLEU( + lowercase=whisper_normalize_text, tokenize=tokenizer_name + ) # lowercase applied if we use whisper_normalize_text + elif metric == "chrF++": + corpus_metric_score_metric = CHRF(word_order=2) + + corpus_metric_score = corpus_metric_score_metric.corpus_score( + hyp_text_series.to_list(), [ref_text_series.to_list()] + ) + corpus_metric_score_signature = corpus_metric_score_metric.get_signature() + corpus_metric_score_signature.info["whisper_normalize"] = whisper_normalize_text + + return corpus_metric_score, corpus_metric_score_signature + + +def compute_quality_metrics( + output_manifest_tsv_path: Path, + output_path: Path, + tgt_lang: str, + task: str, + device: Device, + whisper_model_name: str = "large", + whisper_normalize_text_output: bool = False, + ref_text_col_name: str = "ref_tgt_text", + pred_text_col_name: Optional[str] = "pred_tgt_text", + pred_audio_col_name: str = "pred_tgt_audio", +) -> str: + """Wraps asr and s2t bleu functions to call it with TSV manifest composed on expressivity side + Args: + output_manifest_tsv_path (Path): output manifest which has "ref_text", "hypo_audio", "s2t_out" column names + output_path (Path): Directory to write files with metrics + tgt_lang (str): what language we evaluate on + task (str): Task we are currently evaluating for + device (Device): Device to use for inference + whisper_model_name (str): Whisper model name. Defaults to "large". + whisper_normalize_text_output (bool): Normalizes text output using whisper_normalizer if set to true + ref_text_col_name (str): Column name in the tsv corresponding to reference target text + pred_text_col_name (str): Column name in the tsv corresponding to predicted target text + pred_audio_col_name (str): Column name in the tsv corresponding to predicted target audio. 
+ Setting this value to none will skip speech metrics + """ + df = pd.read_csv( + output_manifest_tsv_path, sep="\t", quoting=3, encoding="utf-8", escapechar="\\" + ) + task = task.upper() + + if not output_path.exists(): + output_path.mkdir(parents=True, exist_ok=True) + + if task in ["S2TT", "S2ST", "T2TT"] and pred_text_col_name: + metric = "chrF++" if task == "T2TT" else "bleu" + text_metric, text_metric_signature = compute_corpus_metric_score( + hyp_text_series=df[pred_text_col_name], + ref_text_series=df[ref_text_col_name], + lang=tgt_lang, + whisper_normalize_text=whisper_normalize_text_output, + metric=metric, + ) + text_metric_json = text_metric.format( + signature=text_metric_signature.format(), is_json=True + ) + + if task == "T2TT": + filename = "t2tt_chrf.json" + cur_task = "T2TT" + else: + filename = ( + "s2tt_bleu_normalized.json" + if whisper_normalize_text_output + else "s2tt_bleu.json" + ) + cur_task = "S2TT" + + with open(output_path / filename, "w") as f: + f.write(text_metric_json) + + logger.info(f"{cur_task} {metric}:\n{text_metric_json}") + + if task in ["T2ST", "S2ST"]: + whisper_model = init_whisper_model(device, whisper_model_name) + ( + asr_bleu_normalized, + asr_bleu_normalized_signature, + transcripts_df, + ) = compute_asr_bleu( + audio_paths_series=df[pred_audio_col_name], + ref_text_series=df[ref_text_col_name], + lang=tgt_lang, + asr_model=whisper_model, + whisper_normalize_text=True, + ) + transcripts_df.to_csv( + (output_path / "whisper_audio_transcriptions.tsv"), + sep="\t", + index=False, + encoding="utf-8", + escapechar="\\", + ) + + asr_bleu_normalized_signature.info["whisper_asr_model"] = whisper_model_name + + asr_bleu_normalized_json = asr_bleu_normalized.format( + signature=asr_bleu_normalized_signature.format(), is_json=True + ) + filename = f"{task.lower()}_asr_bleu_normalized.json" + + with open( + output_path / filename, + "w", + ) as f: + f.write(asr_bleu_normalized_json) + + logger.info(f"{task} ASR Normalized BLEU:\n{asr_bleu_normalized_json}") + + if task == "ASR": + asr_error_rate, asr_error_rate_signature = compute_asr_error_rate( + hyp_text_series=df[pred_text_col_name], + ref_text_series=df[ref_text_col_name], + lang=tgt_lang, + whisper_normalize_text=whisper_normalize_text_output, + ) + d = { + "name": "WER", + "score": asr_error_rate, + "signature": asr_error_rate_signature, + } + asr_error_rate_json = json.dumps(d, indent=1, ensure_ascii=False) + + filename = "asr_error_rate.json" + + with open(output_path / filename, "w") as f: + f.write(asr_error_rate_json) + + logger.info(f"ASR : {asr_error_rate_json}") + + return filename diff --git a/seamless_communication/src/seamless_communication/cli/eval_utils/lang_mapping.py b/seamless_communication/src/seamless_communication/cli/eval_utils/lang_mapping.py new file mode 100644 index 0000000..14e4282 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/eval_utils/lang_mapping.py @@ -0,0 +1,176 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
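Putting the helpers above together, a hedged invocation sketch for compute_quality_metrics on an S2TT manifest; the file names are illustrative, and the TSV must already contain the default ref_tgt_text/pred_tgt_text columns:

from pathlib import Path

import torch

from seamless_communication.cli.eval_utils import compute_quality_metrics

metrics_filename = compute_quality_metrics(
    output_manifest_tsv_path=Path("generate-test.tsv"),  # illustrative manifest path
    output_path=Path("metrics"),
    tgt_lang="fra",
    task="S2TT",
    device=torch.device("cpu"),
    whisper_model_name="base",  # only used for speech tasks; "large" is the default
)
print(metrics_filename)  # "s2tt_bleu.json" with the defaults above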
+ +LANG2_LANG3 = { + "en": "eng", + "ar": "arb", + "as": "asm", + "be": "bel", + "bg": "bul", + "bn": "ben", + "ca": "cat", + "ckb": "ckb", + "cs": "ces", + "cy": "cym", + "da": "dan", + "de": "deu", + "el": "ell", + "es": "spa", + "et": "est", + "fa": "pes", + "fi": "fin", + "fr": "fra", + "ga": "gle", + "hi": "hin", + "hu": "hun", + "id": "ind", + "it": "ita", + "ja": "jpn", + "ka": "kat", + "ky": "kir", + "lg": "lug", + "lt": "lit", + "lv": "lvs", + "mn": "khk", + "mr": "mar", + "mt": "mlt", + "nl": "nld", + "pa": "pan", + "pl": "pol", + "pt": "por", + "ro": "ron", + "ru": "rus", + "sk": "slk", + "sl": "slv", + "sv": "swe", + "sw": "swh", + "ta": "tam", + "th": "tha", + "tr": "tur", + "uk": "ukr", + "ur": "urd", + "uz": "uzn", + "vi": "vie", + "yue": "yue", + "af": "afr", + "is": "isl", + "lb": "ltz", + "no": "nob", + "gl": "glg", + "kea": "kea", + "bs": "bos", + "hr": "hrv", + "mk": "mkd", + "sr": "srp", + "hy": "hye", + "az": "azj", + "kk": "kaz", + "ko": "kor", + "gu": "guj", + "kn": "kan", + "ne": "npi", + "or": "ory", + "sd": "snd", + "te": "tel", + "ceb": "ceb", + "jv": "jav", + "ms": "zlm", + "ml": "mal", + "tl": "tgl", + "tl": "fil", + "my": "mya", + "km": "khm", + "lo": "lao", + "he": "heb", + "ps": "pbt", + "tg": "tgk", + "am": "amh", + "ig": "ibo", + "ln": "lin", + "nso": "nso", + "so": "som", + "xh": "xho", + "yo": "yor", + "zu": "zul", + "kam": "kam", + "luo": "luo", + "ny": "nya", + "om": "gaz", + "sn": "sna", + "umb": "umb", + "ga-IE": "gle", + "pa": "pan", + "sv": "swe", + "ast": "ast", + "ff": "ful", + "mi": "mri", + "ha": "hau", + "wo": "wol", + "oc": "oci", + "ilo": "ilo", + "ba": "bak", + "br": "bre", + "fy": "fry", + "yi": "yid", + "tn": "tsn", + "gd": "gla", + "ht": "hat", + "mg": "mlg", + "ns": "nso", + "si": "sin", + "sq": "sqi", + "ss": "ssw", + "su": "sun", + "zh": "cmn", + "ab": "abk", + "bas": "bas", + "cnh": "cnh", + "cv": "chv", + "dv": "div", + "eo": "epo", + "eu": "eus", + "fy-NL": "fry", + "gn": "grn", + "hsb": "hsb", + "hy": "hye", + "ia": "ina", + "kab": "kab", + "kmr": "kmr", + "mdf": "mdf", + "mhr": "mhr", + "myv": "myv", + "nan-tw": "hbl", + "ne": "npi", + "nn-NO": "nno", + "rm-sursilv": "rm-sursilv", + "rm-vallader": "rm-vallader", + "rw": "kin", + "sah": "sah", + "sat": "sat", + "sc": "srd", + "tig": "tig", + "tok": "tok", + "tt": "tat", + "ug": "uig", + "vot": "vot", + "mrj": "mrj", + "skr": "skr", + "ti": "tir", + "tw": "twi", + "bo": "bod", + "fo": "fao", + "gv": "glv", + "haw": "haw", + "la": "lat", + "sa": "san", + "sco": "sco", + "war": "war", + "he": "heb", + "jw": "jav", + "nn": "nno", + "tk": "tuk", +} +LANG3_LANG2 = {v: k for k, v in LANG2_LANG3.items()} diff --git a/seamless_communication/src/seamless_communication/cli/expressivity/__init__.py b/seamless_communication/src/seamless_communication/cli/expressivity/__init__.py new file mode 100644 index 0000000..5bf98f7 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/expressivity/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
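One detail of the inversion above is worth noting when reading LANG3_LANG2: the dict comprehension keeps only the last two-letter code seen for any given three-letter value (and duplicated keys such as "tl" in LANG2_LANG3 itself collapse to their final entry). A small illustration against the table as written:

from seamless_communication.cli.eval_utils.lang_mapping import LANG2_LANG3, LANG3_LANG2

assert LANG2_LANG3["en"] == "eng"
assert LANG3_LANG2["eng"] == "en"
assert LANG3_LANG2["jav"] == "jw"   # both "jv" and "jw" map to "jav"; the later key wins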
\ No newline at end of file diff --git a/seamless_communication/src/seamless_communication/cli/expressivity/data/__init__.py b/seamless_communication/src/seamless_communication/cli/expressivity/data/__init__.py new file mode 100644 index 0000000..5bf98f7 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/expressivity/data/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. \ No newline at end of file diff --git a/seamless_communication/src/seamless_communication/cli/expressivity/data/prepare_mexpresso.py b/seamless_communication/src/seamless_communication/cli/expressivity/data/prepare_mexpresso.py new file mode 100644 index 0000000..599a265 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/expressivity/data/prepare_mexpresso.py @@ -0,0 +1,234 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +""" +Script to create mExpresso Eng-XXX S2T dataset. +""" + +import argparse +import logging +import multiprocessing as mp +import os +import pandas as pd +import pathlib +import re +import seamless_communication # need this to load dataset cards +import torchaudio + +from pathlib import Path +from tqdm import tqdm +from typing import List, Optional, Tuple + +from fairseq2.assets import asset_store, download_manager + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +def multiprocess_map( + a_list: list, + func: callable, + n_workers: Optional[int] = None, + chunksize: int = 1, + desc=None, +): + if n_workers is None: + n_workers = mp.cpu_count() + n_workers = min(n_workers, mp.cpu_count()) + with mp.get_context("spawn").Pool(processes=n_workers) as pool: + results = list( + tqdm( + pool.imap(func, a_list, chunksize=chunksize), + total=len(a_list), + desc=desc, + ) + ) + return results + + +def convert_to_16khz_wav(config: Tuple[str, str]) -> str: + input_audio, output_audio = config + input_wav, input_sr = torchaudio.load(input_audio) + effects = [ + ["rate", "16000"], + ["channels", "1"], + ] + wav, _ = torchaudio.sox_effects.apply_effects_tensor( + input_wav, input_sr, effects=effects + ) + os.makedirs(Path(output_audio).parent, exist_ok=True) + torchaudio.save( + output_audio, wav, sample_rate=16000, encoding="PCM_S", bits_per_sample=16 + ) + return output_audio + + +def build_en_manifest_from_oss(oss_root: Path, output_folder: Path) -> pd.DataFrame: + # We only open source the following styles + WHITELIST_STYLE = [ + "default", + "default_emphasis", + "default_essentials", + "confused", + "happy", + "sad", + "enunciated", + "whisper", + "laughing", + ] + + results = [] + with open(oss_root / "read_transcriptions.txt") as fin: + for line in fin: + uid, text = line.strip().split("\t") + sps = uid.split("_") + oss_speaker = sps[0] + style = "_".join(sps[1:-1]) + base_style = style.split("_")[0] + if style not in WHITELIST_STYLE: + continue + # Normalize the text to remove and etc + text = re.sub(r" <.*?>", "", text) + text = re.sub(r"<.*?> ", "", text) + results.append( + { + "id": uid, + "speaker": oss_speaker, + "text": text, + "orig_audio": ( + oss_root + / "audio_48khz" + / "read" + / oss_speaker + / base_style + / 
"base" + / f"{uid}.wav" + ).as_posix(), + "label": style, + } + ) + + df = pd.DataFrame(results) + + # Sanity checks + # Check 1: audio files exists + orig_audio_exists = df["orig_audio"].apply(lambda x: os.path.isfile(x)) + assert all(orig_audio_exists), df[~orig_audio_exists].iloc[0]["orig_audio"] + + # Convert 48kHz -> 16kHz + target_audio_root = output_folder / "audio_16khz_wav" + os.makedirs(target_audio_root, exist_ok=True) + input_output_audios = [ + ( + row["orig_audio"], + (target_audio_root / row["speaker"] / (row["id"] + ".wav")).as_posix(), + ) + for i, row in df.iterrows() + ] + logger.info("converting from 48khz to mono 16khz") + multiprocess_map(input_output_audios, convert_to_16khz_wav, chunksize=50) + df.loc[:, "audio"] = [output_audio for _, output_audio in input_output_audios] + audio_exists = df["audio"].apply(lambda x: os.path.isfile(x)) + assert all(audio_exists), df[~audio_exists].iloc[0]["audio"] + output_manifest = f"{output_folder}/en_manifest.tsv" + df.to_csv(output_manifest, sep="\t", quoting=3, index=None) + logger.info(f"Output {len(df)} rows to {output_manifest}") + return df + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Prepare mExpresso Eng-XXX S2T manifest" + ) + parser.add_argument( + "output_folder", + type=lambda p: pathlib.Path(p).resolve(), # always convert to absolute path + help="Output folder for the downsampled Expresso En audios and combined manifest. " + "The output folder path will be expanded to absolute path.", + ) + parser.add_argument( + "--existing-expresso-root", + type=str, + help="Existing root folder if you have downloaded Expresso dataset. " + "The folder path should include 'read_transcriptions.txt' and 'audio_48khz'", + ) + args = parser.parse_args() + + mexpresso_card = asset_store.retrieve_card("mexpresso_text") + mexpresso_root_path = download_manager.download_dataset( + mexpresso_card.field("uri").as_uri(), + "mExpresso_text", + ) + logger.info(f"The mExpresso dataset is downloaded to {mexpresso_root_path}") + mexpresso_path = mexpresso_root_path / "mexpresso_text" + + # downsample all English speech + if args.existing_expresso_root is not None: + logger.info( + f"Re-use user manually downloaded Expresso from {args.existing_expresso_root}" + ) + en_expresso_path = Path(args.existing_expresso_root) + else: + en_expresso_card = asset_store.retrieve_card("expresso") + en_expresso_root_path = download_manager.download_dataset( + en_expresso_card.field("uri").as_uri(), + "Expresso", + ) + logger.info( + f"The English Expresso dataset is downloaded to {en_expresso_root_path}" + ) + en_expresso_path = en_expresso_root_path / "expresso" + en_expresso_folder = args.output_folder / "En_Expresso" + en_expresso_df = build_en_manifest_from_oss( + Path(en_expresso_path), en_expresso_folder + ) + + for subset in ["dev", "test"]: + for lang in ["spa", "fra", "ita", "cmn", "deu"]: + df = pd.read_csv( + f"{mexpresso_path}/{subset}_mexpresso_{lang}.tsv", sep="\t", quoting=3 + ).rename(columns={"text": "tgt_text"}) + num_released_items = len(df) + df = df.merge( + en_expresso_df.rename( + columns={ + "text": "src_text", + "audio": "src_audio", + "speaker": "src_speaker", + } + ), + on="id", + how="inner", + ) + assert ( + len(df) == num_released_items + ), f"Missing items from downloaded En Expresso" + df["src_lang"] = "eng" + df["tgt_lang"] = lang + # Check all the audio files exist + assert all(os.path.isfile(audio) for audio in df["src_audio"].tolist()) + output_manifest_path = args.output_folder / 
f"{subset}_mexpresso_eng_{lang}.tsv" + df[ + [ + "id", + "src_audio", # converted 16kHz audio path + "src_speaker", # source speaker + "src_text", # source text + "src_lang", # source language id + "tgt_text", # target text + "tgt_lang", # target language id + "label", # style of utterance + ] + ].to_csv(output_manifest_path, sep="\t", quoting=3, index=None) + logger.info(f"Output {len(df)} rows to {output_manifest_path}") + + +if __name__ == "__main__": + main() diff --git a/seamless_communication/src/seamless_communication/cli/expressivity/evaluate/__init__.py b/seamless_communication/src/seamless_communication/cli/expressivity/evaluate/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seamless_communication/src/seamless_communication/cli/expressivity/evaluate/evaluate.py b/seamless_communication/src/seamless_communication/cli/expressivity/evaluate/evaluate.py new file mode 100644 index 0000000..84f41c3 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/expressivity/evaluate/evaluate.py @@ -0,0 +1,322 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import argparse +import contextlib +import logging +from argparse import Namespace +from pathlib import Path +from typing import Optional + +import pandas as pd +import torch +import torchaudio +from fairseq2.data import Collater, DataPipeline, FileMapper +from fairseq2.data.audio import ( + AudioDecoder, + WaveformToFbankConverter, + WaveformToFbankOutput, +) +from fairseq2.data.text import StrSplitter, read_text +from fairseq2.typing import DataType, Device +from torch import Tensor +from tqdm import tqdm + +from seamless_communication.cli.expressivity.predict.pretssel_generator import ( + PretsselGenerator, +) +from seamless_communication.cli.m4t.evaluate.evaluate import ( + adjust_output_for_corrupted_inputs, + count_lines, +) +from seamless_communication.cli.m4t.predict import ( + add_inference_arguments, + set_generation_opts, +) +from seamless_communication.inference import BatchedSpeechOutput, Translator +from seamless_communication.models.unity import ( + load_gcmvn_stats, + load_unity_unit_tokenizer, +) +from seamless_communication.store import add_gated_assets + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +def build_data_pipeline( + args: Namespace, + device: Device, + dtype: DataType, + gcmvn_mean: Tensor, + gcmvn_std: Tensor, +) -> DataPipeline: + with open(args.data_file, "r") as f: + header = f.readline().strip("\n").split("\t") + assert ( + args.audio_field in header + ), f"Input file does not contain {args.audio_field} field" + + n_parallel = 4 + + split_tsv = StrSplitter(names=header) + + pipeline_builder = read_text(args.data_file, rtrim=True).skip(1).map(split_tsv) + + assert args.audio_root_dir is not None + + map_file = FileMapper(root_dir=args.audio_root_dir, cached_fd_count=10) + + pipeline_builder.map( + map_file, selector=args.audio_field, num_parallel_calls=n_parallel + ) + + decode_audio = AudioDecoder(dtype=torch.float32, device=device) + + convert_to_fbank = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=False, + device=device, + dtype=dtype, + ) + + def normalize_fbank(data: WaveformToFbankOutput) -> WaveformToFbankOutput: + fbank = data["fbank"] 
+ std, mean = torch.std_mean(fbank, dim=0) + data["fbank"] = fbank.subtract(mean).divide(std) + data["gcmvn_fbank"] = fbank.subtract(gcmvn_mean).divide(gcmvn_std) + return data + + pipeline_builder.map( + [decode_audio, convert_to_fbank, normalize_fbank], + selector=f"{args.audio_field}.data", + num_parallel_calls=n_parallel, + ) + + pipeline_builder.bucket(bucket_size=args.batch_size) + + collate = Collater(pad_value=0, pad_to_multiple=1) + + pipeline_builder.map(collate, num_parallel_calls=n_parallel) + + pipeline_builder.prefetch(4) + + return pipeline_builder.and_return() + + +def main() -> None: + parser = argparse.ArgumentParser(description="Running SeamlessExpressive inference") + parser.add_argument( + "data_file", type=Path, help="Data file (.tsv) to be evaluated." + ) + + parser = add_inference_arguments(parser) + parser.add_argument( + "--gated-model-dir", + type=Path, + required=False, + help="SeamlessExpressive model directory.", + ) + parser.add_argument( + "--batch_size", + type=int, + help="Inference batch size.", + default=4, + ) + parser.add_argument( + "--audio_root_dir", + type=Path, + help="Root directory for the audio filenames in the data file.", + default="", + ) + parser.add_argument( + "--audio_field", + type=str, + help="Field that includes the input audio file paths.", + default="src_audio", + ) + parser.add_argument( + "--ref_field", + type=str, + help="Reference target text field to compute the BLEU score against.", + default=None, + ) + parser.add_argument( + "--duration_factor", + type=float, + help="The duration factor for NAR T2U model.", + default=1.0, + ) + parser.add_argument( + "--output_result_tsv", + type=bool, + help="Whether to output results in tsv format (for full-blown evaluation)", + default=True, + ) + args = parser.parse_args() + + if args.gated_model_dir: + add_gated_assets(args.gated_model_dir) + + if torch.cuda.is_available(): + device = torch.device("cuda:0") + dtype = torch.float16 + else: + device = torch.device("cpu") + dtype = torch.float32 + + unit_tokenizer = load_unity_unit_tokenizer(args.model_name) + + _gcmvn_mean, _gcmvn_std = load_gcmvn_stats(args.vocoder_name) + gcmvn_mean = torch.tensor(_gcmvn_mean, device=device, dtype=dtype) + gcmvn_std = torch.tensor(_gcmvn_std, device=device, dtype=dtype) + + pipeline = build_data_pipeline(args, device, dtype, gcmvn_mean, gcmvn_std) + + translator = Translator( + args.model_name, + vocoder_name_or_card=None, + device=device, + dtype=dtype, + ) + + text_generation_opts, unit_generation_opts = set_generation_opts(args) + + logger.info(f"{text_generation_opts=}") + logger.info(f"{unit_generation_opts=}") + logger.info( + f"unit_generation_ngram_filtering={args.unit_generation_ngram_filtering}" + ) + + pretssel_generator = PretsselGenerator( + args.vocoder_name, + vocab_info=unit_tokenizer.vocab_info, + device=device, + dtype=dtype, + ) + + total_steps = count_lines(args.data_file) - 1 + progress_bar = tqdm(total=total_steps) + + output_path = args.output_path / args.data_file.stem + output_path.mkdir(parents=True, exist_ok=True) + + waveforms_dir = output_path / "waveform" + waveforms_dir.mkdir(parents=True, exist_ok=True) + + hyps = [] + refs = [] + audio_hyps = [] + + with contextlib.ExitStack() as stack: + hyp_file = stack.enter_context( + open(output_path / f"text_output-{args.data_file.stem}.txt", "w") + ) + unit_file = stack.enter_context( + open(output_path / f"unit_output-{args.data_file.stem}.txt", "w") + ) + + sample_id = 0 + for example in pipeline: + valid_sequences: 
Optional[Tensor] = None + src = example[args.audio_field]["data"]["fbank"] + # Skip corrupted audio tensors. + valid_sequences = ~torch.any( + torch.any(torch.isnan(src["seqs"]), dim=1), dim=1 + ) + if not valid_sequences.all(): + logger.warning( + f"Sample IDs {sample_id} to {sample_id + args.batch_size} has some corrupted input." + ) + src["seqs"] = src["seqs"][valid_sequences] + src["seq_lens"] = src["seq_lens"][valid_sequences] + + # Skip performing inference when the input is entirely corrupted. + if src["seqs"].numel() > 0: + prosody_encoder_input = example[args.audio_field]["data"]["gcmvn_fbank"] + text_output, unit_output = translator.predict( + src, + "s2st", + args.tgt_lang, + src_lang=args.src_lang, + text_generation_opts=text_generation_opts, + unit_generation_opts=unit_generation_opts, + unit_generation_ngram_filtering=args.unit_generation_ngram_filtering, + duration_factor=args.duration_factor, + prosody_encoder_input=prosody_encoder_input, + ) + + assert unit_output is not None + speech_output = pretssel_generator.predict( + unit_output.units, + tgt_lang=args.tgt_lang, + prosody_encoder_input=prosody_encoder_input, + ) + + else: + text_output = [] + speech_output = BatchedSpeechOutput(units=[], audio_wavs=[]) + + if valid_sequences is not None and not valid_sequences.all(): + text_output, speech_output = adjust_output_for_corrupted_inputs( # type: ignore[assignment] + valid_sequences, + text_output, + speech_output, + ) + + hyps += [str(s) for s in text_output] + if args.ref_field is not None and args.ref_field in example: + refs += [str(s) for s in example[args.ref_field]] + + for i in range(len(text_output)): + t = text_output[i] + idx = str(example["id"][i]) + hyp_file.write(f"{t}\n") + + u = speech_output.units[i] + str_units = [str(i) for i in u] + unit_file.write(" ".join(str_units) + "\n") + torchaudio.save( + waveforms_dir / f"{idx}_pred.wav", + speech_output.audio_wavs[i][0].to(torch.float32).cpu(), + sample_rate=speech_output.sample_rate, + ) + audio_hyps.append((waveforms_dir / f"{idx}_pred.wav").as_posix()) + + sample_id += 1 + progress_bar.update(1) + + progress_bar.close() + logger.info(f"Processed {len(hyps)} hyps, {len(refs)} refs") + + if args.output_result_tsv: + output_tsv_file = output_path / f"generate-{args.data_file.stem}.tsv" + output_tsv = pd.read_csv(args.data_file, quoting=3, sep="\t") + text_out = [] + with open(hyp_file.name) as file: + for line in file: + text_out.append(line.strip()) + + unit_out = [] + with open(unit_file.name) as file: + for line in file: + unit_out.append(line.strip()) + + output_tsv["hypo_audio"] = audio_hyps + output_tsv["s2t_out"] = text_out + output_tsv["orig_unit"] = unit_out + output_tsv.to_csv(output_tsv_file, quoting=3, sep="\t", index=False) + logger.info(f"Output results in {output_tsv_file}") + + +if __name__ == "__main__": + main() diff --git a/seamless_communication/src/seamless_communication/cli/expressivity/evaluate/post_process_pauserate.py b/seamless_communication/src/seamless_communication/cli/expressivity/evaluate/post_process_pauserate.py new file mode 100644 index 0000000..f421e9b --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/expressivity/evaluate/post_process_pauserate.py @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
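+
+# Post-processing helpers for expressive evaluation: get_pause() aggregates the pause
+# duration/alignment/joint scores of each utterance, weighted by its share of the total
+# pause weight, and get_rate() reports the Spearman correlation between source and target
+# syllable-level speech rates.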
+ +import pandas as pd +import csv +import scipy +from typing import Dict + + +def get_pause(pause_data_tsv: str) -> Dict[str, float]: + utt_pause_align_data = pd.read_csv( + pause_data_tsv, + sep="\t", + quoting=csv.QUOTE_MINIMAL, + ) + metrics = {} + pause_duration_weight = ( + utt_pause_align_data.total_weight / utt_pause_align_data.total_weight.sum() + ) + for score_name in [ + "wmean_duration_score", + "wmean_alignment_score", + "wmean_joint_score", + ]: + metrics[score_name] = ( + utt_pause_align_data[f"{score_name}"] * pause_duration_weight + ).sum() + return metrics + + +def get_rate(target_speech_tsv: str, source_speech_tsv: str) -> float: + speech_unit = "syllable" + + target_speech_df = pd.read_csv( + target_speech_tsv, sep="\t", quoting=csv.QUOTE_MINIMAL + ).set_index("id") + source_speech_df = pd.read_csv( + source_speech_tsv, sep="\t", quoting=csv.QUOTE_MINIMAL + ).set_index("id") + + # using "syllable" speech unit for rate computation + src_speech_rate = source_speech_df[f"speech_rate_{speech_unit}"].to_numpy() + tgt_speech_rate = target_speech_df[f"speech_rate_{speech_unit}"].to_numpy() + src_tgt_spearman = scipy.stats.spearmanr(src_speech_rate, tgt_speech_rate) + return src_tgt_spearman.correlation # type: ignore[no-any-return] diff --git a/seamless_communication/src/seamless_communication/cli/expressivity/evaluate/run_asr_bleu.py b/seamless_communication/src/seamless_communication/cli/expressivity/evaluate/run_asr_bleu.py new file mode 100644 index 0000000..6d70561 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/expressivity/evaluate/run_asr_bleu.py @@ -0,0 +1,33 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from fire import Fire +from seamless_communication.cli.eval_utils.compute_metrics import ( + compute_quality_metrics, +) +from fairseq2.typing import Device +from pathlib import Path + + +def run_asr_bleu_expressive_model( + generation_dir_path: str, + generate_tsv_filename: str, + tgt_lang: str, +): + compute_quality_metrics( + f"{generation_dir_path}/{generate_tsv_filename}", + Path(generation_dir_path), + tgt_lang, + "S2ST", + device=Device("cuda"), + ref_text_col_name="tgt_text", + pred_text_col_name="s2t_out", + pred_audio_col_name="hypo_audio", + ) + + +if __name__ == "__main__": + Fire(run_asr_bleu_expressive_model) diff --git a/seamless_communication/src/seamless_communication/cli/expressivity/predict/__init__.py b/seamless_communication/src/seamless_communication/cli/expressivity/predict/__init__.py new file mode 100644 index 0000000..5bf98f7 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/expressivity/predict/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. \ No newline at end of file diff --git a/seamless_communication/src/seamless_communication/cli/expressivity/predict/predict.py b/seamless_communication/src/seamless_communication/cli/expressivity/predict/predict.py new file mode 100644 index 0000000..7ad0e71 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/expressivity/predict/predict.py @@ -0,0 +1,179 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import argparse +import logging +import torch +import torchaudio +from pathlib import Path + +from fairseq2.data import SequenceData +from fairseq2.data.audio import WaveformToFbankConverter + +from seamless_communication.cli.expressivity.predict.pretssel_generator import ( + PretsselGenerator, +) +from seamless_communication.cli.m4t.predict import ( + add_inference_arguments, + set_generation_opts, +) +from seamless_communication.inference import Translator +from seamless_communication.models.unity import ( + load_gcmvn_stats, + load_unity_unit_tokenizer, +) +from seamless_communication.store import add_gated_assets + + +AUDIO_SAMPLE_RATE = 16000 + + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +def remove_prosody_tokens_from_text(text: str) -> str: + # filter out prosody tokens, there is only emphasis '*', and pause '=' + text = text.replace("*", "").replace("=", "") + text = " ".join(text.split()) + return text + + +def main() -> None: + parser = argparse.ArgumentParser(description="Running SeamlessExpressive inference.") + parser.add_argument("input", type=str, help="Audio WAV file path.") + + parser = add_inference_arguments(parser) + parser.add_argument( + "--gated-model-dir", + type=Path, + required=False, + help="SeamlessExpressive model directory.", + ) + parser.add_argument( + "--duration_factor", + type=float, + help="The duration factor for NAR T2U model.", + default=1.0, + ) + args = parser.parse_args() + + if not args.tgt_lang or args.output_path is None: + raise Exception( + "--tgt_lang, --output_path must be provided for SeamlessExpressive inference." 
+ ) + + if args.gated_model_dir: + add_gated_assets(args.gated_model_dir) + + if torch.cuda.is_available(): + device = torch.device("cuda:0") + dtype = torch.float16 + else: + device = torch.device("cpu") + dtype = torch.float32 + + logger.info(f"Running inference on {device=} with {dtype=}.") + + unit_tokenizer = load_unity_unit_tokenizer(args.model_name) + + translator = Translator( + args.model_name, + vocoder_name_or_card=None, + device=device, + dtype=dtype, + ) + + pretssel_generator = PretsselGenerator( + args.vocoder_name, + vocab_info=unit_tokenizer.vocab_info, + device=device, + dtype=dtype, + ) + + fbank_extractor = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=False, + device=device, + dtype=dtype, + ) + + _gcmvn_mean, _gcmvn_std = load_gcmvn_stats(args.vocoder_name) + gcmvn_mean = torch.tensor(_gcmvn_mean, device=device, dtype=dtype) + gcmvn_std = torch.tensor(_gcmvn_std, device=device, dtype=dtype) + + wav, sample_rate = torchaudio.load(args.input) + wav = torchaudio.functional.resample(wav, orig_freq=sample_rate, new_freq=16_000) + wav = wav.transpose(0, 1) + + data = fbank_extractor( + { + "waveform": wav, + "sample_rate": 16000, + } + ) + fbank = data["fbank"] + gcmvn_fbank = fbank.subtract(gcmvn_mean).divide(gcmvn_std) + std, mean = torch.std_mean(fbank, dim=0) + fbank = fbank.subtract(mean).divide(std) + + src = SequenceData( + seqs=fbank.unsqueeze(0), + seq_lens=torch.LongTensor([fbank.shape[0]]), + is_ragged=False, + ) + src_gcmvn = SequenceData( + seqs=gcmvn_fbank.unsqueeze(0), + seq_lens=torch.LongTensor([gcmvn_fbank.shape[0]]), + is_ragged=False, + ) + + text_generation_opts, unit_generation_opts = set_generation_opts(args) + + logger.info(f"{text_generation_opts=}") + logger.info(f"{unit_generation_opts=}") + logger.info( + f"unit_generation_ngram_filtering={args.unit_generation_ngram_filtering}" + ) + + text_output, unit_output = translator.predict( + src, + "s2st", + args.tgt_lang, + text_generation_opts=text_generation_opts, + unit_generation_opts=unit_generation_opts, + unit_generation_ngram_filtering=args.unit_generation_ngram_filtering, + duration_factor=args.duration_factor, + prosody_encoder_input=src_gcmvn, + ) + + assert unit_output is not None + speech_output = pretssel_generator.predict( + unit_output.units, + tgt_lang=args.tgt_lang, + prosody_encoder_input=src_gcmvn, + ) + + logger.info(f"Saving expressive translated audio in {args.tgt_lang}") + torchaudio.save( + args.output_path, + speech_output.audio_wavs[0][0].to(torch.float32).cpu(), + sample_rate=speech_output.sample_rate, + ) + + text_out = remove_prosody_tokens_from_text(str(text_output[0])) + + logger.info(f"Translated text in {args.tgt_lang}: {text_out}") + + +if __name__ == "__main__": + main() diff --git a/seamless_communication/src/seamless_communication/cli/expressivity/predict/pretssel_generator.py b/seamless_communication/src/seamless_communication/cli/expressivity/predict/pretssel_generator.py new file mode 100644 index 0000000..0754e33 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/expressivity/predict/pretssel_generator.py @@ -0,0 +1,100 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
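+
+# PretsselGenerator wraps the PretSSEL vocoder model: it appends an EOS token to each
+# predicted unit sequence, collapses repeated units into (unit, duration) pairs with
+# torch.unique_consecutive, and renders waveforms conditioned on the target language and
+# the prosody encoder input.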
+ +from typing import List + +import torch +from torch.nn import Module + +from fairseq2.typing import DataType, Device + +from fairseq2.assets import asset_store +from fairseq2.data import ( + Collater, + SequenceData, + VocabularyInfo, +) +from fairseq2.nn.padding import get_seqs_and_padding_mask + +from seamless_communication.inference import BatchedSpeechOutput +from seamless_communication.models.generator.loader import load_pretssel_vocoder_model + + +class PretsselGenerator(Module): + def __init__( + self, + pretssel_name_or_card: str, + vocab_info: VocabularyInfo, + device: Device, + dtype: DataType = torch.float16, + ): + super().__init__() + # Load the model. + if device == torch.device("cpu"): + dtype = torch.float32 + + self.device = device + self.dtype = dtype + + self.pretssel_model = load_pretssel_vocoder_model( + pretssel_name_or_card, + device=device, + dtype=dtype, + ) + self.pretssel_model.eval() + + vocoder_model_card = asset_store.retrieve_card(pretssel_name_or_card) + self.output_sample_rate = vocoder_model_card.field("sample_rate").as_(int) + + self.vocab_info = vocab_info + self.unit_collate = Collater(pad_value=vocab_info.pad_idx) + self.duration_collate = Collater(pad_value=0) + self.unit_eos_token = torch.tensor([vocab_info.eos_idx], device=device) + + @torch.inference_mode() + def predict( + self, + units: List[List[int]], + tgt_lang: str, + prosody_encoder_input: SequenceData, + ) -> BatchedSpeechOutput: + + units_batch, durations = [], [] + for u in units: + unit = torch.tensor(u).to(self.unit_eos_token) + + # adjust the control symbols for the embedding + unit += 4 + unit = torch.cat([unit, self.unit_eos_token], dim=0) + + unit, duration = torch.unique_consecutive(unit, return_counts=True) + + # adjust for the last eos token + duration[-1] = 0 + + units_batch.append(unit) + durations.append(duration * 2) + + speech_units = self.unit_collate(units_batch) + durations = self.duration_collate(durations)["seqs"] + + units_tensor, unit_padding_mask = get_seqs_and_padding_mask(speech_units) + prosody_input_seqs, prosody_padding_mask = get_seqs_and_padding_mask( + prosody_encoder_input + ) + + audio_wavs = self.pretssel_model( + units_tensor, + tgt_lang, + prosody_input_seqs, + padding_mask=unit_padding_mask, + prosody_padding_mask=prosody_padding_mask, + durations=durations, + ) + return BatchedSpeechOutput( + units=units, + audio_wavs=audio_wavs, + sample_rate=self.output_sample_rate, + ) diff --git a/seamless_communication/src/seamless_communication/cli/m4t/__init__.py b/seamless_communication/src/seamless_communication/cli/m4t/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seamless_communication/src/seamless_communication/cli/m4t/audio_to_units/README.md b/seamless_communication/src/seamless_communication/cli/m4t/audio_to_units/README.md new file mode 100644 index 0000000..acfbd8e --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/audio_to_units/README.md @@ -0,0 +1,19 @@ +# Convert raw audio into units (unit_extraction) + +Raw audio needs to be converted to units to train UnitY models and vocoders. Units act as supervision for UnitY models, and are the input to the vocoders which synthesize speech from these units. + +The unit extraction pipeline comprises the following steps: +- Compute features from layer 35 (determined empirically) of the pretrained XLSR v2 model ([paper](https://arxiv.org/abs/2111.09296)), which is a wav2vec2 model at the core. 
+- Assign features for each timestep to a collection of precomputed K-Means centroids to produce a sequence of units similar to extracting Hubert units as described in this [paper](https://arxiv.org/pdf/2107.05604.pdf). + + +## Quick start: +`audio_to_units` is run with the CLI from the root directory of the repository. + +```bash +m4t_audio_to_units <path_to_input_audio> +``` + +`audio_to_units` uses `UnitExtractor`, which provides a `predict` method to convert audio to units. + +The convenience method `resynthesize_audio` of `UnitExtractor` can be used to resynthesize audio waveforms from units. diff --git a/seamless_communication/src/seamless_communication/cli/m4t/audio_to_units/__init__.py b/seamless_communication/src/seamless_communication/cli/m4t/audio_to_units/__init__.py new file mode 100644 index 0000000..5d20256 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/audio_to_units/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/src/seamless_communication/cli/m4t/audio_to_units/audio_to_units.py b/seamless_communication/src/seamless_communication/cli/m4t/audio_to_units/audio_to_units.py new file mode 100644 index 0000000..3813597 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/audio_to_units/audio_to_units.py @@ -0,0 +1,56 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import argparse +import logging + +import torch + +from seamless_communication.models.unit_extractor import UnitExtractor + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser( + description="Convert raw audio to units (and optionally audio) using UnitExtractor." + ) + parser.add_argument("audio", type=str, help="Audio WAV file path.") + parser.add_argument( + "--kmeans_uri", + type=str, + help="URL path to the K-Means model.", + default="https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/kmeans_10k.npy", + ) + parser.add_argument( + "--model_name", + type=str, + help="Feature extraction model name (`xlsr2_1b_v2`)", + default="xlsr2_1b_v2", + ) + parser.add_argument( + "--out_layer_number", + type=int, + help="Layer number of the feature extraction model to pull out features from.", + default=35, + ) + + args = parser.parse_args() + + if torch.cuda.is_available(): + device = torch.device("cuda:0") + logger.info("Running unit_extraction on the GPU.") + else: + device = torch.device("cpu") + logger.info("Running unit_extraction on the CPU.") + + unit_extractor = UnitExtractor(args.model_name, args.kmeans_uri, device=device) + units = unit_extractor.predict(args.audio, args.out_layer_number - 1) + logger.info(f"Converted to units: {units}") + + +if __name__ == "__main__": + main() diff --git a/seamless_communication/src/seamless_communication/cli/m4t/evaluate/README.md b/seamless_communication/src/seamless_communication/cli/m4t/evaluate/README.md new file mode 100644 index 0000000..4d58620 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/evaluate/README.md @@ -0,0 +1,21 @@ +# Evaluating SeamlessM4T models + +Refer to the [SeamlessM4T README](../../../../../docs/m4t) for an overview of the M4T models.
+ +Refer to the [inference README](../predict/README.md) for how to run inference with SeamlessM4T models. + +## Quick start: +We use the SacreBLEU library to compute BLEU scores and the [JiWER library](https://github.com/jitsi/jiwer) to compute CER and WER scores. + +Evaluation can be run with the CLI from the root directory of the repository. + +The model can be specified with `--model_name`: `seamlessM4T_v2_large`, `seamlessM4T_large`, or `seamlessM4T_medium`. + +```bash +m4t_evaluate --data_file <path_to_data_tsv_file> --task <task_name> --tgt_lang <tgt_lang> --output_path <output_dir> --ref_field <ref_field_name> --audio_root_dir <audio_root_dir> +``` +## Note +1. We use raw (unnormalized) references to compute BLEU scores for the S2TT and T2TT tasks. +2. For the ASR task, `src_lang` needs to be set to the same value as `tgt_lang`. +3. The `--src_lang` arg needs to be specified to run evaluation for the T2TT task. + diff --git a/seamless_communication/src/seamless_communication/cli/m4t/evaluate/__init__.py b/seamless_communication/src/seamless_communication/cli/m4t/evaluate/__init__.py new file mode 100644 index 0000000..5d20256 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/evaluate/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/src/seamless_communication/cli/m4t/evaluate/evaluate.py b/seamless_communication/src/seamless_communication/cli/m4t/evaluate/evaluate.py new file mode 100644 index 0000000..6205bd3 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/evaluate/evaluate.py @@ -0,0 +1,440 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import argparse +import contextlib +import itertools +import logging +import subprocess +from argparse import Namespace +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import torch +import torchaudio +from fairseq2.data import Collater, DataPipeline, FileMapper +from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter +from fairseq2.data.text import StrSplitter, TextTokenizer, read_text +from fairseq2.data.typing import StringLike +from fairseq2.typing import DataType, Device +from torch import Tensor +from tqdm import tqdm + +from seamless_communication.cli.eval_utils import ( + compute_quality_metrics, +) +from seamless_communication.cli.m4t.predict import ( + add_inference_arguments, + set_generation_opts, +) +from seamless_communication.inference import ( + BatchedSpeechOutput, + Modality, + SequenceGeneratorOptions, + Translator, +) +from seamless_communication.models.unity import load_unity_text_tokenizer + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +@dataclass +class EvalContext: + task: str + """String representing the task.
Valid choices are + "S2ST", "S2TT", "T2ST", "T2TT", "ASR".""" + + input_modality: Modality + """The input modality of the task.""" + + output_modality: Modality + """The output modality of the task.""" + + model_name: str + """The name of the S2T UnitY model.""" + + data_file: Path + """The pathname of the test TSV data file.""" + + audio_root_dir: Optional[Path] + """The pathname of the directory under which + audio files are stored.""" + + target_lang: str + """The target translation language.""" + + source_lang: Optional[str] + """The source language.""" + + batch_size: int + """The batch size for model input.""" + + device: Device + """The device on which to run inference.""" + + dtype: DataType + """The data type with which to run inference.""" + + output_path: Path + """The pathname of the output directory to save + the evaluation results.""" + + ref_field: str + """The reference target text field to compute + the BLEU score against.""" + + text_generation_opts: SequenceGeneratorOptions + """Text generation hyperparameters.""" + + unit_generation_opts: Optional[SequenceGeneratorOptions] + """Unit generation hyperparameters, not applicable + for the NAR T2U decoder.""" + + unit_generation_ngram_filtering: bool + """If True, removes consecutive repeating ngrams + from the decoded unit output.""" + + +def count_lines(filename: Path) -> int: + result = subprocess.run(["wc", "-l", filename], stdout=subprocess.PIPE) + return int(result.stdout.decode().split()[0]) + + +def build_data_pipeline( + ctx: EvalContext, + text_tokenizer: TextTokenizer, +) -> DataPipeline: + with open(ctx.data_file, "r") as f: + header = f.readline().strip("\n").split("\t") + first_example = f.readline().strip("\n").split("\t") + + # TODO: This will be soon auto-tuned. Right now hand-tuned for devfair. + n_parallel = 4 + + split_tsv = StrSplitter(names=header) + + pipeline_builder = read_text(ctx.data_file, rtrim=True).skip(1).map(split_tsv) + + if ctx.input_modality == Modality.SPEECH: + assert ctx.audio_root_dir is not None + + map_file = FileMapper(root_dir=ctx.audio_root_dir, cached_fd_count=10) + + pipeline_builder.map(map_file, selector="audio", num_parallel_calls=n_parallel) + + decode_audio = AudioDecoder(dtype=torch.float32, device=ctx.device) + + convert_to_fbank = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=True, + device=ctx.device, + dtype=ctx.dtype, + ) + + pipeline_builder.map( + [decode_audio, convert_to_fbank], + selector="audio.data", + num_parallel_calls=n_parallel, + ) + else: + if "src_lang" in header: + source_lang = first_example[header.index("src_lang")] + ctx.source_lang = source_lang + elif ctx.source_lang is None: + raise ValueError( + ( + "'src_lang' is missing in the data_file" + "header and in the arguments." 
+ ) + ) + + token_encoder = text_tokenizer.create_encoder( + task="translation", lang=source_lang, mode="source", device=ctx.device + ) + pipeline_builder.map( + [token_encoder], + selector="src_text", + num_parallel_calls=n_parallel, + ) + + pipeline_builder.bucket(bucket_size=ctx.batch_size) + + collate = Collater(pad_value=0, pad_to_multiple=1) + + pipeline_builder.map(collate, num_parallel_calls=n_parallel) + + pipeline_builder.prefetch(4) + + return pipeline_builder.and_return() + + +def adjust_output_for_corrupted_inputs( + valid_sequences: Tensor, + text_output: List[StringLike], + speech_output: Optional[BatchedSpeechOutput], +) -> Tuple[List[StringLike], Optional[BatchedSpeechOutput]]: + adjusted_text_output: List[StringLike] = [] + adjusted_speech_output: Optional[BatchedSpeechOutput] = None + + if speech_output is not None: + assert ( + len(text_output) + == len(speech_output.units) + == len(speech_output.audio_wavs) + ) + adjusted_speech_output = BatchedSpeechOutput(units=[], audio_wavs=[]) + + batch_counter = 0 + for is_valid in valid_sequences: + if is_valid: + adjusted_text_output.append(text_output[batch_counter]) + if speech_output is not None: + assert adjusted_speech_output is not None + adjusted_speech_output.units.append(speech_output.units[batch_counter]) + adjusted_speech_output.audio_wavs.append( + speech_output.audio_wavs[batch_counter] + ) + batch_counter += 1 + else: + # For the corrupted inputs, we save the following dummy outputs: + # empty string for text, empty list for units, 1 second of silence for audio. + adjusted_text_output.append("") + if adjusted_speech_output is not None: + sample_rate = adjusted_speech_output.sample_rate + adjusted_speech_output.units.append([]) + adjusted_speech_output.audio_wavs.append( + torch.zeros(sample_rate).unsqueeze(0).unsqueeze(0) + ) + return ( + adjusted_text_output, + adjusted_speech_output, + ) + + +def run_eval( + translator: Translator, + text_tokenizer: TextTokenizer, + ctx: EvalContext, + whisper_model_name: str, +) -> None: + pipeline = build_data_pipeline(ctx, text_tokenizer) + + total_steps = count_lines(ctx.data_file) - 1 + progress_bar = tqdm(total=total_steps) + + output_path = ctx.output_path / ctx.data_file.stem + output_path.mkdir(parents=True, exist_ok=True) + + if ctx.output_modality == Modality.SPEECH: + waveforms_dir = output_path / f"waveform_{ctx.data_file.stem}" + waveforms_dir.mkdir(parents=True, exist_ok=True) + + model_outputs_tsv = output_path / f"model-outputs-{ctx.data_file.stem}.txt" + unit_outputs_tsv = output_path / f"unit_output-{ctx.data_file.stem}.txt" + with open(model_outputs_tsv, "w") as hyp_file, open( + unit_outputs_tsv, "w" + ) if ctx.output_modality == Modality.SPEECH else contextlib.nullcontext( + itertools.repeat(None) + ) as unit_file: + sample_id = 0 + if ctx.output_modality == Modality.SPEECH: + hyp_file.write("ref_tgt_text\tpred_tgt_text\tpred_tgt_audio\n") + else: + hyp_file.write("ref_tgt_text\tpred_tgt_text\n") + for example in pipeline: + valid_sequences: Optional[Tensor] = None + if ctx.input_modality == Modality.SPEECH: + src = example["audio"]["data"]["fbank"] + # Skip corrupted audio tensors. + valid_sequences = ~torch.any( + torch.any(torch.isnan(src["seqs"]), dim=1), dim=1 + ) + if not valid_sequences.all(): + logger.warning( + f"Sample IDs {sample_id} to {sample_id + ctx.batch_size} has some corrupted input." 
+ ) + src["seqs"] = src["seqs"][valid_sequences] + src["seq_lens"] = src["seq_lens"][valid_sequences] + else: + src = example["src_text"] + + # Skip performing inference when the input is entirely corrupted. + if src["seqs"].numel() > 0: + (text_output, speech_output,) = translator.predict( + src, + ctx.task, + ctx.target_lang, + src_lang=ctx.source_lang, + text_generation_opts=ctx.text_generation_opts, + unit_generation_opts=ctx.unit_generation_opts, + unit_generation_ngram_filtering=ctx.unit_generation_ngram_filtering, + ) + else: + text_output = [] + if ctx.output_modality == Modality.SPEECH: + speech_output = BatchedSpeechOutput(units=[], audio_wavs=[]) + else: + speech_output = None + + if valid_sequences is not None and not valid_sequences.all(): + (text_output, speech_output,) = adjust_output_for_corrupted_inputs( + valid_sequences, + text_output, + speech_output, + ) + + hyps = [str(s) for s in text_output] + refs = [str(s) for s in example[ctx.ref_field]] + + for i in range(len(text_output)): + if ctx.output_modality == Modality.SPEECH: + assert speech_output is not None + u = speech_output.units[i] + str_units = [str(i) for i in u] + unit_file.write(" ".join(str_units) + "\n") + wav_fp = str(waveforms_dir / f"{sample_id}_pred.wav") + torchaudio.save( + wav_fp, + speech_output.audio_wavs[i][0].to(torch.float32).cpu(), + sample_rate=speech_output.sample_rate, + ) + hyp_file.write(f"{refs[i]}\t{hyps[i]}\t{wav_fp}\n") + else: + hyp_file.write(f"{refs[i]}\t{hyps[i]}\n") + + sample_id += 1 + progress_bar.update(1) + + progress_bar.close() + logger.info(f"Processed {sample_id} samples") + + compute_quality_metrics( + output_manifest_tsv_path=model_outputs_tsv, + output_path=output_path, + tgt_lang=ctx.target_lang, + task=ctx.task, + device=ctx.device, + whisper_model_name=whisper_model_name, + ) + + +def main(optional_args: Optional[Dict[str, Any]] = None) -> None: + parser = argparse.ArgumentParser( + description="M4T evaluation for tasks supported by Translator." + ) + parser.add_argument( + "--data_file", type=str, help="Data file (.tsv) to be evaluated." + ) + + parser = add_inference_arguments(parser) + parser.add_argument( + "--batch_size", + type=int, + help="Inference batch size.", + default=4, + ) + parser.add_argument( + "--audio_root_dir", + type=str, + help="Root directory for the audio filenames in the data file.", + default="", + ) + parser.add_argument( + "--ref_field", + type=str, + help="Reference target text field to compute the BLEU score against.", + default="tgt_text", + ) + parser.add_argument( + "--whisper_model_name", + type=str, + help="Whisper model to be used for ASR-BLEU scoring", + default="large", + ) + args, unknown = parser.parse_known_args() + default_args = vars(args) + default_args.update(optional_args) if optional_args else default_args + args = Namespace(**default_args) + + if not args.data_file or not args.task or not args.tgt_lang: + raise Exception( + "Please provide required arguments for evaluation - data_file, task, tgt_lang" + ) + + if not Path(args.data_file).exists(): + raise ValueError(f"Invalid data_file to be evaluated: {args.data_file}") + + input_modality, output_modality = Translator.get_modalities_from_task_str(args.task) + + if input_modality == Modality.SPEECH and not Path(args.audio_root_dir).exists(): + raise ValueError( + f"Invalid audio_root_dir: {args.audio_root_dir} for speech input." 
+ ) + + if torch.cuda.is_available(): + device = torch.device("cuda:0") + dtype = torch.float16 + else: + device = torch.device("cpu") + dtype = torch.float32 + + text_tokenizer = load_unity_text_tokenizer(args.model_name) + + # TODO: Avoid loading the T2U model, vocoder when the output + # modality is text. + translator = Translator( + args.model_name, + args.vocoder_name, + device, + text_tokenizer=text_tokenizer, + dtype=dtype, + input_modality=input_modality, + output_modality=output_modality, + ) + + text_generation_opts, unit_generation_opts = set_generation_opts(args) + + logger.info(f"{text_generation_opts=}") + logger.info(f"{unit_generation_opts=}") + logger.info( + f"unit_generation_ngram_filtering={args.unit_generation_ngram_filtering}" + ) + + # fmt: off + ctx = EvalContext( + task=args.task, + input_modality=input_modality, + output_modality=output_modality, + model_name=args.model_name, + data_file=Path(args.data_file), + audio_root_dir=Path(args.audio_root_dir), + target_lang=args.tgt_lang, + source_lang=args.src_lang, + batch_size=args.batch_size, + device=device, + dtype=dtype, + ref_field=args.ref_field, + text_generation_opts=text_generation_opts, + unit_generation_opts=unit_generation_opts, + unit_generation_ngram_filtering=args.unit_generation_ngram_filtering, + output_path=args.output_path, + ) + # fmt: on + logger.info(f"Running inference on {device=} with {dtype=}, {ctx.batch_size=}.") + + run_eval(translator, text_tokenizer, ctx, args.whisper_model_name) + + +if __name__ == "__main__": + main() diff --git a/seamless_communication/src/seamless_communication/cli/m4t/finetune/README.md b/seamless_communication/src/seamless_communication/cli/m4t/finetune/README.md new file mode 100644 index 0000000..8032f9b --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/finetune/README.md @@ -0,0 +1,133 @@ +## Finetuning scripts for M4T + +This section demonstrates an example of M4T finetuning on a single translation direction: English-to-Korean. + +The trainer and dataloader were designed mainly for demonstration purposes. Their simplicity should facilitate the code transparency and portability. + +## Data preparation + +M4T training dataset is a multimodal parallel corpus. Each training sample has four parts: audio and text representation of the sample in the source language, and its corresponding audio and text representation in the target language. + +That kind of dataset can be prepared using `dataset.py` script that downloads FLEURS dataset from [HuggingFace datasets hub](https://huggingface.co/datasets/google/fleurs), (optionally) extracts units from the target audio samples, and prepares a manifest consumable by `finetune.py`. Manifest is a text file where each line represents information about a single dataset sample, serialized in JSON format. + +List of input arguments for `dataset.py`: + +```bash + --source_lang SOURCE_LANG + M4T langcode of the dataset SOURCE language + --target_lang TARGET_LANG + M4T langcode of the dataset TARGET language + --split SPLIT Dataset split/shard to download (`train`, `test`) + --save_dir SAVE_DIR Directory where the datasets will be stored with HuggingFace datasets cache files +``` + +Language codes should follow the notation adopted by M4T models. 
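+
+If you want to sanity-check a manifest produced by the preparation script below, each line can be parsed back with the same helper that the finetuning dataloader (`dataloader.py`) uses. The snippet is only an illustrative sketch; the manifest path and the printed fields are placeholders:
+
+```python
+import json
+
+from seamless_communication.datasets.datatypes import LangPairSample
+
+# Parse the first manifest line the same way dataloader.py does.
+with open("train_manifest.json") as manifest:
+    sample = LangPairSample.from_json(json.loads(manifest.readline()))
+
+print(sample.source.lang, "->", sample.target.lang)  # M4T language codes
+print(sample.target.text)                            # target-side text
+print(len(sample.target.units or []))                # number of extracted target units, if any
+```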
+ +Below is an example bash script that prepares a training and evaluation dataset for the translation direction English-to-Korean: + +```bash +export DATASET_DIR=~/m4t_dataset +mkdir -p $DATASET_DIR + +m4t_prepare_dataset \ + --source_lang eng \ + --target_lang kor \ + --split train \ + --save_dir $DATASET_DIR +m4t_prepare_dataset \ + --source_lang eng \ + --target_lang kor \ + --split validation \ + --save_dir $DATASET_DIR +``` + + +Output manifests will be stored in `${DATASET_DIR}/train_manifest.json` and `${DATASET_DIR}/validation_manifest.json`. + + +## Finetuning + +`finetune.py` is an example finetuning script that initializes dataloaders, and launches training loop with periodic scoring against the validation dataset. +It is recommended to launch it with [`torchrun`](https://pytorch.org/docs/stable/elastic/run.html). Multi-gpu and multi-node training are supported out of the box. + +List of input arguments for `finetune.py`: + +```bash + --train_dataset TRAIN_DATASET + Path to manifest with train samples + --eval_dataset EVAL_DATASET + Path to manifest with eval samples + --model_name MODEL_NAME + Base model name (e.g, `seamlessM4T_medium`, `seamlessM4T_large`) + --save_model_to SAVE_MODEL_TO + Path to save best finetuned model + --seed SEED Randomizer seed value + --batch_size BATCH_SIZE + Batch size for training and evaluation + --patience PATIENCE Set early termination after `patience` number of evaluations without eval loss improvements + --max_epochs MAX_EPOCHS + Max number of training epochs + --learning_rate LEARNING_RATE + Finetuning learning rate + --warmup_steps WARMUP_STEPS + Number of steps with linearly increasing learning rate + --eval_steps EVAL_STEPS + Get eval loss after each `eval_steps` training steps + --log_steps LOG_STEPS + Log inner loss after each `log_steps` training steps + --mode {FinetuneMode.SPEECH_TO_SPEECH,FinetuneMode.SPEECH_TO_TEXT,FinetuneMode.TEXT_TO_SPEECH} + * `SPEECH_TO_SPEECH` -- finetune S2T and T2U parts of the model; + * `TEXT_TO_SPEECH` -- finetune only T2U; + * `SPEECH_TO_TEXT` -- finetune only S2T +``` + +The scripts supports three modes of finetuning: +- `SPEECH_TO_SPEECH`: in this case all model weights except the text encoder will be engaged; +- `TEXT_TO_SPEECH`: only text-to-unit part of the model will be engaged in the finetuning, other weights will be frozen; +- `SPEECH_TO_TEXT`: only speech-to-text part of the model will be engaged in the finetuning. + +The referenced finetuning script does not support finetuning of the text encoder. + + +Below is an example bash script that launches finetuning of M4T-large on the dataset prepared earlier, using a single node with eight GPUs: + +``` +torchrun \ + --rdzv-backend=c10d \ + --rdzv-endpoint=localhost:0 \ + --nnodes=1 \ + --nproc-per-node=8 \ + --no-python \ + m4t_finetune \ + --mode SPEECH_TO_TEXT \ + --train_dataset $DATASET_DIR/train_manifest.json \ + --eval_dataset $DATASET_DIR/validation_manifest.json \ + --learning_rate 1e-6 \ + --warmup_steps 100 \ + --max_epochs 10 \ + --patience 3 \ + --model_name seamlessM4T_large \ + --save_model_to $DATASET_DIR/checkpoint.pt +``` + +Excerpt from an example finetuning log: + +``` +... 
+2023-08-21 14:46:16,936 INFO -- trainer.1100368: Eval after 300 updates: loss=8.7755 best_loss=8.7755 patience_steps_left=3 +2023-08-21 14:46:16,936 INFO -- trainer.1100368: Saving model +2023-08-21 14:46:35,863 INFO -- trainer.1100368: Epoch 006 / update 00310: train loss=16.3768 last lr=5.68E-08 +2023-08-21 14:46:42,610 INFO -- trainer.1100368: Epoch 006 / update 00320: train loss=16.3730 last lr=5.59E-08 +2023-08-21 14:46:48,285 INFO -- trainer.1100368: Epoch 006 / update 00330: train loss=16.4598 last lr=5.50E-08 +2023-08-21 14:46:54,390 INFO -- trainer.1100368: Epoch 006 / update 00340: train loss=16.4218 last lr=5.42E-08 +2023-08-21 14:47:08,461 INFO -- trainer.1100368: Epoch 006 / update 00350: train loss=16.3906 last lr=5.35E-08 +2023-08-21 14:47:09,067 INFO -- trainer.1100368: Run evaluation +2023-08-21 14:47:19,205 INFO -- trainer.1100368: Eval after 350 updates: loss=8.7462 best_loss=8.7462 patience_steps_left=3 +2023-08-21 14:47:19,205 INFO -- trainer.1100368: Saving model +2023-08-21 14:47:44,981 INFO -- trainer.1100368: Epoch 007 / update 00360: train loss=16.4267 last lr=5.27E-08 +2023-08-21 14:47:51,383 INFO -- trainer.1100368: Epoch 007 / update 00370: train loss=16.3630 last lr=5.20E-08 +2023-08-21 14:47:58,305 INFO -- trainer.1100368: Epoch 007 / update 00380: train loss=16.3666 last lr=5.13E-08 +2023-08-21 14:48:04,396 INFO -- trainer.1100368: Epoch 007 / update 00390: train loss=16.3605 last lr=5.06E-08 +2023-08-21 14:48:10,630 INFO -- trainer.1100368: Epoch 007 / update 00400: train loss=16.3518 last lr=5.00E-08 +... +``` diff --git a/seamless_communication/src/seamless_communication/cli/m4t/finetune/__init__.py b/seamless_communication/src/seamless_communication/cli/m4t/finetune/__init__.py new file mode 100644 index 0000000..5d20256 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/finetune/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/src/seamless_communication/cli/m4t/finetune/dataloader.py b/seamless_communication/src/seamless_communication/cli/m4t/finetune/dataloader.py new file mode 100644 index 0000000..5f797d7 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/finetune/dataloader.py @@ -0,0 +1,230 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
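+
+# UnitYDataLoader builds MultimodalSeqsBatch objects from manifest samples: 80-dim fbank
+# features of the source audio for the speech encoder, and shifted target text / unit token
+# sequences (prev_output_tokens vs. target_tokens) for the S2T and T2U decoders.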
+ + +import json +import logging +from dataclasses import dataclass +from typing import Any, Dict, Iterable, List, Optional + +import numpy as np +import torch +import torchaudio +import torchaudio.compliance.kaldi as ta_kaldi +from datasets import Dataset +from datasets.distributed import split_dataset_by_node +from fairseq2.data.text import TextTokenEncoder +from fairseq2.models.nllb import NllbTokenizer +from torch import Tensor +from torch.nn.functional import pad as pad_tensor +from torch.utils.data import DataLoader + +from seamless_communication.datasets.datatypes import LangPairSample +from seamless_communication.models.unity.unit_tokenizer import ( + UnitTokenEncoder, + UnitTokenizer, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class SeqsBatch: + src_tokens: Optional[Tensor] + src_lengths: Optional[Tensor] + target_tokens: Optional[Tensor] + prev_output_tokens: Optional[Tensor] + target_lengths: Optional[Tensor] + + def __del__(self) -> None: + """Explicitly delete tensors + to force GPU memory cleanup""" + for tensor in [ + self.src_tokens, + self.src_lengths, + self.target_tokens, + self.prev_output_tokens, + self.target_lengths, + ]: + if tensor is not None: + del tensor + + +@dataclass +class MultimodalSeqsBatch: + speech_to_text: SeqsBatch + text_to_units: SeqsBatch + + def __del__(self) -> None: + del self.speech_to_text + del self.text_to_units + + +@dataclass +class BatchingConfig: + fbank_feats_pad_idx: int = 0 + """The pad index to use in fbanks batching.""" + + batch_size: int = 5 + + rank: int = 0 + """The rank of this worker in the process group.""" + + world_size: int = 1 + """The world size of the process group.""" + + num_workers: int = 2 + """Parallelism in dataset preparation.""" + + float_dtype: torch.dtype = torch.float16 + """Select between fp16/fp32 for float tensors """ + + +def worker_init_fn(worker_id): + np.random.seed(np.random.get_state()[1][0] + worker_id) + + +class UnitYDataLoader: + def __init__( + self, + text_tokenizer: NllbTokenizer, + unit_tokenizer: UnitTokenizer, + dataset_manifest_path: str, + batching_config: BatchingConfig, + ): + self.text_tokenizer = text_tokenizer + self.text_encoders_per_lang: Dict[str, TextTokenEncoder] = {} + self.unit_tokenizer = unit_tokenizer + self.unit_encoders_per_lang: Dict[str, UnitTokenEncoder] = {} + self.batching_config = batching_config + self.dataset = self._load_manifest(dataset_manifest_path) + + def get_dataloader(self) -> DataLoader: + subset = split_dataset_by_node( + self.dataset, + rank=self.batching_config.rank, + world_size=self.batching_config.world_size, + ) + data_loader = DataLoader( + dataset=subset, + batch_size=self.batching_config.batch_size, + shuffle=True, + num_workers=self.batching_config.num_workers, + collate_fn=self._prepare_batch, + worker_init_fn=worker_init_fn, + ) + return data_loader + + def __iter__(self) -> Iterable[MultimodalSeqsBatch]: + return self.get_dataloader().__iter__() + + def _get_source_fbank(self, sample: LangPairSample) -> Tensor: + audio_input = torchaudio.load(sample.source.audio_local_path)[0] + return ta_kaldi.fbank(audio_input, num_mel_bins=80) + + def _get_tokenized_target_text(self, sample: LangPairSample) -> Tensor: + """Expected sequence is [, , ..text tokens.., ]""" + target_lang = sample.target.lang + if target_lang not in self.text_encoders_per_lang: + self.text_encoders_per_lang[ + target_lang + ] = self.text_tokenizer.create_encoder(lang=target_lang, mode="target") + tokens = 
self.text_encoders_per_lang[target_lang](sample.target.text) + eos_idx = self.text_tokenizer.vocab_info.eos_idx + tokens = torch.concat([tokens, torch.LongTensor([eos_idx])]) + return tokens + + def _get_tokenized_units(self, sample: LangPairSample) -> Optional[Tensor]: + """Expected sequence is [, , ..unit tokens.., ]""" + if sample.target.units is None: + return None + target_lang = sample.target.lang + if target_lang not in self.unit_encoders_per_lang: + self.unit_encoders_per_lang[ + target_lang + ] = self.unit_tokenizer.create_encoder(lang=target_lang) + tokens = self.unit_encoders_per_lang[target_lang]( + torch.LongTensor(sample.target.units).unsqueeze(0) + ) + eos_idx = self.unit_tokenizer.vocab_info.eos_idx + tokens = torch.concat([tokens.squeeze(0), torch.LongTensor([eos_idx])]) + return tokens + + def _batch_tensors(self, tensors: List[Tensor], pad_value: Any) -> Tensor: + padding_size = max(tensor.shape[0] for tensor in tensors) + dims = len(tensors[0].shape) + padded_tensors = [] + for tensor in tensors: + padding = [0] * 2 * dims + padding[-1] = padding_size - tensor.shape[0] + padded_tensors.append(pad_tensor(tensor, padding, "constant", pad_value)) + return torch.stack([tensor for tensor in padded_tensors], dim=0) + + def _prepare_batch(self, raw_samples: List[Dict[str, Any]]) -> MultimodalSeqsBatch: + samples = [LangPairSample.from_json(sample) for sample in raw_samples] + # input speech + src_tokens_list = [self._get_source_fbank(sample) for sample in samples] + src_tokens = self._batch_tensors( + src_tokens_list, pad_value=self.batching_config.fbank_feats_pad_idx + ).to(self.batching_config.float_dtype) + src_lengths = torch.LongTensor( + [src_tokens.shape[0] for src_tokens in src_tokens_list] + ) + # output text + text_tokens_list = [ + self._get_tokenized_target_text(sample) for sample in samples + ] + text_pad_idx = self.text_tokenizer.vocab_info.pad_idx + prev_outputs_tokens = self._batch_tensors( + [tokens[:-1] for tokens in text_tokens_list], pad_value=text_pad_idx + ) + target_tokens = self._batch_tensors( + [tokens[1:] for tokens in text_tokens_list], pad_value=text_pad_idx + ) + tokens_lengths = torch.LongTensor( + [tokens.shape[0] - 1 for tokens in text_tokens_list] + ) + # output units + units_list_raw = [self._get_tokenized_units(sample) for sample in samples] + if None in units_list_raw: + prev_outputs_units = None + target_units = None + units_lengths = None + else: + units_list: List[Tensor] = [ + value for value in units_list_raw if value is not None + ] + units_pad_idx = self.unit_tokenizer.vocab_info.pad_idx + prev_outputs_units = self._batch_tensors( + [tokens[:-1] for tokens in units_list], pad_value=units_pad_idx + ) + target_units = self._batch_tensors( + [tokens[1:] for tokens in units_list], pad_value=units_pad_idx + ) + units_lengths = torch.LongTensor( + [tokens.shape[0] - 1 for tokens in units_list] + ) + return MultimodalSeqsBatch( + speech_to_text=SeqsBatch( + src_tokens=src_tokens, + src_lengths=src_lengths, + target_tokens=target_tokens, + prev_output_tokens=prev_outputs_tokens, + target_lengths=tokens_lengths, + ), + text_to_units=SeqsBatch( + src_tokens=None, + src_lengths=None, + target_tokens=target_units, + prev_output_tokens=prev_outputs_units, + target_lengths=units_lengths, + ), + ) + + def _load_manifest(self, dataset_manifest_path: str) -> Dataset: + with open(dataset_manifest_path) as fp_in: + dataset = [json.loads(line) for line in fp_in] + return Dataset.from_list(dataset) diff --git 
a/seamless_communication/src/seamless_communication/cli/m4t/finetune/dataset.py b/seamless_communication/src/seamless_communication/cli/m4t/finetune/dataset.py new file mode 100644 index 0000000..9b4de54 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/finetune/dataset.py @@ -0,0 +1,200 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + + +import argparse +import dataclasses +import json +import logging +import os +from pathlib import Path + +import torch + +from seamless_communication.datasets.huggingface import ( + Speech2SpeechFleursDatasetBuilder, + SpeechTokenizer, +) +from seamless_communication.models.unit_extractor import UnitExtractor + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger("dataset") + + +# Full list of FLEURS langcodes is available at https://huggingface.co/datasets/google/fleurs +# Full list of M4T langcodes is available +# in paper "SeamlessM4T—Massively Multilingual & Multimodal Machine Translation" (Table 5) +UNITY_TO_FLEURS_LANG_MAPPING = { + "eng": "en_us", + "ita": "it_it", + "afr": "af_za", + "asm": "as_in", + "bel": "be_by", + "bul": "bg_bg", + "ben": "bn_in", + "cat": "ca_es", + "ces": "cs_cz", + "dan": "da_dk", + "deu": "de_de", + "ell": "el_gr", + "fin": "fi_fi", + "fra": "fr_fr", + "glg": "gl_es", + "heb": "he_il", + "hin": "hi_in", + "hrv": "hr_hr", + "hun": "hu_hu", + "ind": "id_id", + "ibo": "ig_ng", + "isl": "is_is", + "ita": "it_it", + "jpn": "ja_jp", + "jav": "jv_id", + "kaz": "kk_kz", + "kan": "kn_in", + "kir": "ky_kg", + "kor": "ko_kr", + "lit": "lt_lt", + "mkd": "mk_mk", + "mlt": "mt_mt", + "mya": "my_mm", + "nld": "nl_nl", + "pan": "pa_in", + "pol": "pl_pl", + "ron": "ro_ro", + "rus": "ru_ru", + "snd": "sd_in", + "slk": "sk_sk", + "srp": "sr_rs", + "swh": "sw_ke", + "tam": "ta_in", + "tel": "te_in", + "tha": "th_th", + "tur": "tr_tr", + "ukr": "uk_ua", + "urd": "ur_pk", + "uzn": "uz_uz", + "vie": "vi_vn", + "yor": "yo_ng", + "zul": "zu_za", +} + + +def _check_lang_code_mapping(lang: str) -> None: + if lang not in UNITY_TO_FLEURS_LANG_MAPPING: + raise ValueError( + f"No language code mapping for {lang}(M4T)->??(FLEURs). 
" + "Please expand `UNITY_TO_FLEURS_LANG_MAPPING`" + ) + + +class UnitSpeechTokenizer(SpeechTokenizer): + MODEL_NAME = "xlsr2_1b_v2" + KMEANS_MODEL_URI = "https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/kmeans_10k.npy" + OUTPUT_LAYER_IDX = 34 + + def __init__(self, device: torch.device): + super().__init__() + self.device = device + self.unit_extractor = UnitExtractor( + model_name_or_card=self.MODEL_NAME, + kmeans_uri=self.KMEANS_MODEL_URI, + device=self.device, + ) + + def encode(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor: + return self.unit_extractor.predict( + wav.to(self.device), + out_layer_idx=self.OUTPUT_LAYER_IDX, + sample_rate=sample_rate, + ) + + +def download_fleurs_dataset( + source_lang: str, + target_lang: str, + split: str, + save_directory: str, +) -> str: + _check_lang_code_mapping(source_lang) + _check_lang_code_mapping(target_lang) + device = ( + torch.device("cuda:0") if torch.cuda.device_count() > 0 else torch.device("cpu") + ) + tokenizer = UnitSpeechTokenizer(device=device) + dataset_iterator = Speech2SpeechFleursDatasetBuilder( + source_lang=UNITY_TO_FLEURS_LANG_MAPPING[source_lang], + target_lang=UNITY_TO_FLEURS_LANG_MAPPING[target_lang], + dataset_cache_dir=save_directory, + speech_tokenizer=tokenizer, + skip_source_audio=True, # don't extract units from source audio + skip_target_audio=False, + split=split, + ) + manifest_path: str = os.path.join(save_directory, f"{split}_manifest.json") + with open(manifest_path, "w") as fp_out: + for idx, sample in enumerate(dataset_iterator.__iter__(), start=1): + # correction as FleursDatasetBuilder return fleurs lang codes + sample.source.lang = source_lang + sample.target.lang = target_lang + sample.target.waveform = None # already extracted units + fp_out.write(json.dumps(dataclasses.asdict(sample)) + "\n") + logger.info(f"Saved {idx} samples for split={split} to {manifest_path}") + return manifest_path + + +def init_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=( + "Helper script to download training/evaluation dataset (FLEURS)," + "extract units from target audio and save the dataset as a manifest " + "consumable by `finetune.py`." + ) + ) + parser.add_argument( + "--source_lang", + type=str, + required=True, + help="M4T langcode of the dataset SOURCE language", + ) + parser.add_argument( + "--target_lang", + type=str, + required=True, + help="M4T langcode of the dataset TARGET language", + ) + parser.add_argument( + "--split", + type=str, + required=True, + help="Dataset split/shard to download (`train`, `validation`, `test`)", + ) + parser.add_argument( + "--save_dir", + type=Path, + required=True, + help="Directory where the datastets will be stored with HuggingFace datasets cache files", + ) + return parser + + +def main() -> None: + args = init_parser().parse_args() + manifest_path = download_fleurs_dataset( + source_lang=args.source_lang, + target_lang=args.target_lang, + split=args.split, + save_directory=args.save_dir, + ) + logger.info(f"Manifest saved to: {manifest_path}") + + +if __name__ == "__main__": + main() diff --git a/seamless_communication/src/seamless_communication/cli/m4t/finetune/dist_utils.py b/seamless_communication/src/seamless_communication/cli/m4t/finetune/dist_utils.py new file mode 100644 index 0000000..017952e --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/finetune/dist_utils.py @@ -0,0 +1,76 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + + +import logging +import os +from datetime import timedelta +from typing import List + +import torch +import torch.distributed as dist +import torch.multiprocessing + +logger = logging.getLogger(__name__) + + +def is_dist_initialized() -> bool: + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_rank() -> int: + if not is_dist_initialized(): + return 0 + return dist.get_rank() + + +def get_local_rank() -> int: + if not is_dist_initialized(): + return 0 + return int(os.environ["LOCAL_RANK"]) + + +def get_world_size() -> int: + if not is_dist_initialized(): + return 1 + return dist.get_world_size() + + +def is_main_process() -> bool: + return get_rank() == 0 + + +def init_distributed(loggers: List[logging.Logger]) -> None: + """Initializes the distributed backend""" + torch.multiprocessing.set_start_method("spawn") + if "RANK" not in os.environ: + logger.error( + "Cannot init distributed context, as environment variables are not set." + ) + return + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + local_rank = int(os.environ["LOCAL_RANK"]) + logger.info( + f"Rank={rank} local rank={local_rank}, world_size={world_size}, is_master={rank == 0}" + ) + dist.init_process_group( + backend="nccl", + init_method="env://", + world_size=world_size, + rank=rank, + timeout=timedelta(seconds=180), + ) + logger.info(f"Setting cuda:{local_rank} as main device") + if not is_main_process(): + for to_mute in loggers: + to_mute.setLevel(logging.ERROR) + torch.cuda.set_device(local_rank) + dist.barrier() diff --git a/seamless_communication/src/seamless_communication/cli/m4t/finetune/finetune.py b/seamless_communication/src/seamless_communication/cli/m4t/finetune/finetune.py new file mode 100644 index 0000000..903a467 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/finetune/finetune.py @@ -0,0 +1,183 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree.
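# NOTE (illustrative usage, not part of the original file): this script relies on the
# torch.distributed environment variables (RANK, WORLD_SIZE, LOCAL_RANK) that launchers
# such as torchrun provide (see dist_utils.init_distributed). A single-GPU launch could
# therefore look roughly like the sketch below; the manifest paths are placeholders
# pointing at files produced by the companion dataset.py script:
#
#   torchrun --standalone --nnodes=1 --nproc-per-node=1 \
#       -m seamless_communication.cli.m4t.finetune.finetune \
#       --train_dataset <save_dir>/train_manifest.json \
#       --eval_dataset <save_dir>/validation_manifest.json \
#       --model_name seamlessM4T_medium \
#       --save_model_to <save_dir>/checkpoint.pt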
+ +import argparse +import logging +import os +from pathlib import Path + +import torch +from fairseq2.models.nllb.tokenizer import NllbTokenizer + +from seamless_communication.cli.m4t.finetune import dataloader, dist_utils, trainer +from seamless_communication.models.unity import ( + UnitTokenizer, + UnitYModel, + load_unity_model, + load_unity_text_tokenizer, + load_unity_unit_tokenizer, +) + +logging.basicConfig( + level=logging.INFO, + format=f"%(asctime)s %(levelname)s -- %(name)s.{os.getpid()}: %(message)s", +) + +logger = logging.getLogger("finetune") + + +def init_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Example finetuning script for M4T models" + ) + parser.add_argument( + "--train_dataset", + type=Path, + required=True, + help="Path to manifest with train samples", + ) + parser.add_argument( + "--eval_dataset", + type=Path, + required=True, + help="Path to manifest with eval samples", + ) + parser.add_argument( + "--model_name", + type=str, + default="seamlessM4T_medium", + help="Base model name (`seamlessM4T_medium`, `seamlessM4T_large`)", + ) + parser.add_argument( + "--save_model_to", + type=Path, + required=True, + help="Path to save best finetuned model", + ) + parser.add_argument( + "--seed", + type=int, + default=2343, + help="Randomizer seed value", + ) + parser.add_argument( + "--batch_size", + type=int, + default=5, + help="Batch size for training and evaluation", + ) + parser.add_argument( + "--patience", + type=int, + default=3, + help=( + "Set early termination after `patience` number of evaluations " + "without eval loss improvements" + ), + ) + parser.add_argument( + "--max_epochs", + type=int, + default=10, + help=("Max number of training epochs"), + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-7, + help=("Finetuning learning rate"), + ) + parser.add_argument( + "--warmup_steps", + type=int, + default=100, + help=("Number of steps with linearly increasing learning rate"), + ) + parser.add_argument( + "--eval_steps", + type=int, + default=50, + help=("Get eval loss after each `eval_steps` training steps "), + ) + parser.add_argument( + "--log_steps", + type=int, + default=10, + help=("Log inner loss after each `log_steps` training steps"), + ) + parser.add_argument( + "--mode", + type=trainer.FinetuneMode, + choices=list(trainer.FinetuneMode), + default=trainer.FinetuneMode.SPEECH_TO_TEXT, + help=( + "* `SPEECH_TO_SPEECH` -- finetune S2T and T2U parts of the model; " + "* `TEXT_TO_SPEECH` -- finetune only T2U; " + "* `SPEECH_TO_TEXT` -- finetune only S2T" + ), + ) + return parser + + +def main() -> None: + args = init_parser().parse_args() + dist_utils.init_distributed([logger, trainer.logger]) + device = torch.device("cuda") + text_tokenizer: NllbTokenizer = load_unity_text_tokenizer(args.model_name) + unit_tokenizer: UnitTokenizer = load_unity_unit_tokenizer(args.model_name) + finetune_params = trainer.FinetuneParams( + finetune_mode=args.mode, + save_model_path=args.save_model_to, + device=device, + train_batch_size=args.batch_size, + eval_batch_size=args.batch_size, + patience=args.patience, + max_epochs=args.max_epochs, + learning_rate=args.learning_rate, + warmup_steps=args.warmup_steps, + eval_steps=args.eval_steps, + log_steps=args.log_steps, + ) + logger.info(f"Finetune params: {finetune_params}") + model: UnitYModel = load_unity_model( + args.model_name, device=finetune_params.device, dtype=torch.float16 + ) + logger.info(f"Model {model}") + assert model.target_vocab_info == 
text_tokenizer.vocab_info + assert model.t2u_model is not None + assert model.t2u_model.target_vocab_info == unit_tokenizer.vocab_info + + train_dataloader = dataloader.UnitYDataLoader( + text_tokenizer=text_tokenizer, + unit_tokenizer=unit_tokenizer, + batching_config=dataloader.BatchingConfig( + batch_size=finetune_params.train_batch_size, + rank=dist_utils.get_rank(), + world_size=dist_utils.get_world_size(), + ), + dataset_manifest_path=args.train_dataset, + ) + eval_dataloader = dataloader.UnitYDataLoader( + text_tokenizer=text_tokenizer, + unit_tokenizer=unit_tokenizer, + batching_config=dataloader.BatchingConfig( + batch_size=finetune_params.eval_batch_size, + rank=dist_utils.get_rank(), + world_size=dist_utils.get_world_size(), + ), + dataset_manifest_path=args.eval_dataset, + ) + finetune = trainer.UnitYFinetune( + model=model, + params=finetune_params, + train_data_loader=train_dataloader, + eval_data_loader=eval_dataloader, + ) + finetune.run() + + +if __name__ == "__main__": + main() diff --git a/seamless_communication/src/seamless_communication/cli/m4t/finetune/trainer.py b/seamless_communication/src/seamless_communication/cli/m4t/finetune/trainer.py new file mode 100644 index 0000000..2ce8de8 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/finetune/trainer.py @@ -0,0 +1,374 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + + +import logging +from contextlib import contextmanager +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Optional, Tuple + +import torch +import torch.distributed as dist +import torch.nn as nn +from fairseq2.data import VocabularyInfo +from fairseq2.models.sequence import SequenceModelOutput +from fairseq2.nn.padding import PaddingMask +from fairseq2.optim.lr_scheduler import MyleLR +from fairseq2.typing import Device +from torch.optim import Adam + +from seamless_communication.cli.m4t.finetune import dataloader, dist_utils +from seamless_communication.models.unity import UnitYModel + +logger = logging.getLogger(__name__) + + +class FinetuneMode(Enum): + SPEECH_TO_SPEECH = "SPEECH_TO_SPEECH" + SPEECH_TO_TEXT = "SPEECH_TO_TEXT" + TEXT_TO_SPEECH = "TEXT_TO_SPEECH" + + +@dataclass +class FinetuneParams: + save_model_path: Path + """Path were to save finetuned model.""" + + finetune_mode: FinetuneMode = FinetuneMode.TEXT_TO_SPEECH + """Allows to freeze S2T or T2U part of the model""" + + max_epochs: int = 10 + """ Maximum number of trainign epochs""" + + label_smoothing: float = 0.2 + """ Label smoothing coefficient for nll_loss """ + + warmup_steps: int = 100 + """ Number of steps with linearly increasing LR""" + + log_steps: int = 10 + """ Log inner loss after each `log_steps` training steps""" + + eval_steps: int = 50 + """ Get eval loss after each `eval_steps` training steps """ + + patience: int = 3 + """ Terminate if eval loss did not improve + over the last `patience * eval_steps` training steps""" + + learning_rate: float = 1e-5 + """ Optimizer learining rate """ + + train_batch_size: int = 5 + """The batch size during train steps""" + + eval_batch_size: int = 5 + """The batch size during evaluation.""" + + device: Device = torch.device("cuda") + """ Where to run computation""" + + +class UnitYFinetuneWrapper(nn.Module): + """Convenience wrapper that does a forward pass + and returns S2T and T2U logits""" + + 
def __init__(self, model: UnitYModel, mode: FinetuneMode, device: Device): + super().__init__() + assert model.t2u_model is not None + self.model: UnitYModel = model + self.freeze_s2t: bool = mode == FinetuneMode.TEXT_TO_SPEECH + self.freeze_t2u: bool = mode == FinetuneMode.SPEECH_TO_TEXT + self.device = device + + def forward( + self, batch: dataloader.MultimodalSeqsBatch + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + assert self.model.t2u_model is not None + dummy_context = contextmanager(lambda: iter([None]))() + with torch.no_grad() if self.freeze_s2t else dummy_context: # type:ignore + assert batch.speech_to_text.src_tokens is not None + seqs = batch.speech_to_text.src_tokens.to(self.device) + seq_lens = batch.speech_to_text.src_lengths.to(self.device) + speech_encoder_out, speech_encoder_padding_mask = self.model.encode_speech( + seqs=seqs, padding_mask=PaddingMask(seq_lens, seqs.size(1)) + ) + assert batch.speech_to_text.prev_output_tokens is not None + seqs = batch.speech_to_text.prev_output_tokens.to(self.device) + seq_lens = batch.speech_to_text.target_lengths.to(self.device) + text_decoder_out, text_decoder_padding_mask = self.model.decode( + seqs=seqs, + padding_mask=PaddingMask(seq_lens, seqs.size(1)), + encoder_output=speech_encoder_out, + encoder_padding_mask=speech_encoder_padding_mask, + ) + text_logits = self.model.final_proj(text_decoder_out) + if batch.text_to_units.prev_output_tokens is None: + return (text_logits, None) + dummy_context = contextmanager(lambda: iter([None]))() + with torch.no_grad() if self.freeze_t2u else dummy_context: # type:ignore + ( + unit_encoder_out, + unit_encoder_padding_mask, + ) = self.model.t2u_model.encode( + text_decoder_output=text_decoder_out, + text_decoder_padding_mask=text_decoder_padding_mask, + ) + seqs = batch.text_to_units.prev_output_tokens.to(self.device) + seq_lens = batch.text_to_units.target_lengths.to(self.device) + unit_decoder_out, _ = self.model.t2u_model.decode( + seqs=seqs, + padding_mask=PaddingMask(seq_lens, seqs.size(1)), + encoder_output=unit_encoder_out, + encoder_padding_mask=unit_encoder_padding_mask, + ) + unit_logits = self.model.t2u_model.final_proj(unit_decoder_out) + + return (text_logits, unit_logits) + + +class CalcLoss: + """Calculates negative log likelihood loss for S2T and T2U""" + + def __init__( + self, + label_smoothing: float, + s2t_vocab_info: VocabularyInfo, + t2u_vocab_info: VocabularyInfo, + ): + self.label_smoothing = label_smoothing + self.s2t_vocab_info = s2t_vocab_info + self.t2u_vocab_info = t2u_vocab_info + + def __call__( + self, + batch: dataloader.MultimodalSeqsBatch, + text_logits: torch.Tensor, + unit_logits: Optional[torch.Tensor], + ) -> torch.Tensor: + assert batch.speech_to_text.target_lengths is not None + s2t_numel = torch.sum(batch.speech_to_text.target_lengths).to( + text_logits.device + ) + s2t_loss = SequenceModelOutput( + logits=text_logits, vocab_info=self.s2t_vocab_info + ).compute_loss( + targets=batch.speech_to_text.target_tokens.to(text_logits.device), + ignore_prefix_size=1, + label_smoothing=self.label_smoothing, + ) + if unit_logits is None: + return s2t_loss / s2t_numel + assert batch.text_to_units.target_lengths is not None + s2u_numel = torch.sum(batch.text_to_units.target_lengths).to(unit_logits.device) + s2u_loss = SequenceModelOutput( + logits=unit_logits, vocab_info=self.t2u_vocab_info + ).compute_loss( + targets=batch.text_to_units.target_tokens.to(unit_logits.device), + ignore_prefix_size=1, + label_smoothing=self.label_smoothing, + ) + return 
s2t_loss / s2t_numel + s2u_loss / s2u_numel + + +class LossCollector: + """Aggregrates loss history across nodes""" + + def __init__(self, device: Optional[Device] = None, reduce_op: str = "avg"): + self.n_samples: float = 0 + self.val_sum: float = 0.0 + self.reduce_op = reduce_op + self.device = device + self.is_distributed = dist_utils.is_dist_initialized() + + def reset(self) -> None: + self.n_samples = 0 + self.val_sum = 0.0 + + def update(self, n_samples: int, batch_loss: float) -> None: + self.n_samples += n_samples + self.val_sum += batch_loss + + def reduce(self) -> float: + n_samples, val_sum = self._collect() + if self.reduce_op == "avg": + return val_sum / (n_samples + 1) + if self.reduce_op == "sum": + return val_sum + raise ValueError() + + def _collect(self) -> Tuple[float, float]: + if not self.is_distributed: + return self.n_samples, self.val_sum + local_val = torch.tensor([[self.n_samples, self.val_sum]], device=self.device) + all_vals = [ + torch.zeros((1, 2), device=self.device) + for _ in range(dist_utils.get_world_size()) + ] + dist.all_gather(all_vals, local_val) + losses = torch.concat(all_vals, dim=0) + reduced = torch.sum(losses, dim=0).reshape(2).cpu() + return reduced[0].item(), reduced[1].item() + + +class UnitYFinetune: + def __init__( + self, + model: UnitYModel, + params: FinetuneParams, + train_data_loader: dataloader.UnitYDataLoader, + eval_data_loader: Optional[dataloader.UnitYDataLoader] = None, + ): + self.params = params + + assert model.t2u_model is not None + self.calc_loss = CalcLoss( + label_smoothing=self.params.label_smoothing, + s2t_vocab_info=model.target_vocab_info, + t2u_vocab_info=model.t2u_model.target_vocab_info, + ) + self.model = self._wrap_model_for_trainining(model=model) + self.train_data_loader = train_data_loader + self.eval_data_loader = eval_data_loader + self.optimizer = Adam( + params=self.model.parameters(), + lr=self.params.learning_rate, + betas=(0.9, 0.98), + eps=1e-08, + maximize=False, + weight_decay=0.0, + fused=True, + ) + self.grad_scaler = torch.cuda.amp.GradScaler() + self.lr_scheduler = MyleLR( + optimizer=self.optimizer, + num_warmup_steps=self.params.warmup_steps, + start_lr=1e-9, + ) + + self.train_loss_hist = LossCollector(device=params.device) + self.epoch_idx: int = 0 + self.update_idx: int = 0 + self.patience_left: int = self.params.patience + self.best_eval_loss: Optional[float] = None + self.is_best_state: bool = False + + def _reset_stats(self) -> None: + self.train_loss_hist.reset() + self.epoch_idx = 0 + self.update_idx = 0 + self.patience_left = self.params.patience + self.best_eval_loss = None + self.is_best_state = False + + def _wrap_model_for_trainining(self, model: UnitYModel) -> nn.Module: + wrapped_model = UnitYFinetuneWrapper( + model=model, mode=self.params.finetune_mode, device=self.params.device + ) + if not dist_utils.is_dist_initialized(): + return wrapped_model + return nn.parallel.DistributedDataParallel( + wrapped_model, + device_ids=[dist_utils.get_local_rank()], + find_unused_parameters=True, + ) + + def _update_eval_stats(self, eval_loss: float) -> None: + self.is_best_state = ( + self.best_eval_loss is None or eval_loss < self.best_eval_loss + ) + self.best_eval_loss = eval_loss if self.is_best_state else self.best_eval_loss + self.patience_left = ( + self.params.patience if self.is_best_state else self.patience_left - 1 + ) + logger.info( + f"Eval after {self.update_idx} updates: " + f"loss={eval_loss:.4f} " + f"best_loss={self.best_eval_loss:.4f} " + 
f"patience_steps_left={self.patience_left}" + ) + + def _eval_model(self) -> None: + """Calc avg loss on eval dataset and update evaluation stats""" + if self.eval_data_loader is None: + return + logger.info("Run evaluation") + loss_hist = LossCollector(device=self.params.device) + self.model.eval() + with torch.no_grad(): + for batch in self.eval_data_loader.get_dataloader(): + assert batch.speech_to_text.src_tokens is not None + loss = self.calc_loss(batch, *self.model(batch)) + if loss.isnan(): + logger.warning("Eval loss value is NaN, setting to inf") + loss_val = float("Inf") + else: + loss_val = loss.item() + del batch # force memory release + loss_hist.update(1, loss_val) + eval_loss = loss_hist.reduce() + self._update_eval_stats(eval_loss) + + def _train_step_log(self): + """Log train stats""" + if (self.update_idx + 1) % self.params.log_steps == 0: + avg_loss = self.train_loss_hist.reduce() + self.train_loss_hist.reset() + logger.info( + f"Epoch {str(self.epoch_idx + 1).zfill(3)} / " + f"update {str(self.update_idx + 1).zfill(5)}: " + f"train loss={avg_loss:.4f} " + f"last lr={self.lr_scheduler.get_last_lr()[0]:.2E}" + ) + + def _train_step(self, batch: dataloader.MultimodalSeqsBatch) -> None: + """Run one train step""" + self.model.train() + self.optimizer.zero_grad() + tokens, units = self.model(batch) + loss = self.calc_loss(batch, tokens, units) + self.grad_scaler.scale(loss).backward() + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() + self.lr_scheduler.step() + assert batch.speech_to_text.src_tokens is not None + self.train_loss_hist.update(1, loss.item()) + self._train_step_log() + + def _save_model(self): + logger.info("Saving model") + if dist_utils.is_main_process(): + state_dict = { + key.replace("module.model.", ""): value + for key, value in self.model.state_dict().items() + } + torch.save(state_dict, self.params.save_model_path) + if dist_utils.is_dist_initialized(): + dist.barrier() + + def run(self): + logger.info("Start finetuning") + self._reset_stats() + self._eval_model() + batch_itr = self.train_data_loader.get_dataloader() + while self.epoch_idx < self.params.max_epochs and self.patience_left: + for train_batch in batch_itr: + self._train_step(batch=train_batch) + if self.update_idx and self.update_idx % self.params.eval_steps == 0: + self._eval_model() + if self.is_best_state: + self._save_model() + elif not self.patience_left: + no_improve_steps = self.params.eval_steps * self.params.patience + logger.info( + "Early termination, as eval loss did not improve " + f"over last {no_improve_steps} updates" + ) + break + self.update_idx += 1 + self.epoch_idx += 1 diff --git a/seamless_communication/src/seamless_communication/cli/m4t/predict/README.md b/seamless_communication/src/seamless_communication/cli/m4t/predict/README.md new file mode 100644 index 0000000..5235365 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/predict/README.md @@ -0,0 +1,126 @@ +# Inference with SeamlessM4T models +Refer to the [SeamlessM4T README](../../../../../docs/m4t) for an overview of the M4T models. + +Inference is run with the CLI, from the root directory of the repository. 
+ +The model can be specified with `--model_name` `seamlessM4T_v2_large`, `seamlessM4T_large` or `seamlessM4T_medium`: + +**S2ST**: +```bash +m4t_predict <path_to_input_audio> --task s2st --tgt_lang <tgt_lang> --output_path <path_to_save_audio> --model_name seamlessM4T_v2_large +``` + +**S2TT:** +```bash +m4t_predict <path_to_input_audio> --task s2tt --tgt_lang <tgt_lang> --model_name seamlessM4T_v2_large +``` + +**T2TT:** +```bash +m4t_predict <input_text> --task t2tt --tgt_lang <tgt_lang> --src_lang <src_lang> --model_name seamlessM4T_v2_large +``` + +**T2ST:** +```bash +m4t_predict <input_text> --task t2st --tgt_lang <tgt_lang> --src_lang <src_lang> --output_path <path_to_save_audio> --model_name seamlessM4T_v2_large +``` + +**ASR:** +```bash +m4t_predict <path_to_input_audio> --task asr --tgt_lang <tgt_lang> --model_name seamlessM4T_v2_large +``` +## Inference breakdown + +Inference calls for the `Translator` object instantiated with a multitask UnitY or UnitY2 model with the options: +- [`seamlessM4T_v2_large`](https://huggingface.co/facebook/seamless-m4t-v2-large) +- [`seamlessM4T_large`](https://huggingface.co/facebook/seamless-m4t-large) +- [`seamlessM4T_medium`](https://huggingface.co/facebook/seamless-m4t-medium) + +and a vocoder: +- `vocoder_v2` for `seamlessM4T_v2_large`. +- `vocoder_36langs` for `seamlessM4T_large` or `seamlessM4T_medium`. + +```python +import torch +from seamless_communication.inference import Translator + + +# Initialize a Translator object with a multitask model, vocoder on the GPU. +translator = Translator("seamlessM4T_large", "vocoder_36langs", torch.device("cuda:0"), torch.float16) +``` + +Now `predict()` can be used to run inference as many times as needed on any of the supported tasks. + +Given an input audio with `<path_to_input_audio>` or an input text `<input_text>` in `<src_lang>`, +we first set the `text_generation_opts`, `unit_generation_opts` and then translate into `<tgt_lang>` as follows: + +**S2ST and T2ST (speech output):** + +```python +# S2ST +text_output, speech_output = translator.predict( + input=<path_to_input_audio>, + task_str="S2ST", + tgt_lang=<tgt_lang>, + text_generation_opts=text_generation_opts, + unit_generation_opts=unit_generation_opts +) + +# T2ST +text_output, speech_output = translator.predict( + input=<input_text>, + task_str="T2ST", + tgt_lang=<tgt_lang>, + src_lang=<src_lang>, + text_generation_opts=text_generation_opts, + unit_generation_opts=unit_generation_opts +) +``` +Note that `<src_lang>` must be specified for T2ST. + +The generated units are synthesized and the output audio file is saved with: + +```python +# Save the translated audio output: +import torchaudio +torchaudio.save( + <path_to_save_audio>, + speech_output.audio_wavs[0][0].cpu(), + sample_rate=speech_output.sample_rate, +) +``` +**S2TT, T2TT and ASR (text output):** + +```python +# S2TT +text_output, _ = translator.predict( + input=<path_to_input_audio>, + task_str="S2TT", + tgt_lang=<tgt_lang>, + text_generation_opts=text_generation_opts, + unit_generation_opts=None +) + +# ASR +# This is equivalent to S2TT with `<tgt_lang>=<src_lang>`. +text_output, _ = translator.predict( + input=<path_to_input_audio>, + task_str="ASR", + tgt_lang=<tgt_lang>, + text_generation_opts=text_generation_opts, + unit_generation_opts=None +) + +# T2TT +text_output, _ = translator.predict( + input=<input_text>, + task_str="T2TT", + tgt_lang=<tgt_lang>, + src_lang=<src_lang>, + text_generation_opts=text_generation_opts, + unit_generation_opts=None +) +``` +Note that `<src_lang>` must be specified for T2TT. diff --git a/seamless_communication/src/seamless_communication/cli/m4t/predict/__init__.py b/seamless_communication/src/seamless_communication/cli/m4t/predict/__init__.py new file mode 100644 index 0000000..1303ff6 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/predict/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved.
+# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from seamless_communication.cli.m4t.predict.predict import ( + add_inference_arguments as add_inference_arguments, +) +from seamless_communication.cli.m4t.predict.predict import ( + set_generation_opts as set_generation_opts, +) diff --git a/seamless_communication/src/seamless_communication/cli/m4t/predict/predict.py b/seamless_communication/src/seamless_communication/cli/m4t/predict/predict.py new file mode 100644 index 0000000..29e5915 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/m4t/predict/predict.py @@ -0,0 +1,247 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import argparse +import logging +from argparse import Namespace +from pathlib import Path +from typing import Tuple + +import torch +import torchaudio +from fairseq2.generation import NGramRepeatBlockProcessor + +from seamless_communication.inference import SequenceGeneratorOptions, Translator + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +def add_inference_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser.add_argument("--task", type=str, help="Task type") + parser.add_argument( + "--tgt_lang", type=str, help="Target language to translate/transcribe into." + ) + parser.add_argument( + "--src_lang", + type=str, + help="Source language, only required if input is text.", + default=None, + ) + parser.add_argument( + "--output_path", + type=Path, + help="Path to save the generated audio.", + default=None, + ) + parser.add_argument( + "--model_name", + type=str, + help=( + "Base model name (`seamlessM4T_medium`, " + "`seamlessM4T_large`, `seamlessM4T_v2_large`)" + ), + default="seamlessM4T_v2_large", + ) + parser.add_argument( + "--vocoder_name", + type=str, + help="Vocoder model name", + default="vocoder_v2", + ) + # Text generation args. + parser.add_argument( + "--text_generation_beam_size", + type=int, + help="Beam size for incremental text decoding.", + default=5, + ) + parser.add_argument( + "--text_generation_max_len_a", + type=int, + help="`a` in `ax + b` for incremental text decoding.", + default=1, + ) + parser.add_argument( + "--text_generation_max_len_b", + type=int, + help="`b` in `ax + b` for incremental text decoding.", + default=200, + ) + parser.add_argument( + "--text_generation_ngram_blocking", + type=bool, + help=( + "Enable ngram_repeat_block for incremental text decoding." + "This blocks hypotheses with repeating ngram tokens." + ), + default=False, + ) + parser.add_argument( + "--no_repeat_ngram_size", + type=int, + help="Size of ngram repeat block for both text & unit decoding.", + default=4, + ) + # Unit generation args. + parser.add_argument( + "--unit_generation_beam_size", + type=int, + help=( + "Beam size for incremental unit decoding" + "not applicable for the NAR T2U decoder." + ), + default=5, + ) + parser.add_argument( + "--unit_generation_max_len_a", + type=int, + help=( + "`a` in `ax + b` for incremental unit decoding" + "not applicable for the NAR T2U decoder." + ), + default=25, + ) + parser.add_argument( + "--unit_generation_max_len_b", + type=int, + help=( + "`b` in `ax + b` for incremental unit decoding" + "not applicable for the NAR T2U decoder." 
+ ), + default=50, + ) + parser.add_argument( + "--unit_generation_ngram_blocking", + type=bool, + help=( + "Enable ngram_repeat_block for incremental unit decoding." + "This blocks hypotheses with repeating ngram tokens." + ), + default=False, + ) + parser.add_argument( + "--unit_generation_ngram_filtering", + type=bool, + help=( + "If True, removes consecutive repeated ngrams" + "from the decoded unit output." + ), + default=False, + ) + parser.add_argument( + "--text_unk_blocking", + type=bool, + help=( + "If True, set penalty of UNK to inf in text generator " + "to block unk output." + ), + default=False, + ) + return parser + + +def set_generation_opts( + args: Namespace, +) -> Tuple[SequenceGeneratorOptions, SequenceGeneratorOptions]: + # Set text, unit generation opts. + text_generation_opts = SequenceGeneratorOptions( + beam_size=args.text_generation_beam_size, + soft_max_seq_len=( + args.text_generation_max_len_a, + args.text_generation_max_len_b, + ), + ) + if args.text_unk_blocking: + text_generation_opts.unk_penalty = torch.inf + if args.text_generation_ngram_blocking: + text_generation_opts.step_processor = NGramRepeatBlockProcessor( + ngram_size=args.no_repeat_ngram_size + ) + + unit_generation_opts = SequenceGeneratorOptions( + beam_size=args.unit_generation_beam_size, + soft_max_seq_len=( + args.unit_generation_max_len_a, + args.unit_generation_max_len_b, + ), + ) + if args.unit_generation_ngram_blocking: + unit_generation_opts.step_processor = NGramRepeatBlockProcessor( + ngram_size=args.no_repeat_ngram_size + ) + return text_generation_opts, unit_generation_opts + + +def main() -> None: + parser = argparse.ArgumentParser( + description="M4T inference on supported tasks using Translator." + ) + parser.add_argument("input", type=str, help="Audio WAV file path or text input.") + + parser = add_inference_arguments(parser) + args = parser.parse_args() + if not args.task or not args.tgt_lang: + raise Exception( + "Please provide required arguments for evaluation - task, tgt_lang" + ) + + if args.task.upper() in {"S2ST", "T2ST"} and args.output_path is None: + raise ValueError("output_path must be provided to save the generated audio") + + if torch.cuda.is_available(): + device = torch.device("cuda:0") + dtype = torch.float16 + else: + device = torch.device("cpu") + dtype = torch.float32 + + logger.info(f"Running inference on {device=} with {dtype=}.") + + translator = Translator(args.model_name, args.vocoder_name, device, dtype=dtype) + + text_generation_opts, unit_generation_opts = set_generation_opts(args) + + logger.info(f"{text_generation_opts=}") + logger.info(f"{unit_generation_opts=}") + logger.info( + f"unit_generation_ngram_filtering={args.unit_generation_ngram_filtering}" + ) + + # If the input is audio, resample to 16kHz + if args.task.upper() in {"S2ST", "ASR", "S2TT"}: + wav, sample_rate = torchaudio.load(args.input) + translator_input = torchaudio.functional.resample( + wav, orig_freq=sample_rate, new_freq=16_000 + ) + else: + translator_input = args.input + + text_output, speech_output = translator.predict( + translator_input, + args.task, + args.tgt_lang, + src_lang=args.src_lang, + text_generation_opts=text_generation_opts, + unit_generation_opts=unit_generation_opts, + unit_generation_ngram_filtering=args.unit_generation_ngram_filtering, + ) + + if speech_output is not None: + logger.info(f"Saving translated audio in {args.tgt_lang}") + torchaudio.save( + args.output_path, + speech_output.audio_wavs[0][0].to(torch.float32).cpu(), + 
sample_rate=speech_output.sample_rate, + ) + logger.info(f"Translated text in {args.tgt_lang}: {text_output[0]}") + + +if __name__ == "__main__": + main() diff --git a/seamless_communication/src/seamless_communication/cli/streaming/README.md b/seamless_communication/src/seamless_communication/cli/streaming/README.md new file mode 100644 index 0000000..2c3f0b8 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/streaming/README.md @@ -0,0 +1,47 @@ +# Evaluating SeamlessStreaming and Seamless models +SeamlessStreaming is the streaming-only model, and Seamless is the expressive streaming model. + +## Quick start: + +Evaluation can be run with the `streaming_evaluate` CLI. + +We use `seamless_streaming_unity` for loading the speech encoder and T2U models, and `seamless_streaming_monotonic_decoder` for loading the text decoder for streaming evaluation. These are already set as the defaults for the `streaming_evaluate` CLI, but can be overridden using the `--unity-model-name` and `--monotonic-decoder-model-name` args if required. + +Note that the numbers in our paper use single precision floating point format (fp32) for evaluation by setting `--dtype fp32`. Also note that the results from running these evaluations might be slightly different from the results reported in our paper (which will be updated soon with the new results). + +### S2TT: +Set the task to `s2tt` for evaluating the speech-to-text translation part of the SeamlessStreaming model. + +```bash +streaming_evaluate --task s2tt --data-file <path_to_data_tsv> --audio-root-dir <path_to_audio_root_dir> --output <output_dir> --tgt-lang <3_letter_lang_code> +``` + +Note: The `--ref-field` can be used to specify the name of the reference column in the dataset. + +### ASR: +Set the task to `asr` for evaluating the automatic speech recognition part of the SeamlessStreaming model. Make sure to pass the source language as the `--tgt-lang` arg. + +```bash +streaming_evaluate --task asr --data-file <path_to_data_tsv> --audio-root-dir <path_to_audio_root_dir> --output <output_dir> --tgt-lang <3_letter_source_lang_code> +``` + +### S2ST: + +#### SeamlessStreaming: + +Set the task to `s2st` for evaluating the speech-to-speech translation part of the SeamlessStreaming model. + +```bash +streaming_evaluate --task s2st --data-file <path_to_data_tsv> --audio-root-dir <path_to_audio_root_dir> --output <output_dir> --tgt-lang <3_letter_lang_code> +``` + +#### Seamless: +The Seamless model is a unified model for streaming expressive speech-to-speech translation. Use the `--expressive` arg for running evaluation of this unified model. + +```bash +streaming_evaluate --task s2st --data-file <path_to_data_tsv> --audio-root-dir <path_to_audio_root_dir> --output <output_dir> --tgt-lang <3_letter_lang_code> --expressive --gated-model-dir <path_to_gated_assets_dir> +``` + +The Seamless model uses the 24kHz vocoder (`vocoder_pretssel`) by default. In the current version of our paper, we use the 16kHz version (`vocoder_pretssel_16khz`) for the evaluation, so in order to reproduce those results please add this arg to the above command: `--vocoder-name vocoder_pretssel_16khz`. + +The `vocoder_pretssel` and `vocoder_pretssel_16khz` checkpoints are gated; please check out [this section](/README.md#seamlessexpressive-models) to acquire these checkpoints. Also, make sure to add `--gated-model-dir <path_to_gated_assets_dir>`. diff --git a/seamless_communication/src/seamless_communication/cli/streaming/__init__.py b/seamless_communication/src/seamless_communication/cli/streaming/__init__.py new file mode 100644 index 0000000..15d8859 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/streaming/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc.
and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/src/seamless_communication/cli/streaming/evaluate.py b/seamless_communication/src/seamless_communication/cli/streaming/evaluate.py new file mode 100644 index 0000000..a9343c0 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/streaming/evaluate.py @@ -0,0 +1,104 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import argparse +import logging + +from fairseq2.assets import asset_store, download_manager + +from seamless_communication.cli.streaming.scorers.seamless_quality_scorer import ( + SeamlessQualityScorer as SeamlessQualityScorer, +) +from seamless_communication.streaming.agents.seamless_s2st import SeamlessS2STAgent +from seamless_communication.streaming.agents.seamless_streaming_s2st import ( + SeamlessStreamingS2STAgent, +) +from seamless_communication.streaming.agents.seamless_streaming_s2t import ( + SeamlessStreamingS2TAgent, +) + +from simuleval.cli import evaluate + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +def main() -> None: + parser = argparse.ArgumentParser( + add_help=False, + description="Streaming evaluation of Seamless UnitY models", + conflict_handler="resolve", + ) + + parser.add_argument( + "--task", + choices=["s2st", "s2tt", "asr"], + required=True, + type=str, + help="Task to evaluate (`s2st`, `s2tt` or `asr`).", + ) + parser.add_argument( + "--expressive", + action="store_true", + default=False, + help="Expressive streaming S2ST inference", + ) + + args, _ = parser.parse_known_args() + + model_configs = dict( + source_segment_size=320, + device="cuda:0", + dtype="fp16", + min_starting_wait_w2vbert=192, + decision_threshold=0.5, + no_early_stop=True, + max_len_a=0, + max_len_b=100, + ) + + eval_configs = dict(quality_metrics="SEAMLESS_QUALITY_SCORER") + if args.task == "s2st": + model_configs["min_unit_chunk_size"] = 50 + eval_configs["latency_metrics"] = "StartOffset EndOffset" + + if args.expressive: + agent_class = SeamlessS2STAgent + else: + agent_class = SeamlessStreamingS2STAgent + elif args.task in ["s2tt", "asr"]: + assert args.expressive is False, "S2TT inference cannot be expressive."
+ agent_class = SeamlessStreamingS2TAgent + parser.add_argument( + "--unity-model-name", + type=str, + help="Unity model name.", + default="seamless_streaming_unity", + ) + args, _ = parser.parse_known_args() + asset_card = asset_store.retrieve_card(name=args.unity_model_name) + tokenizer_uri = asset_card.field("tokenizer").as_uri() + tokenizer_path = download_manager.download_tokenizer( + tokenizer_uri, asset_card.name, force=False, progress=True + ) + eval_configs["latency_metrics"] = "AL LAAL" + eval_configs["eval_latency_unit"] = "spm" + eval_configs["eval_latency_spm_model"] = tokenizer_path + + base_config = dict( + dataloader="fairseq2_s2tt", + dataloader_class="seamless_communication.streaming.dataloaders.s2tt.SimulEvalSpeechToTextDataloader", + ) + + evaluate(agent_class, {**base_config, **model_configs, **eval_configs}, parser) + + +if __name__ == "__main__": + main() diff --git a/seamless_communication/src/seamless_communication/cli/streaming/scorers/__init__.py b/seamless_communication/src/seamless_communication/cli/streaming/scorers/__init__.py new file mode 100644 index 0000000..15d8859 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/streaming/scorers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/src/seamless_communication/cli/streaming/scorers/seamless_quality_scorer.py b/seamless_communication/src/seamless_communication/cli/streaming/scorers/seamless_quality_scorer.py new file mode 100644 index 0000000..d7c5846 --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/streaming/scorers/seamless_quality_scorer.py @@ -0,0 +1,136 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
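# Note for readers: this scorer registers itself with SimulEval under the name
# "SEAMLESS_QUALITY_SCORER" (see the decorator below), which is how the streaming
# evaluate.py CLI above selects it via `quality_metrics="SEAMLESS_QUALITY_SCORER"`.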
+ +from __future__ import annotations + +import json +from argparse import ArgumentParser, Namespace +from pathlib import Path +from typing import Dict, Optional + +import pandas +from fairseq2.typing import Device +from seamless_communication.cli.eval_utils import compute_quality_metrics +from simuleval.evaluator.instance import LogInstance +from simuleval.evaluator.scorers.quality_scorer import ( + QualityScorer, + register_quality_scorer, +) + + +@register_quality_scorer("SEAMLESS_QUALITY_SCORER") +class SeamlessQualityScorer(QualityScorer): # type: ignore + def __init__( + self, + tgt_lang: str, + task: str, + output_dir: str, + device: Device = "cuda:0", + whisper_model_name: str = "large", + whisper_normalize_text_output: Optional[bool] = None, + ref_text_col_name: str = "ref_tgt_text", + pred_text_col_name: str = "pred_tgt_text", + pred_audio_col_name: str = "pred_tgt_audio", + ) -> None: + super().__init__() + self.tgt_lang = tgt_lang + self.task = task.upper() + self.device = device + self.output_dir = Path(output_dir) + self.whisper_model_name = whisper_model_name + self.whisper_normalize_text_output = whisper_normalize_text_output + if self.whisper_normalize_text_output is None: + self.whisper_normalize_text_output = ( + False if self.task in ["S2TT", "S2ST", "T2TT"] else True + ) + self.ref_text_col_name = ref_text_col_name + self.pred_text_col_name = pred_text_col_name + self.pred_audio_col_name = pred_audio_col_name + + def __call__(self, instances: Dict[int, LogInstance]) -> float: + references = [ins.reference for ins in instances.values()] + df = pandas.DataFrame({self.ref_text_col_name: references}) + if self.task in ["ASR", "S2TT", "T2TT"]: + predictions = [ins.prediction for ins in instances.values()] + df[self.pred_text_col_name] = predictions + else: + predictions = [ins.prediction for ins in instances.values()] + df[self.pred_audio_col_name] = predictions + + df.to_csv( + self.output_dir / "results.tsv", + sep="\t", + quoting=3, + encoding="utf-8", + ) + filename = compute_quality_metrics( + self.output_dir / "results.tsv", + self.output_dir, + self.tgt_lang, + self.task, + self.device, + self.whisper_model_name, + self.whisper_normalize_text_output, + self.ref_text_col_name, + self.pred_text_col_name if self.task in ["ASR", "S2TT", "T2TT"] else None, + self.pred_audio_col_name, + ) + + with open(self.output_dir / filename, "r") as f: + corpus_metric_score = json.load(f)["score"] + + return corpus_metric_score # type: ignore[no-any-return] + + @staticmethod + def add_args(parser: ArgumentParser) -> None: + parser.add_argument("--task", type=str, help="Task to evaluate", required=True) + parser.add_argument( + "--tgt-lang", + type=str, + help="Target language to translate/transcribe into.", + required=True, + ) + parser.add_argument( + "--whisper-model-name", type=str, help="Whisper model name", default="large" + ) + parser.add_argument( + "--whisper-normalize-text-output", + action="store_true", + help="Normalize text output", + default=None, + ) + parser.add_argument( + "--ref-text-col-name", + type=str, + help="Reference text column name", + default="ref_tgt_text", + ) + parser.add_argument( + "--pred-text-col-name", + type=str, + help="Prediction text column name", + default="pred_tgt_text", + ) + parser.add_argument( + "--pred-audio-col-name", + type=str, + help="Prediction audio column name", + default="pred_tgt_audio", + ) + + @classmethod + def from_args(cls, args: Namespace) -> SeamlessQualityScorer: + return cls( + tgt_lang=args.tgt_lang, + task=args.task, + 
output_dir=args.output, + device=getattr(args, "device", "cpu"), + whisper_model_name=args.whisper_model_name, + whisper_normalize_text_output=args.whisper_normalize_text_output, + ref_text_col_name=args.ref_text_col_name, + pred_text_col_name=args.pred_text_col_name, + pred_audio_col_name=args.pred_audio_col_name, + ) diff --git a/seamless_communication/src/seamless_communication/cli/toxicity/README.md b/seamless_communication/src/seamless_communication/cli/toxicity/README.md new file mode 100644 index 0000000..398b4fb --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/toxicity/README.md @@ -0,0 +1,74 @@ +# Tool to compute toxicity in speech (ASR-ETOX) and text (ETOX) + +In this tool, we combine an ASR model (M4T or Whisper) with the ETOX toxicity detection tool +to compute a toxicity score for speech segments. + +ETOX was developed as part of the NLLB project and provides a wordlist detection mechanism for 200 languages. By applying ASR on top of the ETOX detection, we can detect toxicity in speech. You can find a description of the toxicity detection wordlists in the paper cited below. + +## ASR-ETOX Usage + +The script works by taking a TSV as input. The TSV needs a header with column names; it can have multiple columns. By default the script will look at the `audio` column for the name of the audio file to load; this can be overridden with `--audio_column`. +The file paths in the TSV can be absolute or relative to a root directory specified by `--audio_root_dir`. They can also point into audio zip files with the appropriate byte offset and length, e.g.: `fleurs_en_us_ogg_16khz.zip:89474600:49079`. + +You can choose the ASR model to use; by default it will use `seamlessM4T_v2_large`. If you prefer to use [whisper](https://github.com/openai/whisper) you can specify a `--model_name` that starts with `whisper_` and finishes with the whisper model name (e.g. `whisper_large`). + +## Outputs + +The output of the script is a new TSV file with three columns: +- `text` the transcription +- `toxicity` the number of toxic words detected +- `bad_words` a list of toxic words, separated by `,` + +## Sample Command + +**ASR-ETOX** + +- using M4T: +```bash +python -m seamless_communication.cli.toxicity.asr_etox --lang deu --audio_column ref_tgt_audio s2t/en-xx/deu.tsv ~/etox.tsv +``` + +- using Whisper: +```bash +python -m seamless_communication.cli.toxicity.asr_etox --model_name whisper_large --lang fra --audio_column ref_tgt_audio s2t/en-xx/fra.tsv ~/etox.test.tsv +``` + +**ETOX** + +If you only care about getting the toxicity of text, you can use the etox.py script, with one text per line, specifying the language as the first argument. + +```bash +cut -f 4 fleurs/s2t/en-xx/deu.tsv | python -m seamless_communication.cli.toxicity.etox deu > deu.toxicity.txt +``` + +You can also specify an input and output file: +```bash +python -m seamless_communication.cli.toxicity.etox deu deu.txt deu.toxicity.txt +``` + + +# Citation +If you use ETOX, ASR-ETOX and SeamlessM4T in your work, please cite: + + +```bibtex +@misc{costajussà2023toxicity, + title={Toxicity in Multilingual Machine Translation at Scale}, + author={Marta R.
Costa-jussà and Eric Smith and Christophe Ropers and Daniel Licht and Jean Maillard and Javier Ferrando and Carlos Escolano}, + year={2023}, + eprint={2210.03070}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +and + +```bibtex +@article{seamlessm4t2023, + title={SeamlessM4T—Massively Multilingual \& Multimodal Machine Translation}, + author={{Seamless Communication}, Lo\"{i}c Barrault, Yu-An Chung, Mariano Cora Meglioli, David Dale, Ning Dong, Paul-Ambroise Duquenne, Hady Elsahar, Hongyu Gong, Kevin Heffernan, John Hoffman, Christopher Klaiber, Pengwei Li, Daniel Licht, Jean Maillard, Alice Rakotoarison, Kaushik Ram Sadagopan, Guillaume Wenzek, Ethan Ye, Bapi Akula, Peng-Jen Chen, Naji El Hachem, Brian Ellis, Gabriel Mejia Gonzalez, Justin Haaheim, Prangthip Hansanti, Russ Howes, Bernie Huang, Min-Jae Hwang, Hirofumi Inaguma, Somya Jain, Elahe Kalbassi, Amanda Kallet, Ilia Kulikov, Janice Lam, Daniel Li, Xutai Ma, Ruslan Mavlyutov, Benjamin Peloquin, Mohamed Ramadan, Abinesh Ramakrishnan, Anna Sun, Kevin Tran, Tuan Tran, Igor Tufanov, Vish Vogeti, Carleigh Wood, Yilin Yang, Bokai Yu, Pierre Andrews, Can Balioglu, Marta R. Costa-juss\`{a} \footnotemark[3], Onur \,{C}elebi,Maha Elbayad,Cynthia Gao, Francisco Guzm\'an, Justine Kao, Ann Lee, Alexandre Mourachko, Juan Pino, Sravya Popuri, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Paden Tomasello, Changhan Wang, Jeff Wang, Skyler Wang}, + journal={ArXiv}, + year={2023} +} +``` diff --git a/seamless_communication/src/seamless_communication/cli/toxicity/asr_etox.py b/seamless_communication/src/seamless_communication/cli/toxicity/asr_etox.py new file mode 100644 index 0000000..0d4c7ae --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/toxicity/asr_etox.py @@ -0,0 +1,255 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import argparse +import tempfile +import typing as tp +import torchaudio +from tqdm import tqdm +from seamless_communication.cli.eval_utils.compute_metrics import init_whisper_model +from seamless_communication.cli.eval_utils.lang_mapping import LANG3_LANG2 +from seamless_communication.inference.translator import Modality +import torch + +from pathlib import Path +from seamless_communication.inference import Translator +from fairseq2.data import Collater, DataPipeline, FileMapper +from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter +from fairseq2.data.text import StrSplitter, read_text +from fairseq2.typing import DataType, Device + +from seamless_communication.toxicity import load_etox_bad_word_checker + +from whisper.model import Whisper + +import logging + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="ASR ETOX will compute the toxicity level of speech inputs." 
+ ) + parser.add_argument( + "data_file", + type=Path, + help="Path to the input TSV manifest that list the audio files.", + ) + parser.add_argument( + "output_file", + type=Path, + help="Path to a TSV file where to save the results.", + ) + parser.add_argument( + "--lang", + type=str, + help="Language, language of the speech to transcribe", + required=True, + ) + parser.add_argument( + "--audio_root_dir", + type=str, + help="Root directory for the audio filenames in the data file.", + ) + parser.add_argument( + "--audio_column", + type=str, + help="Name of the column where the audiofile is listed in the input tsv.", + default="audio", + ) + parser.add_argument( + "--model_name", + type=str, + help=( + "Base model name (`seamlessM4T_medium`, " + "`seamlessM4T_large`, `seamlessM4T_v2_large`), " + " or whisper model, e.g. 'whisper_large'" + ), + default="seamlessM4T_v2_large", + ) + parser.add_argument( + "--batch_size", + type=int, + help="Inference batch size.", + default=4, + ) + parser.add_argument( + "--n_parallel", + type=int, + help="Number of data loading in parallel.", + default=4, + ) + args, _unknown = parser.parse_known_args() + + if torch.cuda.is_available(): + device = torch.device("cuda:0") + dtype = torch.float16 + else: + device = torch.device("cpu") + dtype = torch.float32 + + whisper_model = None + translator = None + is_whisper = False + + if args.model_name.startswith("whisper_"): + logger.info("loading whisper model.") + _, model_name = args.model_name.split("_", maxsplit=1) + whisper_model = init_whisper_model(device, model_name) + is_whisper = True + else: + logger.info(f"loading {args.model_name} model.") + translator = Translator( + args.model_name, + None, + device, + text_tokenizer=None, + dtype=dtype, + input_modality=Modality.SPEECH, + output_modality=Modality.TEXT, + apply_mintox=False, + ) + + logger.info("loading etox.") + bad_word_checker = load_etox_bad_word_checker("mintox") + + pipeline = build_data_pipeline( + data_file=args.data_file, + audio_root_dir=args.audio_root_dir, + batch_size=args.batch_size, + is_whisper=is_whisper, + device=device, + dtype=dtype, + n_parallel=args.n_parallel, + audio_column=args.audio_column, + ) + + logger.info("running ASR-ETOX.") + with open(args.output_file, "w", encoding="utf-8") as outf: + print("text", "toxicity", "bad_words", file=outf, sep="\t") + for example in tqdm(pipeline, unit="line"): + texts = get_text( + lang=args.lang, + example=example, + whisper_model=whisper_model, + translator=translator, + audio_column=args.audio_column, + ) + for t in texts: + bad_words = bad_word_checker.get_bad_words( + text=str(t), + lang=args.lang, + ) + print( + t, + len(bad_words), + ",".join(bad_words), + file=outf, + sep="\t", + ) + + +def get_text( + lang: str, + example: tp.Dict[str, tp.Any], + whisper_model: Whisper, + translator: Translator, + audio_column: str, +): + if whisper_model: + with tempfile.NamedTemporaryFile(suffix=".wav") as temp: + torchaudio.save( + temp.name, + example[audio_column]["data"]["waveform"]["seqs"][0] + .transpose(0, 1) + .cpu(), + int(example[audio_column]["data"]["sample_rate"][0]), + format="wav", + ) + results = whisper_model.transcribe( + temp.name, + language=LANG3_LANG2[lang], + ) + return [results["text"]] + else: + (text_output, _speech_output) = translator.predict( + example[audio_column]["data"]["fbank"], + "ASR", + lang, + src_lang=lang, + ) + return text_output + + +def build_data_pipeline( + data_file: Path, + audio_root_dir: str, + batch_size: int, + is_whisper: bool, + device: 
Device, + dtype: DataType, + audio_column: str = "audio", + n_parallel: int = 4, +) -> DataPipeline: + with data_file.open("r", encoding="utf-8") as f: + header = f.readline().strip("\n").split("\t") + + split_tsv = StrSplitter(names=header) + + pipeline_builder = read_text(data_file, rtrim=True).skip(1).map(split_tsv) + + map_file = FileMapper(root_dir=audio_root_dir, cached_fd_count=10) + + pipeline_builder.map( + map_file, + selector=audio_column, + num_parallel_calls=n_parallel, + ) + + decode_audio = AudioDecoder(dtype=torch.float32, device=device) + + convert_to_fbank = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=True, + device=device, + dtype=dtype, + ) + + # get tensor in waveform + steps = [decode_audio] + if not is_whisper: + # also get the fbanks + steps.append(convert_to_fbank) + + pipeline_builder.map( + steps, + selector=f"{audio_column}.data", + num_parallel_calls=n_parallel, + ) + + if is_whisper: + # no batching for whisper + pipeline_builder.bucket(bucket_size=batch_size) + + collate = Collater(pad_value=0, pad_to_multiple=1) + + pipeline_builder.map(collate, num_parallel_calls=n_parallel) + + pipeline_builder.prefetch(4) + + return pipeline_builder.and_return() + + +if __name__ == "__main__": + main() diff --git a/seamless_communication/src/seamless_communication/cli/toxicity/etox.py b/seamless_communication/src/seamless_communication/cli/toxicity/etox.py new file mode 100644 index 0000000..226640a --- /dev/null +++ b/seamless_communication/src/seamless_communication/cli/toxicity/etox.py @@ -0,0 +1,43 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import argparse +import sys + +from seamless_communication.toxicity import load_etox_bad_word_checker + + +def main() -> None: + parser = argparse.ArgumentParser( + description="ETOX will compute the toxicity level of text inputs (STDIN > STDOUT)." + ) + parser.add_argument( + "lang", + type=str, + help="Language, language of the speech to transcribe", + ) + parser.add_argument( + "input", nargs="?", type=argparse.FileType("r"), default=sys.stdin + ) + parser.add_argument( + "output", nargs="?", type=argparse.FileType("w"), default=sys.stdout + ) + args, _unknown = parser.parse_known_args() + + bad_word_checker = load_etox_bad_word_checker("mintox") + + print("text", "toxicity", "bad_words", sep="\t", file=args.output) + for line in args.input: + l = line.rstrip() + bad_words = bad_word_checker.get_bad_words( + text=l, + lang=args.lang, + ) + print(l, len(bad_words), ",".join(bad_words), sep="\t", file=args.output) + + +if __name__ == "__main__": + main() diff --git a/seamless_communication/src/seamless_communication/datasets/__init__.py b/seamless_communication/src/seamless_communication/datasets/__init__.py new file mode 100644 index 0000000..15d8859 --- /dev/null +++ b/seamless_communication/src/seamless_communication/datasets/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
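The finetune manifests written by `cli/m4t/finetune/dataset.py` above are JSON-lines files with one serialized `LangPairSample` per line; the dataclasses are defined in `datatypes.py` below. A minimal sketch of reading such a manifest back (assuming a `train_manifest.json` produced by that script):

```python
import json

from seamless_communication.datasets.datatypes import LangPairSample

# Each manifest line is a JSON object with "source" and "target" sub-dicts;
# LangPairSample.from_json rebuilds the dataclasses (waveforms are not serialized).
samples = []
with open("train_manifest.json", "r") as fp:
    for line in fp:
        samples.append(LangPairSample.from_json(json.loads(line)))

print(f"loaded {len(samples)} samples; target units of the first one: {samples[0].target.units}")
```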
diff --git a/seamless_communication/src/seamless_communication/datasets/datatypes.py b/seamless_communication/src/seamless_communication/datasets/datatypes.py new file mode 100644 index 0000000..5a1f7c9 --- /dev/null +++ b/seamless_communication/src/seamless_communication/datasets/datatypes.py @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +import torch + + +@dataclass +class MultimodalSample: + id: int + lang: str + text: str + audio_local_path: Optional[str] = None + waveform: Optional[torch.Tensor] = None + sampling_rate: Optional[int] = None + units: Optional[List[int]] = None + + @classmethod + def from_json(cls, js: Dict[str, Any]) -> "MultimodalSample": + return cls( + id=js["id"], + lang=js["lang"], + text=js["text"], + audio_local_path=js.get("audio_local_path"), + waveform=None, # don't serialize + sampling_rate=js.get("sampling_rate"), + units=js.get("units"), + ) + + +@dataclass +class LangPairSample: + source: MultimodalSample + target: MultimodalSample + + @classmethod + def from_json(cls, js: Dict[str, Any]) -> "LangPairSample": + return cls( + source=MultimodalSample.from_json(js["source"]), + target=MultimodalSample.from_json(js["target"]), + ) diff --git a/seamless_communication/src/seamless_communication/datasets/huggingface.py b/seamless_communication/src/seamless_communication/datasets/huggingface.py new file mode 100644 index 0000000..6d63764 --- /dev/null +++ b/seamless_communication/src/seamless_communication/datasets/huggingface.py @@ -0,0 +1,137 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + + +import logging +import os +from abc import abstractmethod +from typing import Dict, Iterable, Optional + +import numpy as np +import torch +from datasets import load_dataset + +from .datatypes import LangPairSample, MultimodalSample + +logger = logging.getLogger(__name__) + + +class SpeechTokenizer: + @abstractmethod + def encode(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor: + ... 
+ + +class Speech2SpeechFleursDatasetBuilder: + """Assembles speech2speech dataset from google/fleurs on HuggingFace""" + + HF_FLEURS_DATASET_NAME = "google/fleurs" + + def __init__( + self, + source_lang: str, + target_lang: str, + split: str = "test", + skip_source_audio: bool = True, + skip_target_audio: bool = True, + audio_dtype: torch.dtype = torch.float32, + dataset_cache_dir: Optional[str] = None, + speech_tokenizer: Optional[SpeechTokenizer] = None, + ): + self.source_lang = source_lang + self.target_lang = target_lang + self.split = split + self.dataset_cache_dir = dataset_cache_dir + self.audio_dtype = audio_dtype + self.skip_source_audio = skip_source_audio + self.skip_target_audio = skip_target_audio + self.speech_tokenizer = speech_tokenizer + + def _prepare_sample( + self, + sample_id: int, + lang: str, + text: str, + audio_local_path: Optional[str] = None, + waveform_npy: Optional[np.ndarray] = None, + sampling_rate: Optional[int] = None, + ) -> MultimodalSample: + should_skip_audio = ( + lang == self.target_lang + and self.skip_target_audio + or lang == self.source_lang + and self.skip_source_audio + or waveform_npy is None + ) + if not should_skip_audio: + waveform = torch.from_numpy(waveform_npy).to(self.audio_dtype) + else: + waveform = None + if self.speech_tokenizer is not None and not should_skip_audio: + assert waveform is not None + assert sampling_rate is not None + units_tensor = self.speech_tokenizer.encode( + waveform, sampling_rate + ).reshape(-1) + units = units_tensor.tolist() + else: + units = None + return MultimodalSample( + id=sample_id, + lang=lang, + text=text.strip(), + audio_local_path=audio_local_path, + waveform=waveform, + sampling_rate=sampling_rate, + units=units, + ) + + def iterate_lang_audio_samples(self, lang: str) -> Iterable[MultimodalSample]: + ds = load_dataset( + self.HF_FLEURS_DATASET_NAME, + lang, + split=self.split, + cache_dir=self.dataset_cache_dir, + streaming=False, + ) + for item in ds: + audio_path = os.path.join( + os.path.dirname(item["path"]), item["audio"]["path"] + ) + (sample_id, audio_local_path, waveform, sampling_rate, text) = ( + item["id"], + audio_path, + item["audio"]["array"], + item["audio"]["sampling_rate"], + item["transcription"], + ) + yield self._prepare_sample( + sample_id=sample_id, + audio_local_path=audio_local_path, + waveform_npy=waveform, + sampling_rate=sampling_rate, + text=text, + lang=lang, + ) + + def __iter__(self) -> Iterable[LangPairSample]: + logger.info(f"Loading {self.target_lang} samples") + target_samples: Dict[int, MultimodalSample] = {} + for idx, sample in enumerate( + self.iterate_lang_audio_samples(lang=self.target_lang) + ): + if idx and idx % 100 == 0: + logger.info(f"..loaded {idx} target samples") + target_samples[sample.id] = sample + + logger.info(f"Loading {self.source_lang} samples") + for idx, sample in enumerate( + self.iterate_lang_audio_samples(lang=self.source_lang) + ): + if idx and idx % 100 == 0: + logger.info(f"..loaded {idx} source samples") + if sample.id in target_samples: + yield LangPairSample(source=sample, target=target_samples[sample.id]) diff --git a/seamless_communication/src/seamless_communication/inference/__init__.py b/seamless_communication/src/seamless_communication/inference/__init__.py new file mode 100644 index 0000000..f5c24ca --- /dev/null +++ b/seamless_communication/src/seamless_communication/inference/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
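A minimal sketch of iterating the Speech2SpeechFleursDatasetBuilder defined above. The "fr_fr" and "en_us" config names are assumptions about the google/fleurs dataset on HuggingFace and are not taken from this diff:

from seamless_communication.datasets.huggingface import Speech2SpeechFleursDatasetBuilder

builder = Speech2SpeechFleursDatasetBuilder(
    source_lang="fr_fr",     # assumed FLEURS config name
    target_lang="en_us",     # assumed FLEURS config name
    split="test",
    skip_source_audio=True,  # keep samples text-only; waveform and units stay None
    skip_target_audio=True,
)
for pair in builder:
    print(pair.source.text, "->", pair.target.text)
    break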
+# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from seamless_communication.inference.generator import ( + SequenceGeneratorOptions as SequenceGeneratorOptions, +) +from seamless_communication.inference.generator import UnitYGenerator as UnitYGenerator +from seamless_communication.inference.translator import ( + BatchedSpeechOutput as BatchedSpeechOutput, +) +from seamless_communication.inference.translator import Modality as Modality +from seamless_communication.inference.translator import Task as Task +from seamless_communication.inference.translator import Translator as Translator diff --git a/seamless_communication/src/seamless_communication/inference/generator.py b/seamless_communication/src/seamless_communication/inference/generator.py new file mode 100644 index 0000000..d4081a9 --- /dev/null +++ b/seamless_communication/src/seamless_communication/inference/generator.py @@ -0,0 +1,364 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import torch +from fairseq2.data import SequenceData, StringLike +from fairseq2.data.text import TextTokenizer +from fairseq2.generation import ( + BeamSearchSeq2SeqGenerator, + Seq2SeqGenerator, + SequenceToTextConverter, + StepProcessor, +) +from fairseq2.nn.padding import ( + PaddingMask, + apply_padding_mask, + get_seqs_and_padding_mask, + pad_seqs, +) +from fairseq2.nn.utils.module import infer_device +from torch import Tensor + +from seamless_communication.models.unity.model import ( + UnitYModel, + UnitYT2UModel, + UnitYX2TModel, +) +from seamless_communication.models.unity.unit_tokenizer import ( + UnitTokenDecoder, + UnitTokenizer, +) + + +def remove_consecutive_repeated_ngrams( + sequence: List[int], min_size: int = 1, max_size: int = 40 +) -> List[int]: + assert 1 <= min_size <= max_size + drop_idx = set() # indices that will be dropped from the sequence + + # start from the beginning, check if an ngram of size k (for k=max..min) is + # followed by its copy, if so delete the first one, and start over after + # the deleted ngram. + start = 0 + while start < len(sequence): + for k in range(max_size, min_size - 1, -1): + if sequence[start : start + k] == sequence[start + k : start + k + k]: + drop_idx |= set(range(start, start + k)) + start += k - 1 # assumes repeating subsequences don't overlap + break + start += 1 + return [token for idx, token in enumerate(sequence) if idx not in drop_idx] + + +@dataclass +class SequenceGeneratorOptions: + """Holds the options to pass to a sequence generator.""" + + beam_size: int = 5 + """The beam size.""" + + soft_max_seq_len: Tuple[int, int] = (1, 200) + """The terms ``a`` and ``b`` of ``ax + b`` where ``x`` is the source + sequence length. The generated sequences (including prefix sequence) will + have the maximum length of ``min(hard_max_seq_len, ax + b)``. 
See also + ``hard_max_seq_len``.""" + + hard_max_seq_len: int = 1024 + """The hard limit on maximum length of generated sequences.""" + + step_processor: Optional[StepProcessor] = None + """The processor called at each generation step.""" + + unk_penalty: float = 0.0 + """The UNK symbol penalty, where values less than 0 produce more UNKs; + values greater than 0 produce fewer UNKs.""" + + len_penalty: float = 1.0 + """The length penalty, where values less than 1.0 favor shorter + sequences; values greater than 1.0 favor longer sequences.""" + + +class UnitYGenerator: + """Generates text translations and speech units from a UnitY model.""" + + model: UnitYModel + s2t_converter: SequenceToTextConverter + t2t_converter: Optional[SequenceToTextConverter] + unit_decoder: Optional[UnitTokenDecoder] + unit_prefix_indices: Optional[Tensor] + unit_generator: Optional[Seq2SeqGenerator] + + def __init__( + self, + model: UnitYModel, + text_tokenizer: TextTokenizer, + target_lang: str, + unit_tokenizer: Optional[UnitTokenizer] = None, + text_opts: Optional[SequenceGeneratorOptions] = None, + unit_opts: Optional[SequenceGeneratorOptions] = None, + ) -> None: + """ + :param model: + The UnitY model to use for generation. + :param text_tokenizer: + The text tokenizer to use. + :param unit_tokenizer: + The unit tokenizer to use. + :param target_lang: + The target language. + :param text_generator_opts: + The options to pass to the underlying text :class:`Seq2SeqGenerator`. + :param unit_generator_opts: + The options to pass to the underlying unit :class:`Seq2SeqGenerator`. + """ + model.eval() + + self.model = model + + if text_opts is None: + text_opts = SequenceGeneratorOptions() + + if model.text_decoder is None: + raise ValueError( + "`UnitYGenerator` requires a text decoder, but the current UnitY model does not have one." 
+ ) + assert model.text_decoder_frontend is not None + assert model.final_proj is not None + + s2t_model = UnitYX2TModel( + encoder_frontend=model.speech_encoder_frontend, + encoder=model.speech_encoder, + decoder_frontend=model.text_decoder_frontend, + decoder=model.text_decoder, + final_proj=model.final_proj, + target_vocab_info=model.target_vocab_info, + ) + + step_processors = [] + if text_opts.step_processor is not None: + step_processors.append(text_opts.step_processor) + + generator = BeamSearchSeq2SeqGenerator( + s2t_model, + beam_size=text_opts.beam_size, + max_gen_len=text_opts.soft_max_seq_len, + max_seq_len=text_opts.hard_max_seq_len, + echo_prompt=True, + step_processors=step_processors, + unk_penalty=text_opts.unk_penalty, + len_penalty=text_opts.len_penalty, + ) + self.s2t_converter = SequenceToTextConverter( + generator, text_tokenizer, "translation", target_lang + ) + + if model.text_encoder is None: + self.t2t_generator = None + else: + assert model.text_encoder_frontend is not None + assert model.text_encoder is not None + t2t_model = UnitYX2TModel( + encoder_frontend=model.text_encoder_frontend, + encoder=model.text_encoder, + decoder_frontend=model.text_decoder_frontend, + decoder=model.text_decoder, + final_proj=model.final_proj, + target_vocab_info=model.target_vocab_info, + ) + generator = BeamSearchSeq2SeqGenerator( + t2t_model, + beam_size=text_opts.beam_size, + max_gen_len=text_opts.soft_max_seq_len, + max_seq_len=text_opts.hard_max_seq_len, + echo_prompt=True, + step_processors=step_processors, + unk_penalty=text_opts.unk_penalty, + len_penalty=text_opts.len_penalty, + ) + self.t2t_converter = SequenceToTextConverter( + generator, text_tokenizer, "translation", target_lang + ) + + self.unit_generator = None + self.unit_decoder = None + # Set up unit generator. + if unit_tokenizer is not None: + if model.t2u_model is None: + raise ValueError( + "`model` does not have a T2U sub-model when `unit_tokenizer` is not None." + ) + + self.unit_decoder = unit_tokenizer.create_decoder() + + unit_encoder = unit_tokenizer.create_encoder( + lang=target_lang, device=infer_device(model.t2u_model) + ) + + self.unit_prefix_indices = unit_encoder.prefix_indices + + if isinstance(self.model.t2u_model, UnitYT2UModel): + if unit_opts is None: + # Speech sequences are typically much longer than text sequences. + unit_opts = SequenceGeneratorOptions( + soft_max_seq_len=(25, 50), hard_max_seq_len=5000 + ) + + step_processors = [] + if unit_opts.step_processor is not None: + step_processors.append(unit_opts.step_processor) + + self.unit_generator = BeamSearchSeq2SeqGenerator( + self.model.t2u_model, + beam_size=unit_opts.beam_size, + max_gen_len=unit_opts.soft_max_seq_len, + max_seq_len=unit_opts.hard_max_seq_len, + echo_prompt=True, + step_processors=step_processors, + unk_penalty=unit_opts.unk_penalty, + len_penalty=unit_opts.len_penalty, + ) + + @torch.inference_mode() + def __call__( + self, + source_seqs: Tensor, + source_padding_mask: Optional[PaddingMask], + input_modality: str = "speech", + output_modality: str = "speech", + ngram_filtering: bool = False, + duration_factor: float = 1.0, + prosody_encoder_input: Optional[SequenceData] = None, + ) -> Tuple[List[StringLike], Optional[Tensor]]: + """ + :param source_seqs: + The source sequences to use for generation. *Shape:* :math:`(N,S,*)`, + where :math:`N` is the batch size, :math:`S` is the sequence length, + and :math:`*` is any number of sequence-specific dimensions + including none. 
+ :param source_padding_mask: + The padding mask of ``source_seqs``. *Shape:* :math:`(N,S)`, where + :math:`N` is the batch size and :math:`S` is the sequence length. + :param input_modality: + The type of modality to encode. + :param output_modality: + The type of modality to decode. + :param ngram_filtering: + If True, removes consecutive repeated ngrams + from the decoded unit output. + + :returns: + - The output of the text generator. + - The output of the unit generator. + """ + + if input_modality == "speech": + texts, text_gen_output = self.s2t_converter.batch_convert( + source_seqs, source_padding_mask + ) + elif input_modality == "text": + if self.t2t_converter is None: + raise ValueError( + "Please set `use_text_encoder` to `True` in your model config to encode text." + ) + texts, text_gen_output = self.t2t_converter.batch_convert( + source_seqs, source_padding_mask + ) + else: + raise ValueError(f"Unsupported input_modality: {input_modality}") + + # We skip T2U when we only need to output text. + if output_modality == "text": + return texts, None + + assert self.model.target_vocab_info.pad_idx is not None + + text_seq_list = [h[0].seq for h in text_gen_output.hypotheses] + + text_seqs, text_padding_mask = pad_seqs( + text_seq_list, self.model.target_vocab_info.pad_idx + ) + + # Manually trim the final EOS token to be consistent with fairseq. + text_seqs = text_seqs[:, :-1] + + if text_padding_mask is not None: + text_padding_mask = text_padding_mask.trim(1) + + # Use the output of the text generator to compute the decoder output. + decoder_output, decoder_padding_mask = self.model.decode( + text_seqs, + text_padding_mask, + text_gen_output.encoder_output, + text_gen_output.encoder_padding_mask, + ) + + assert self.model.t2u_model is not None + assert self.unit_decoder is not None + + unit_gen_output = None + prosody_encoder_out = None + if self.model.prosody_encoder_model is not None: + assert prosody_encoder_input is not None + prosody_input_seqs, prosody_padding_mask = get_seqs_and_padding_mask( + prosody_encoder_input + ) + prosody_encoder_out = self.model.prosody_encoder_model( + prosody_input_seqs, + prosody_padding_mask, + ).unsqueeze(1) + + if isinstance(self.model.t2u_model, UnitYT2UModel): + assert self.unit_generator is not None + assert self.unit_prefix_indices is not None + + # (S_pre) -> (N, S_pre) + prefix_seqs = self.unit_prefix_indices.expand(decoder_output.size(0), -1) + + unit_gen_output = self.unit_generator( + source_seqs=decoder_output, + source_padding_mask=decoder_padding_mask, + prompt_seqs=prefix_seqs, + prompt_padding_mask=None, + ) + + assert self.model.t2u_model.target_vocab_info.pad_idx is not None + + unit_seq_list = [h[0].seq for h in unit_gen_output.hypotheses] + + unit_seqs, _ = pad_seqs( + unit_seq_list, self.model.t2u_model.target_vocab_info.pad_idx + ) + else: + t2u_model_output, decoder_padding_mask, _ = self.model.t2u_model( + text_decoder_output=decoder_output, + text_decoder_padding_mask=decoder_padding_mask, + text_seqs=text_seqs, + duration_factor=duration_factor, + film_cond_emb=prosody_encoder_out, + ) + # (B, S_unit, V_unit) + unit_seqs = t2u_model_output.logits.argmax(dim=2) + # Apply the padding mask to the generated units. + unit_seqs = apply_padding_mask( + unit_seqs, decoder_padding_mask, t2u_model_output.vocab_info.pad_idx + ) + + # Convert to speech units. + units = self.unit_decoder(unit_seqs) + + # ngram-filtering doesn't apply to NAR unit decoding. 
+ if ngram_filtering and isinstance(self.model.t2u_model, UnitYT2UModel): + if units.size(0) > 1: + raise NotImplementedError( + "unit ngram_filtering is not implemented for batch_size > 1." + ) + arr = remove_consecutive_repeated_ngrams(units[0].tolist()) + units = torch.tensor(arr).to(units).unsqueeze(0) + + return texts, units diff --git a/seamless_communication/src/seamless_communication/inference/translator.py b/seamless_communication/src/seamless_communication/inference/translator.py new file mode 100644 index 0000000..57bea93 --- /dev/null +++ b/seamless_communication/src/seamless_communication/inference/translator.py @@ -0,0 +1,428 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import logging +from dataclasses import dataclass +from enum import Enum, auto +from pathlib import Path +from typing import List, Optional, Tuple, Union, cast + +import torch +import torch.nn as nn +from fairseq2.assets import asset_store +from fairseq2.assets.card import AssetCard +from fairseq2.data import Collater, SequenceData, StringLike +from fairseq2.data.audio import AudioDecoder, WaveformToFbankConverter +from fairseq2.data.text import TextTokenizer +from fairseq2.memory import MemoryBlock +from fairseq2.nn.padding import PaddingMask, get_seqs_and_padding_mask +from fairseq2.typing import DataType, Device +from torch import Tensor + +from seamless_communication.inference.generator import ( + SequenceGeneratorOptions, + UnitYGenerator, +) +from seamless_communication.models.unity import ( + UnitTokenizer, + UnitYModel, + UnitYNART2UModel, + UnitYT2UModel, + load_unity_model, + load_unity_text_tokenizer, + load_unity_unit_tokenizer, + unity_archs, +) +from seamless_communication.models.vocoder import load_vocoder_model +from seamless_communication.toxicity import ( + ETOXBadWordChecker, + load_etox_bad_word_checker, +) +from seamless_communication.toxicity.mintox import mintox_pipeline + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +class Task(Enum): + S2ST = auto() + S2TT = auto() + T2ST = auto() + T2TT = auto() + ASR = auto() + + +class Modality(Enum): + SPEECH = "speech" + TEXT = "text" + + +@dataclass +class BatchedSpeechOutput: + units: List[List[int]] + """The batched list of generated units.""" + + audio_wavs: List[Tensor] + """The batched list of audio waveforms.""" + + sample_rate: int = 16000 + """Sample rate of the audio waveforms.""" + + +class Translator(nn.Module): + def __init__( + self, + model_name_or_card: Union[str, AssetCard], + vocoder_name_or_card: Union[str, AssetCard, None], + device: Device, + text_tokenizer: Optional[TextTokenizer] = None, + apply_mintox: bool = False, + dtype: DataType = torch.float16, + input_modality: Optional[Modality] = None, + output_modality: Optional[Modality] = None, + ): + super().__init__() + + if isinstance(model_name_or_card, str): + model_name_or_card = asset_store.retrieve_card(model_name_or_card) + + assert isinstance(model_name_or_card, AssetCard) + + if input_modality or output_modality: + unity_config = unity_archs.get_config( + model_name_or_card.field("model_arch").as_(str) + ) + # Skip loading the text encoder. + if input_modality == Modality.SPEECH: + unity_config.use_text_encoder = False + # Skip loading the T2U model. 
+ if output_modality == Modality.TEXT: + unity_config.t2u_config = None + model_name_or_card.field("model_config").set(unity_config) + + # Load the model. + if device == torch.device("cpu"): + dtype = torch.float32 + + self.model = load_unity_model(model_name_or_card, device=device, dtype=dtype) + self.model.eval() + assert isinstance(self.model, UnitYModel) + + if text_tokenizer is None: + self.text_tokenizer: TextTokenizer = load_unity_text_tokenizer( + model_name_or_card + ) + else: + self.text_tokenizer = text_tokenizer + + self.unit_tokenizer: Optional[UnitTokenizer] = None + if self.model.t2u_model is not None: + self.unit_tokenizer = load_unity_unit_tokenizer(model_name_or_card) + + self.bad_word_checker: Optional[ETOXBadWordChecker] = None + if apply_mintox: + self.bad_word_checker = load_etox_bad_word_checker("mintox") + + self.apply_mintox = apply_mintox + + self.device = device + self.decode_audio = AudioDecoder(dtype=torch.float32, device=device) + self.convert_to_fbank = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=True, + device=device, + dtype=dtype, + ) + self.collate = Collater( + pad_value=self.text_tokenizer.vocab_info.pad_idx or 0, pad_to_multiple=2 + ) + self.vocoder = None + if vocoder_name_or_card is not None and ( + output_modality is None or output_modality == Modality.SPEECH + ): + self.vocoder = load_vocoder_model( + vocoder_name_or_card, device=device, dtype=dtype + ) + self.vocoder.eval() + + @classmethod + def get_prediction( + cls, + model: UnitYModel, + text_tokenizer: TextTokenizer, + unit_tokenizer: Optional[UnitTokenizer], + seqs: Tensor, + padding_mask: Optional[PaddingMask], + input_modality: Modality, + output_modality: Modality, + tgt_lang: str, + text_generation_opts: SequenceGeneratorOptions, + unit_generation_opts: Optional[SequenceGeneratorOptions], + unit_generation_ngram_filtering: bool = False, + duration_factor: float = 1.0, + prosody_encoder_input: Optional[SequenceData] = None, + ) -> Tuple[List[StringLike], Optional[Tensor]]: + # We disregard unit generations opts for the NAR T2U decoder. 
+ if output_modality != Modality.SPEECH or isinstance( + model.t2u_model, UnitYNART2UModel + ): + unit_generation_opts = None + + generator = UnitYGenerator( + model, + text_tokenizer, + tgt_lang, + unit_tokenizer if output_modality == Modality.SPEECH else None, + text_opts=text_generation_opts, + unit_opts=unit_generation_opts, + ) + + return generator( + seqs, + padding_mask, + input_modality.value, + output_modality.value, + ngram_filtering=unit_generation_ngram_filtering, + duration_factor=duration_factor, + prosody_encoder_input=prosody_encoder_input, + ) + + @staticmethod + def get_modalities_from_task_str(task_str: str) -> Tuple[Modality, Modality]: + try: + task = Task[task_str.upper()] + except KeyError: + raise ValueError(f"Unsupported task: {task_str}") + + if task == Task.S2ST: + return Modality.SPEECH, Modality.SPEECH + # ASR is treated as S2TT with src_lang == tgt_lang + elif task == Task.S2TT or task == Task.ASR: + return Modality.SPEECH, Modality.TEXT + elif task == Task.T2TT: + return Modality.TEXT, Modality.TEXT + else: + return Modality.TEXT, Modality.SPEECH + + @torch.inference_mode() + def predict( + self, + input: Union[str, Tensor, SequenceData], + task_str: str, + tgt_lang: str, + src_lang: Optional[str] = None, + text_generation_opts: Optional[SequenceGeneratorOptions] = None, + unit_generation_opts: Optional[SequenceGeneratorOptions] = None, + spkr: Optional[int] = -1, + sample_rate: int = 16000, + unit_generation_ngram_filtering: bool = False, + duration_factor: float = 1.0, + prosody_encoder_input: Optional[SequenceData] = None, + src_text: Optional[StringLike] = None, + ) -> Tuple[List[StringLike], Optional[BatchedSpeechOutput]]: + """ + The main method used to perform inference on all tasks. + + :param input: + Either text or path to audio or audio Tensor. + :param task_str: + String representing the task. + Valid choices are "S2ST", "S2TT", "T2ST", "T2TT", "ASR" + :param tgt_lang: + Target language to decode into. + :param src_lang: + Source language of input, only required for T2ST, T2TT tasks. + :param text_generation_opts: + Text generation hyperparameters for incremental decoding. + :param unit_generation_opts: + Unit generation hyperparameters for incremental decoding. + :param spkr: + Speaker id for vocoder. + :param unit_generation_ngram_filtering: + If True, removes consecutive repeated ngrams + from the decoded unit output. + :param src_text: + Optional source transcript (obtained by ASR for instance). This is used for + applying mintox toxicity mitigation. If this is not specify and apply_mintox=True + then src_lang must be specified and ASR will be run on the audio source. + + :returns: + - Batched list of Translated text. + - Translated BatchedSpeechOutput. + """ + input_modality, output_modality = self.get_modalities_from_task_str(task_str) + + if self.apply_mintox and not (src_lang is not None or src_text is not None): + raise ValueError( + "`src_lang` must be specified when `apply_mintox` is `True` or you need to specify src_text." + ) + + if isinstance(input, dict): + src = cast(SequenceData, input) + elif input_modality == Modality.SPEECH: + audio = input + if isinstance(audio, str): + with Path(audio).open("rb") as fb: + block = MemoryBlock(fb.read()) + decoded_audio = self.decode_audio(block) + else: + assert ( + audio.dim() <= 2 + ), "The audio tensor can't be more than 2 dimensions." 
+ if audio.dim() == 1: + audio = audio.unsqueeze(1) + elif audio.dim() == 2 and audio.size(0) < audio.size(1): + logger.warning( + "Transposing audio tensor from (bsz, seq_len) -> (seq_len, bsz)." + ) + audio = audio.transpose(0, 1) + + decoded_audio = { + "waveform": audio, + "sample_rate": sample_rate, + "format": -1, + } + src = self.collate(self.convert_to_fbank(decoded_audio))["fbank"] + else: + if src_lang is None: + raise ValueError("src_lang must be specified for T2ST, T2TT tasks.") + + text = input + assert isinstance(text, str) + + self.token_encoder = self.text_tokenizer.create_encoder( + task="translation", lang=src_lang, mode="source", device=self.device + ) + src = self.collate(self.token_encoder(text)) + + assert isinstance(self.model, UnitYModel) + + seqs, padding_mask = get_seqs_and_padding_mask(src) + + if text_generation_opts is None: + text_generation_opts = SequenceGeneratorOptions( + beam_size=5, soft_max_seq_len=(1, 200) + ) + if unit_generation_opts is None: + unit_generation_opts = SequenceGeneratorOptions( + beam_size=5, soft_max_seq_len=(25, 50) + ) + + texts, units = self.get_prediction( + self.model, + self.text_tokenizer, + self.unit_tokenizer, + seqs, + padding_mask, + input_modality, + output_modality, + tgt_lang, + text_generation_opts, + unit_generation_opts, + unit_generation_ngram_filtering=unit_generation_ngram_filtering, + duration_factor=duration_factor, + prosody_encoder_input=prosody_encoder_input, + ) + + if self.apply_mintox and task_str != Task.ASR.name: + if input_modality == Modality.SPEECH: + if src_text is not None: + src_texts = [src_text] + else: + src_texts, _, = self.predict( + input=input, + task_str=Task.ASR.name, + tgt_lang=tgt_lang, + src_lang=src_lang, + text_generation_opts=text_generation_opts, + unit_generation_opts=unit_generation_opts, + spkr=spkr, + sample_rate=sample_rate, + unit_generation_ngram_filtering=unit_generation_ngram_filtering, + ) + else: + assert isinstance(input, str) + + src_texts = [input] + + assert src_lang is not None + assert self.unit_tokenizer is not None + assert self.bad_word_checker is not None + + texts, units = mintox_pipeline( + model=self.model, + text_tokenizer=self.text_tokenizer, + unit_tokenizer=self.unit_tokenizer, + device=self.device, + src_lang=src_lang, + tgt_lang=tgt_lang, + model_input=src, + input_modality=input_modality, + output_modality=output_modality, + src_texts=src_texts, + original_texts=texts, + original_units=units, + unit_generation_ngram_filtering=unit_generation_ngram_filtering, + text_generation_opts=text_generation_opts, + unit_generation_opts=unit_generation_opts, + bad_word_checker=self.bad_word_checker, + duration_factor=duration_factor, + prosody_encoder_input=prosody_encoder_input, + ) + + if output_modality == Modality.TEXT: + return texts, None + else: + assert units is not None + + if isinstance(self.model.t2u_model, UnitYT2UModel): + # Remove the lang token for AR UnitY since the vocoder doesn't need it + # in the unit sequence. tgt_lang is fed as an argument to the vocoder. + units = units[:, 1:] + duration_prediction = True + else: + # Vocoder duration predictions not required since the NAR + # T2U model already predicts duration in the units. 
+ duration_prediction = False + + audio_wavs = [] + speech_units = [] + for i in range(len(units)): + assert self.model.t2u_model is not None + unit_padding_mask = ( + units[i] != self.model.t2u_model.target_vocab_info.pad_idx + ) + u = units[i][unit_padding_mask] + speech_units.append(u.tolist()) + + if self.vocoder is not None: + translated_audio_wav = self.vocoder( + units, tgt_lang, spkr, dur_prediction=duration_prediction + ) + for i in range(len(units)): + padding_removed_audio_wav = translated_audio_wav[ + i, + :, + : int( + translated_audio_wav.size(-1) + * len(speech_units[i]) + / len(units[i]) + ), + ].unsqueeze(0) + audio_wavs.append(padding_removed_audio_wav) + return ( + texts, + BatchedSpeechOutput( + units=speech_units, + audio_wavs=audio_wavs, + sample_rate=sample_rate, + ), + ) diff --git a/seamless_communication/src/seamless_communication/models/__init__.py b/seamless_communication/src/seamless_communication/models/__init__.py new file mode 100644 index 0000000..15d8859 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/src/seamless_communication/models/aligner/__init__.py b/seamless_communication/src/seamless_communication/models/aligner/__init__.py new file mode 100644 index 0000000..a45bc18 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/aligner/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from seamless_communication.models.aligner.model import ( + UnitY2AlignmentEncoder as UnitY2AlignmentEncoder, +) +from seamless_communication.models.aligner.model import ( + UnitY2AlignmentFrontend as UnitY2AlignmentFrontend, +) +from seamless_communication.models.aligner.model import ( + UnitY2AlignmentModel as UnitY2AlignmentModel, +) diff --git a/seamless_communication/src/seamless_communication/models/aligner/alignment_extractor.py b/seamless_communication/src/seamless_communication/models/aligner/alignment_extractor.py new file mode 100644 index 0000000..d0cfae6 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/aligner/alignment_extractor.py @@ -0,0 +1,174 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
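translator.py above is the main user-facing inference entry point; a minimal S2TT sketch follows. The asset card names "seamlessM4T_v2_large" and "vocoder_v2" are assumptions and not part of this diff; the task string and the "eng" code follow the Task enum and the three-letter language codes used elsewhere in the repo:

import torch
from seamless_communication.inference import SequenceGeneratorOptions, Translator

translator = Translator(
    "seamlessM4T_v2_large",  # assumed model card name
    "vocoder_v2",            # assumed vocoder card name
    device=torch.device("cuda:0"),
    dtype=torch.float16,
)
text_opts = SequenceGeneratorOptions(beam_size=5, soft_max_seq_len=(1, 200))
texts, speech = translator.predict(
    "/path/to/input.wav",
    "s2tt",
    tgt_lang="eng",
    text_generation_opts=text_opts,
)
print(texts[0])  # speech is None because the output modality is text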
+ +import os +from typing import Any, List, Tuple, Union + +import numpy +import torch +import torch.nn as nn +import torchaudio +from fairseq2.typing import DataType, Device +from fairseq2.data.typing import StringLike +from torch import Tensor + +from seamless_communication.models.aligner.loader import load_unity2_alignment_model +from seamless_communication.models.unit_extractor import UnitExtractor + +try: + import matplotlib.pyplot as plt + + matplotlib_available = True +except ImportError: + matplotlib_available = False + + +class AlignmentExtractor(nn.Module): + def __init__( + self, + aligner_model_name_or_card: str, + unit_extractor_model_name_or_card: Union[Any, str] = None, + unit_extractor_output_layer: Union[Any, int] = None, + unit_extractor_kmeans_model_uri: Union[Any, str] = None, + device: Device = Device("cpu"), + dtype: DataType = torch.float32, + ): + super().__init__() + self.device = device + self.dtype = dtype + + if self.dtype == torch.float16 and self.device == Device("cpu"): + raise RuntimeError("FP16 only works on GPU, set args accordingly") + + self.alignment_model = load_unity2_alignment_model( + aligner_model_name_or_card, device=self.device, dtype=self.dtype + ) + self.alignment_model.eval() + + self.unit_extractor = None + self.unit_extractor_output_layer = 0 + + if unit_extractor_model_name_or_card is not None: + self.unit_extractor = UnitExtractor( + unit_extractor_model_name_or_card, + unit_extractor_kmeans_model_uri, + device=device, + dtype=dtype, + ) + self.unit_extractor_output_layer = unit_extractor_output_layer + + def load_audio( + self, audio_path: str, sampling_rate: int = 16_000 + ) -> Tuple[Tensor, int]: + assert os.path.exists(audio_path) + audio, rate = torchaudio.load(audio_path) + if rate != sampling_rate: + audio = torchaudio.functional.resample(audio, rate, sampling_rate) + rate = sampling_rate + return audio, rate + + def prepare_audio(self, audio: Union[str, Tensor]) -> Tensor: + # TODO: switch to fairseq2 data pipeline once it supports resampling + if isinstance(audio, str): + audio, _ = self.load_audio(audio, sampling_rate=16_000) + if audio.ndim > 1: + # averaging over channels + assert audio.size(0) < audio.size( + 1 + ), "Expected [Channel,Time] shape, but Channel > Time" + audio = audio.mean(0) + assert ( + audio.ndim == 1 + ), f"After channel averaging audio shape expected to be [Time] i.e. 
mono audio" + audio = audio.to(self.device, self.dtype) + + return audio + + def extract_units(self, audio: Tensor) -> Tensor: + assert isinstance( + self.unit_extractor, UnitExtractor + ), "Unit extractor is required to get units from audio tensor" + units = self.unit_extractor.predict(audio, self.unit_extractor_output_layer) + return units + + @torch.inference_mode() + def extract_alignment( + self, + audio: Union[str, Tensor], + text: str, + plot: bool = False, + add_trailing_silence: bool = False, + ) -> Tuple[Tensor, Tensor, List[StringLike]]: + if isinstance(audio, Tensor) and not torch.is_floating_point(audio): + # we got units as audio arg + units = audio + units = units.to(self.device) + audio_tensor = None + else: + audio_tensor = self.prepare_audio(audio) + units = self.extract_units(audio_tensor) + + tokenized_unit_ids = self.alignment_model.alignment_frontend.tokenize_unit( + units + ).unsqueeze(0) + tokenized_text_ids = ( + self.alignment_model.alignment_frontend.tokenize_text( + text, add_trailing_silence=add_trailing_silence + ) + .to(self.device) + .unsqueeze(0) + ) + tokenized_text_tokens = ( + self.alignment_model.alignment_frontend.tokenize_text_to_tokens( + text, add_trailing_silence=add_trailing_silence + ) + ) + _, alignment_durations = self.alignment_model( + tokenized_text_ids, tokenized_unit_ids + ) + + if plot and (audio_tensor is not None): + self.plot_alignment( + audio_tensor.cpu(), tokenized_text_tokens, alignment_durations.cpu() + ) + + return alignment_durations, tokenized_text_ids, tokenized_text_tokens + + def detokenize_text(self, tokenized_text_ids: Tensor) -> StringLike: + return self.alignment_model.alignment_frontend.decode_text(tokenized_text_ids) + + def plot_alignment( + self, audio: Tensor, text_tokens: List[StringLike], durations: Tensor + ) -> None: + if not matplotlib_available: + raise RuntimeError( + "Please `pip install matplotlib` in order to use plot alignment." + ) + _, ax = plt.subplots(figsize=(22, 3.5)) + ax.plot(audio, color="gray", linewidth=0.3) + durations_cumul = numpy.concatenate([numpy.array([0]), numpy.cumsum(durations)]) + alignment_ticks = durations_cumul * 320 # 320 is hardcoded for 20ms rate here + + ax.vlines( + alignment_ticks, + ymax=1, + ymin=-1, + color="indigo", + linestyles="dashed", + lw=0.5, + ) + + middle_tick_positions = ( + durations_cumul[:-1] + (durations_cumul[1:] - durations_cumul[:-1]) / 2 + ) + ax.set_xticks(middle_tick_positions * 320) + ax.set_xticklabels(text_tokens, fontsize=13) + ax.set_xlim(0, len(audio)) + + ax.set_ylim(audio.min(), audio.max()) + ax.set_yticks([]) + plt.tight_layout() + plt.show() diff --git a/seamless_communication/src/seamless_communication/models/aligner/builder.py b/seamless_communication/src/seamless_communication/models/aligner/builder.py new file mode 100644 index 0000000..280c508 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/aligner/builder.py @@ -0,0 +1,186 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
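A minimal sketch of the AlignmentExtractor above, fed pre-computed units so that no unit extractor has to be configured. The "nar_t2u_aligner" card name comes from the architecture registered in builder.py below; the unit ids and text are made up:

import torch
from seamless_communication.models.aligner.alignment_extractor import AlignmentExtractor

extractor = AlignmentExtractor("nar_t2u_aligner")
# An integer tensor is interpreted as units rather than a waveform (see extract_alignment above).
units = torch.tensor([12, 12, 731, 44, 44, 44])
durations, text_ids, text_tokens = extractor.extract_alignment(units, "hello world")
print(list(zip(text_tokens, durations[0].tolist())))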
+ +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from fairseq2.assets.card import AssetCard +from fairseq2.data.vocabulary_info import VocabularyInfo +from fairseq2.models.utils.arch_registry import ArchitectureRegistry +from fairseq2.nn.embedding import StandardEmbedding, init_scaled_embedding +from fairseq2.typing import DataType, Device + +from seamless_communication.models.aligner.model import ( + UnitY2AlignmentEncoder, + UnitY2AlignmentFrontend, + UnitY2AlignmentModel, +) +from seamless_communication.models.unity.char_tokenizer import load_unity_char_tokenizer +from seamless_communication.models.unity.loader import load_unity_unit_tokenizer + + +@dataclass +class AlignmentEncoderConfig: + model_dim: int + + feat_dim: int + + num_text_layers: int + + num_feat_layers: int + + dropout: float + + temperature: float + + reduction_factor: int + + +@dataclass +class UnitY2AlignmentFrontendConfig: + unit_vocab_info: VocabularyInfo + + text_vocab_size: int + + +@dataclass +class UnitY2AlignmentConfig: + model_name_or_card: Union[str, AssetCard] + + alignment_encoder_config: AlignmentEncoderConfig + + alignment_frontend_config: UnitY2AlignmentFrontendConfig + + +aligner_archs = ArchitectureRegistry[UnitY2AlignmentConfig]("unity2_aligner") + +aligner_arch = aligner_archs.decorator + + +@aligner_arch("nar_t2u_aligner") +def _aligner_nar_t2u() -> UnitY2AlignmentConfig: + encoder_config = AlignmentEncoderConfig( + model_dim=1024, + feat_dim=1024, + num_text_layers=2, + num_feat_layers=3, + dropout=0.1, + temperature=1.0, + reduction_factor=1, + ) + + frontend_config = UnitY2AlignmentFrontendConfig( + unit_vocab_info=VocabularyInfo( + size=10082, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1 + ), + text_vocab_size=10943, + ) + + return UnitY2AlignmentConfig( + model_name_or_card="nar_t2u_aligner", + alignment_encoder_config=encoder_config, + alignment_frontend_config=frontend_config, + ) + + +class UnitY2AlignmentBuilder: + config: UnitY2AlignmentConfig + device: Optional[Device] + dtype: DataType + + def __init__( + self, + config: UnitY2AlignmentConfig, + *, + device: Optional[Device] = None, + dtype: DataType = torch.float32, + ) -> None: + """ + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. 
+ """ + self.config = config + + self.device, self.dtype = device, dtype + + def build_model(self) -> UnitY2AlignmentModel: + alignment_frontend = self.build_alignment_frontend() + + alignment_encoder = self.build_alignment_encoder() + + return UnitY2AlignmentModel(alignment_frontend, alignment_encoder) + + def build_alignment_frontend(self) -> UnitY2AlignmentFrontend: + text_tokenizer = load_unity_char_tokenizer(self.config.model_name_or_card) + + unit_tokenizer = load_unity_unit_tokenizer(self.config.model_name_or_card) + + embed_text = StandardEmbedding( + num_embeddings=self.config.alignment_frontend_config.text_vocab_size, + embedding_dim=self.config.alignment_encoder_config.model_dim, + pad_idx=self.config.alignment_frontend_config.unit_vocab_info.pad_idx, + init_fn=init_scaled_embedding, + device=self.device, + dtype=self.dtype, + ) + + embed_unit = StandardEmbedding( + num_embeddings=self.config.alignment_frontend_config.unit_vocab_info.size, + embedding_dim=self.config.alignment_encoder_config.model_dim, + pad_idx=self.config.alignment_frontend_config.unit_vocab_info.pad_idx, + init_fn=init_scaled_embedding, + device=self.device, + dtype=self.dtype, + ) + + return UnitY2AlignmentFrontend( + embed_text, embed_unit, text_tokenizer, unit_tokenizer + ) + + def build_alignment_encoder(self, training: bool = False) -> UnitY2AlignmentEncoder: + cfg = self.config.alignment_encoder_config + alignment_encoder = UnitY2AlignmentEncoder( + embed_dim=cfg.model_dim, + feat_dim=cfg.feat_dim, + text_layers=cfg.num_text_layers, + feat_layers=cfg.num_feat_layers, + dropout=cfg.dropout, + temperature=cfg.temperature, + reduction_factor=cfg.reduction_factor, + dtype=self.dtype, + ) + alignment_encoder.training = training + + return alignment_encoder + + +def create_unity2_alignment_model( + config: UnitY2AlignmentConfig, + device: Optional[Device] = None, + dtype: DataType = torch.float32, +) -> UnitY2AlignmentModel: + """Create a UnitY model. + + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + + unity2_aligner_builder = UnitY2AlignmentBuilder( + config, + device=device, + dtype=dtype, + ) + + return unity2_aligner_builder.build_model() diff --git a/seamless_communication/src/seamless_communication/models/aligner/loader.py b/seamless_communication/src/seamless_communication/models/aligner/loader.py new file mode 100644 index 0000000..f96b5e7 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/aligner/loader.py @@ -0,0 +1,88 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+ +from typing import Any, List, Mapping + +import torch +from fairseq2.assets import asset_store, download_manager +from fairseq2.models.utils import ConfigLoader, ModelLoader + +from seamless_communication.models.aligner.builder import ( + UnitY2AlignmentConfig, + aligner_archs, + create_unity2_alignment_model, +) +from seamless_communication.models.aligner.model import UnitY2AlignmentModel +from seamless_communication.models.unity.char_tokenizer import load_unity_char_tokenizer + + +def convert_unity2_aligner_checkpoint( + checkpoint: Mapping[str, Any], config: UnitY2AlignmentConfig +) -> Mapping[str, Any]: + if ( + "model" in checkpoint + and "alignment_encoder.t_conv.1.weight" in checkpoint["model"] + ): + return checkpoint + + alignment_frontend_statedict = {} + text_emb_state_keymap = {"weight": "alignment_frontend.embed_text.weight"} + for k, v in checkpoint["text_emb_state"].items(): + alignment_frontend_statedict[text_emb_state_keymap[k]] = v + + unit_emb_state_keymap = {"weight": "alignment_frontend.embed_unit.weight"} + for k, v in checkpoint["unit_emb_state"].items(): + alignment_frontend_statedict[unit_emb_state_keymap[k]] = v + + alignment_encoder_state_dict = {} + for k, v in checkpoint["aligner_state"].items(): + alignment_encoder_state_dict[f"alignment_encoder.{k}"] = v + + model_state = { + **alignment_encoder_state_dict, + **alignment_frontend_statedict, + } + + char_embeds = model_state["alignment_frontend.embed_text.weight"] + + index_mapping = _get_char_index_mapping(config) + vocab_size = len(index_mapping) + char_embeds[torch.arange(vocab_size)] = char_embeds[index_mapping] + + checkpoint["model"] = model_state + + return checkpoint + + +def _get_char_index_mapping(config: UnitY2AlignmentConfig) -> List[int]: + char_tokenizer = load_unity_char_tokenizer(config.model_name_or_card) + spm_order = [ + char_tokenizer.model.index_to_token(i) + for i in range(char_tokenizer.model.vocabulary_size) + ][4:] + spm_to_dict_mapping = { + ch: idx + for (idx, ch) in zip( + range(4, char_tokenizer.model.vocabulary_size), + sorted(spm_order), + ) + } + model_to_dict_mapping = [0, 1, 2, 3] + [spm_to_dict_mapping[ch] for ch in spm_order] + return model_to_dict_mapping + + +load_unity2_alignment_config = ConfigLoader[UnitY2AlignmentConfig]( + asset_store, aligner_archs +) + +load_unity2_alignment_model = ModelLoader[UnitY2AlignmentModel, UnitY2AlignmentConfig]( + asset_store, + download_manager, + load_unity2_alignment_config, + create_unity2_alignment_model, + convert_unity2_aligner_checkpoint, + restrict_checkpoints=False, +) diff --git a/seamless_communication/src/seamless_communication/models/aligner/model.py b/seamless_communication/src/seamless_communication/models/aligner/model.py new file mode 100644 index 0000000..5981da5 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/aligner/model.py @@ -0,0 +1,304 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
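loader.py above exposes a standard fairseq2 ModelLoader; a minimal sketch of loading the aligner weights directly (the "nar_t2u_aligner" card name matches the architecture registered in builder.py):

import torch
from seamless_communication.models.aligner.loader import load_unity2_alignment_model

aligner = load_unity2_alignment_model(
    "nar_t2u_aligner", device=torch.device("cpu"), dtype=torch.float32
)
aligner.eval()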
+ +from typing import Any, List, Tuple, Union + +import numpy as np +import numpy.typing as npt +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq2.data import CString +from fairseq2.nn.embedding import StandardEmbedding +from fairseq2.nn.padding import to_padding_mask +from fairseq2.typing import DataType +from torch import Tensor +from torch.nn import Module + +from seamless_communication.models.unity.char_tokenizer import CharTokenizer +from seamless_communication.models.unity.unit_tokenizer import UnitTokenizer + + +class UnitY2AlignmentFrontend(Module): + def __init__( + self, + embed_text: StandardEmbedding, + embed_unit: StandardEmbedding, + text_tokenizer: CharTokenizer, + unit_tokenizer: UnitTokenizer, + ): + super().__init__() + self.embed_text = embed_text + self.embed_unit = embed_unit + self.text_tokenizer = text_tokenizer + self.unit_tokenizer = unit_tokenizer + unit_tokenizer.is_nar_decoder = True + + self.encode_text = self.text_tokenizer.create_raw_encoder() + # text decoder can be used to map aligned characters to words + self.decode_text = self.text_tokenizer.create_decoder() + self.encode_unit = self.unit_tokenizer.create_encoder(lang="eng") + + def tokenize_text( + self, text: str, return_tokens: bool = False, add_trailing_silence: bool = False + ) -> Tensor: + tokenized = self.encode_text(text) + if add_trailing_silence: + tokenized = torch.cat([tokenized, tokenized[0:1]]) + + return tokenized + + def tokenize_text_to_tokens( + self, text: str, add_trailing_silence: bool = False + ) -> List[Union[CString, str]]: + tokenized = self.encode_text.encode_as_tokens(text) + if add_trailing_silence: + tokenized = tokenized + [tokenized[0]] + + return tokenized + + def tokenize_unit(self, units: Union[str, Tensor]) -> Tensor: + if isinstance(units, str): + units = torch.tensor([int(u) for u in units.split(" ")]) + return self.encode_unit(units) + + def forward(self, text: Tensor, unit: Tensor) -> Tuple[Any, Any]: + embs_unit = self.embed_unit(unit) + embs_text = self.embed_text(text) + return embs_text, embs_unit + + +class Permute12(nn.Module): + def forward(self, x: Tensor) -> Tensor: + return x.transpose(1, 2) + + +class UnitY2AlignmentEncoder(Module): + """ + UnitY2 Aligner component + """ + + def __init__( + self, + embed_dim: int, + feat_dim: int, + text_layers: int, + feat_layers: int, + dropout: float, + temperature: float, + reduction_factor: int, + dtype: DataType, + ): + super().__init__() + self.temperature = temperature + self.reduction_factor = reduction_factor # for unit + + layers: List[Module] = [Permute12()] + for i in range(text_layers): + if i < text_layers - 1: + layers.append( + nn.Conv1d( + embed_dim, embed_dim, kernel_size=3, padding=1, dtype=dtype + ) + ) + layers.append(nn.ReLU()) + layers.append(nn.Dropout(p=dropout)) + else: + layers.append( + nn.Conv1d( + embed_dim, embed_dim, kernel_size=1, padding=0, dtype=dtype + ) + ) + layers.append(nn.Dropout(p=dropout)) + layers.append(Permute12()) + self.t_conv = nn.Sequential(*layers) + + layers = [Permute12()] + input_dim = feat_dim + for i in range(feat_layers): + if i < feat_layers - 1: + layers.append( + nn.Conv1d( + input_dim, embed_dim, kernel_size=3, padding=1, dtype=dtype + ) + ) + layers.append(nn.ReLU()) + layers.append(nn.Dropout(p=dropout)) + else: + layers.append( + nn.Conv1d( + input_dim, + embed_dim, + kernel_size=1, + padding=0, + stride=reduction_factor, + dtype=dtype, + ) + ) + layers.append(nn.Dropout(p=dropout)) + layers.append(Permute12()) + input_dim = 
embed_dim + self.f_conv = nn.Sequential(*layers) + + def forward( + self, + text_emb: Tensor, + feat_emb: Tensor, + text_lengths: Tensor, + feat_lengths: Tensor, + ) -> Tuple[Tensor, Tensor]: + """Compute alignment between sequence of text and feature embeddings + + Args: + text_emb (Tensor): Batched text embedding (B, T_text, C). + feat_emb (Tensor): Batched acoustic feature (B, T_feat, feat_dim). + text_lengths (Tensor): Source text length (B,). + feat_lengths (Tensor): Target feature length (B,). + + Returns: + Tensor: Log probability of attention matrix (B, T_feat, T_text) + Tensor: Unit durations of every text token (B, T_text) + + """ + _feat_lengths = feat_lengths.clone() + if self.reduction_factor > 1: + feat_lengths = torch.ceil(feat_lengths / self.reduction_factor).long() + + text_emb = self.t_conv(text_emb) + feat_emb = self.f_conv(feat_emb) + + dist = feat_emb.unsqueeze(2) - text_emb.unsqueeze(1) + dist = torch.norm(dist, p=2, dim=3) + score = -self.temperature * dist + + padding_mask = ~(to_padding_mask(text_lengths, max(text_lengths))) + padding_mask = padding_mask.unsqueeze(-2) + score = score.masked_fill(padding_mask, -np.inf) + + attn_lprob = F.log_softmax(score, dim=-1) + + attn_hard_dur = viterbi_decode(attn_lprob, text_lengths, feat_lengths) + + if self.reduction_factor > 1: + attn_hard_dur = self.postprocess_alignment( + attn_hard_dur, text_lengths, _feat_lengths + ) + + return attn_lprob, attn_hard_dur + + def postprocess_alignment( + self, attn_hard_dur: Tensor, text_lengths: Tensor, feat_lengths: Tensor + ) -> Tensor: + attn_hard_dur = attn_hard_dur * self.reduction_factor + B, T = attn_hard_dur.size() # B x T_text + dur_cumsum = torch.cumsum(attn_hard_dur, dim=1) + for b in range(B): + for t in range(text_lengths[b]): + # truncate the right frames + if dur_cumsum[b, t] >= feat_lengths[b]: + if t == 0: + attn_hard_dur[b, t] = feat_lengths[b] + else: + attn_hard_dur[b, t] = feat_lengths[b] - dur_cumsum[b, t - 1] + if t < text_lengths[b] - 1: + attn_hard_dur[b, t + 1 :] = 0 + break + return attn_hard_dur + + +def _monotonic_alignment_search( + attn_lprob: npt.NDArray[np.float64], +) -> npt.NDArray[np.float64]: + # https://arxiv.org/abs/2005.11129 + T_feat = attn_lprob.shape[0] + T_text = attn_lprob.shape[1] + Q = np.full((T_text, T_feat), fill_value=-np.inf) + + log_prob = attn_lprob.transpose(1, 0) # -> (T_text, T_feat) + # 1. Q <- init first row for all j + for j in range(T_feat): + Q[0, j] = log_prob[0, : j + 1].sum() + + # 2. + for j in range(1, T_feat): + for i in range(1, min(j + 1, T_text)): + Q[i, j] = max(Q[i - 1, j - 1], Q[i, j - 1]) + log_prob[i, j] + + # 3. + A = np.full((T_feat,), fill_value=T_text - 1) + for j in range(T_feat - 2, -1, -1): # T_feat-2, ..., 0 + # 'i' in {A[j+1]-1, A[j+1]} + i_a = A[j + 1] - 1 + i_b = A[j + 1] + if i_b == 0: + argmax_i = 0 + elif Q[i_a, j] >= Q[i_b, j]: + argmax_i = i_a + else: + argmax_i = i_b + A[j] = argmax_i + return A + + +def viterbi_decode( + attn_lprob: Tensor, text_lengths: Tensor, feat_lengths: Tensor +) -> Tensor: + """Extract duration from an attention probability matrix + + Args: + attn_lprob (Tensor): Batched log probability of attention + matrix (B, T_feat, T_text). + text_lengths (Tensor): Text length tensor (B,). + feat_lengths (Tensor): Feature length tensor (B,). + + Returns: + Tensor: Batched token duration extracted from `attn_lprob` (B, T_text). + Tensor: Binarization loss tensor (). 
+ + """ + B = attn_lprob.size(0) + T_text = attn_lprob.size(2) + device = attn_lprob.device + + durations = torch.zeros((B, T_text), device=device, dtype=torch.long) + for b in range(B): + assert feat_lengths[b] > 0 + assert text_lengths[b] > 0 + cur_log_p_attn = attn_lprob[b, : feat_lengths[b], : text_lengths[b]] + viterbi = _monotonic_alignment_search( + cur_log_p_attn.float().detach().cpu().numpy() + ) + _durations = np.bincount(viterbi) + durations[b, : len(_durations)] = torch.from_numpy(_durations).to(device) + + return durations + + +class UnitY2AlignmentModel(Module): + alignment_encoder: UnitY2AlignmentEncoder + alignment_frontend: UnitY2AlignmentFrontend + + def __init__( + self, + alignment_frontend: UnitY2AlignmentFrontend, + alignment_encoder: UnitY2AlignmentEncoder, + ): + super().__init__() + self.alignment_frontend = alignment_frontend + self.alignment_encoder = alignment_encoder + + def forward(self, input_text: Tensor, input_unit: Tensor) -> Tuple[Tensor, Tensor]: + assert input_text.ndim == 2 + assert input_unit.ndim == 2 + embs_text, embs_unit = self.alignment_frontend(input_text, input_unit) + attn_lprob, attn_hard_dur = self.alignment_encoder( + embs_text, + embs_unit, + torch.tensor([embs_text.size(1)]).to(embs_text).int(), + torch.tensor([embs_unit.size(1)]).to(embs_unit).int(), + ) + + return attn_lprob, attn_hard_dur diff --git a/seamless_communication/src/seamless_communication/models/conformer_shaw/__init__.py b/seamless_communication/src/seamless_communication/models/conformer_shaw/__init__.py new file mode 100644 index 0000000..b0a36f7 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/conformer_shaw/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from .builder import ( + ConformerShawEncoderBuilder as ConformerShawEncoderBuilder, +) +from .builder import ( + ConformerShawEncoderConfig as ConformerShawEncoderConfig, +) +from .builder import ( + conformer_shaw_archs as conformer_shaw_archs, +) +from .builder import ( + create_conformer_shaw_model as create_conformer_shaw_model, +) +from .loader import ( + load_conformer_shaw_model as load_conformer_shaw_model, +) diff --git a/seamless_communication/src/seamless_communication/models/conformer_shaw/builder.py b/seamless_communication/src/seamless_communication/models/conformer_shaw/builder.py new file mode 100644 index 0000000..15747cd --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/conformer_shaw/builder.py @@ -0,0 +1,182 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
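The duration extraction in aligner/model.py above is self-contained; a toy sketch of viterbi_decode on random log-probabilities, with shapes as described in its docstring:

import torch
from seamless_communication.models.aligner.model import viterbi_decode

# (B, T_feat, T_text) log-probabilities of each frame attending to each text token.
attn_lprob = torch.randn(1, 10, 4).log_softmax(dim=-1)
durations = viterbi_decode(attn_lprob, text_lengths=torch.tensor([4]), feat_lengths=torch.tensor([10]))
print(durations, durations.sum().item())  # per-token durations; they sum to T_feat == 10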
+ +from dataclasses import asdict, dataclass +from typing import Optional + +from fairseq2.models.conformer import ConformerConvolution +from fairseq2.models.utils.arch_registry import ArchitectureRegistry +from fairseq2.models.w2vbert import w2vbert_archs +from fairseq2.models.wav2vec2.builder import ( + Wav2Vec2Builder, + Wav2Vec2Config, + Wav2Vec2EncoderBuilder, + Wav2Vec2EncoderConfig, + wav2vec2_arch, +) +from fairseq2.models.wav2vec2.model import Wav2Vec2Model +from fairseq2.nn.transformer import SDPA, ShawRelativePositionSDPA, create_default_sdpa +from fairseq2.typing import DataType, Device + + +@dataclass +class ShawRelativePositionSDPAConfig: + """Holds the configuration of the :class:ShawRelativePositionSDPA module.""" + + max_left_rel_pos: int + """The left clipping value for relative positions.""" + + max_right_rel_pos: Optional[int] + """The right clipping value for relative positions.""" + + use_rel_pos_values: bool = False + """If True, also uses relative position values to compute relative attention.""" + + +@dataclass +class ConformerShawEncoderConfig(Wav2Vec2EncoderConfig): + """Holds the configuration of a conformer shaw encoder.""" + + shaw_rel_pos_sdpa_config: Optional[ShawRelativePositionSDPAConfig] + """The parameters for ShawRelativePositionSDPA.""" + + +conformer_shaw_archs = ArchitectureRegistry[ConformerShawEncoderConfig]( + "conformer_shaw" +) + +conformer_shaw_arch = conformer_shaw_archs.decorator + + +@conformer_shaw_arch("600m") +def _conformer_shaw_600m_encoder() -> ConformerShawEncoderConfig: + w2vbert_config = w2vbert_archs.get_config("600m") + w2v2_encoder_config = w2vbert_config.w2v2_config.encoder_config + sdpa_config = ShawRelativePositionSDPAConfig( + max_left_rel_pos=64, + max_right_rel_pos=8, + use_rel_pos_values=False, + ) + conformer_shaw_encoder_config = ConformerShawEncoderConfig( + **asdict(w2v2_encoder_config), + shaw_rel_pos_sdpa_config=sdpa_config, + ) + conformer_shaw_encoder_config.pos_encoder_type = "shaw_relative" + return conformer_shaw_encoder_config + + +@wav2vec2_arch("conformer_shaw_600m") +def _conformer_shaw_600m() -> Wav2Vec2Config: + encoder_config = _conformer_shaw_600m_encoder() + + return Wav2Vec2Config( + encoder_config, + final_dim=768, + final_proj_bias=True, + temporal_mask_span_len=10, + max_temporal_mask_prob=0.65, + spatial_mask_span_len=10, + max_spatial_mask_prob=0.0, + quantized_dim=768, + num_codebooks=2, + num_codebook_entries=320, + codebook_sampling_temperature=(2.0, 0.1, 0.999995), + num_distractors=100, + logit_temp=0.1, + diversity_loss_weight=0.2, + ) + + +class ConformerShawEncoderBuilder(Wav2Vec2EncoderBuilder): + """ + Builds modules of a `ConformerShawEncoderBuilder`. + + This is a Conformer architecture with these differences: + - ShawRelativePositionSDPA as the SDPA. + - ConformerConvolution with causal depthwise convolution + and norm_type "layer_norm". + """ + + config: ConformerShawEncoderConfig + + def __init__( + self, + config: ConformerShawEncoderConfig, + *, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + super().__init__(config, device=device, dtype=dtype) + + assert self.config.use_conformer, "This architecture only supports a Conformer." + assert ( + self.config.pos_encoder_type == "shaw_relative" + ), "This architecture only supports ShawRelativePositionSDPA." 
+ + def build_sdpa(self) -> SDPA: + if self.config.shaw_rel_pos_sdpa_config is None: + raise ValueError( + "`shaw_rel_pos_sdpa_config` must be specified when `pos_encoder_type` is 'shaw_relative'." + ) + + sdpa = create_default_sdpa(attn_dropout_p=self.config.attn_dropout_p) + + sdpa_config = self.config.shaw_rel_pos_sdpa_config + + return ShawRelativePositionSDPA( + self.config.model_dim, + self.config.num_encoder_attn_heads, + sdpa_config.max_left_rel_pos, + max_right_rel_pos=sdpa_config.max_right_rel_pos, + use_rel_pos_values=sdpa_config.use_rel_pos_values, + inner_sdpa=sdpa, + device=self.device, + dtype=self.dtype, + ) + + def build_conformer_conv(self) -> ConformerConvolution: + return ConformerConvolution( + self.config.model_dim, + self.config.depthwise_conv_kernel_size, + causal_depthwise_conv=True, + norm_type="layer_norm", + device=self.device, + dtype=self.dtype, + ) + + +def create_conformer_shaw_model( + config: Wav2Vec2Config, + *, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, +) -> Wav2Vec2Model: + """Create a conformer shaw model. + + :param config: + The configuration. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + assert isinstance(config.encoder_config, ConformerShawEncoderConfig) + + encoder_builder = ConformerShawEncoderBuilder( + config.encoder_config, device=device, dtype=dtype + ) + + builder = Wav2Vec2Builder(config, encoder_builder, device=device, dtype=dtype) + + return builder.build_model() diff --git a/seamless_communication/src/seamless_communication/models/conformer_shaw/loader.py b/seamless_communication/src/seamless_communication/models/conformer_shaw/loader.py new file mode 100644 index 0000000..241726a --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/conformer_shaw/loader.py @@ -0,0 +1,82 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Any, Mapping + +import torch + +from fairseq2.assets import asset_store, download_manager +from fairseq2.models.utils import ModelLoader +from fairseq2.models.utils.checkpoint import convert_fairseq_checkpoint +from fairseq2.models.wav2vec2.builder import Wav2Vec2Config +from fairseq2.models.wav2vec2.loader import load_wav2vec2_config +from fairseq2.models.wav2vec2.model import Wav2Vec2Model + +from .builder import ( + create_conformer_shaw_model, +) + + +def convert_conformer_shaw_checkpoint( + checkpoint: Mapping[str, Any], config: Wav2Vec2Config +) -> Mapping[str, Any]: + """Convert a fairseq conformer shaw checkpoint to fairseq2.""" + state_dict = checkpoint["model"] + + # Check if we have a fairseq2 checkpoint. 
+ if "final_target_proj.weight" in state_dict: + return checkpoint + + for key in ( + "mlm_proj.weight", + "mlm_proj.bias", + "encoder.layer_norm.weight", + "encoder.layer_norm.bias", + ): + if key in state_dict: + del state_dict[key] + + state_dict["quantizer.num_updates"] = torch.zeros((), device="cpu") + + key_map = { + # fmt: off + r"^encoder\.layers\.([0-9]+)\.self_attn\.out_proj\.": r"encoder.layers.\1.self_attn.output_proj.", + r"^encoder\.layers\.([0-9]+)\.self_attn\.rel_k_embedding\.": r"encoder.layers.\1.self_attn.sdpa.rel_k_embed.", + r"^encoder\.layers\.([0-9]+)\.conv_module\.depthwise_conv\.": r"encoder.layers.\1.conv.depthwise_conv.", + r"^encoder\.layers\.([0-9]+)\.conv_module\.layer_norm\.": r"encoder.layers.\1.conv_layer_norm.", + r"^encoder\.layers\.([0-9]+)\.conv_module\.layer_norm2\.": r"encoder.layers.\1.conv.layer_norm.", + r"^encoder\.layers\.([0-9]+)\.conv_module\.pointwise_conv1\.": r"encoder.layers.\1.conv.pointwise_conv1.", + r"^encoder\.layers\.([0-9]+)\.conv_module\.pointwise_conv2\.": r"encoder.layers.\1.conv.pointwise_conv2.", + r"^encoder\.layers\.([0-9]+)\.fc1\.": r"encoder.layers.\1.ffn.inner_proj.", + r"^encoder\.layers\.([0-9]+)\.fc2\.": r"encoder.layers.\1.ffn.output_proj.", + r"^encoder\.layers\.([0-9]+)\.ffn(1|2)\.layer_norm\.": r"encoder.layers.\1.ffn\2_layer_norm.", + r"^encoder\.layers\.([0-9]+)\.ffn(1|2)\.w_1\.": r"encoder.layers.\1.ffn\2.inner_proj.", + r"^encoder\.layers\.([0-9]+)\.ffn(1|2)\.w_2\.": r"encoder.layers.\1.ffn\2.output_proj.", + r"^encoder\.layers\.([0-9]+)\.final_layer_norm\.": r"encoder.layers.\1.layer_norm.", + r"^encoder\.embed_tokens\.": r"encoder_frontend.embed.", + r"^encoder\.pos_conv\.0\.": r"encoder_frontend.pos_encoder.conv.", + r"^feature_extractor\.conv_layers\.([0-9]+)\.0\.": r"encoder_frontend.feature_extractor.layers.\1.conv.", + r"^feature_extractor\.conv_layers\.([0-9]+)\.2\.1\.": r"encoder_frontend.feature_extractor.layers.\1.layer_norm.", + r"^feature_extractor\.conv_layers\.0\.2\.": r"encoder_frontend.feature_extractor.layers.0.group_norm.", + r"^layer_norm\.": r"encoder_frontend.post_extract_layer_norm.", + r"^post_extract_proj\.": r"encoder_frontend.model_dim_proj.", + r"^mask_emb": r"masker.temporal_mask_embed", + r"^quantizer\.vars": r"quantizer.entries", + r"^quantizer\.weight_proj\.": r"quantizer.entry_proj.", + r"^project_q\.": r"final_target_proj.", + # fmt: on + } + + return convert_fairseq_checkpoint(checkpoint, key_map) + + +load_conformer_shaw_model = ModelLoader[Wav2Vec2Model, Wav2Vec2Config]( + asset_store, + download_manager, + load_wav2vec2_config, + create_conformer_shaw_model, + convert_conformer_shaw_checkpoint, +) diff --git a/seamless_communication/src/seamless_communication/models/generator/__init__.py b/seamless_communication/src/seamless_communication/models/generator/__init__.py new file mode 100644 index 0000000..5d20256 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/generator/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
diff --git a/seamless_communication/src/seamless_communication/models/generator/builder.py b/seamless_communication/src/seamless_communication/models/generator/builder.py new file mode 100644 index 0000000..b3eb622 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/generator/builder.py @@ -0,0 +1,506 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass +from typing import Any, Dict, List, Literal, Optional, Tuple + +from fairseq2.data import VocabularyInfo +from fairseq2.models.utils.arch_registry import ArchitectureRegistry +from fairseq2.nn.embedding import StandardEmbedding, init_scaled_embedding +from fairseq2.nn.position_encoder import SinusoidalPositionEncoder +from fairseq2.nn.projection import Linear +from fairseq2.nn.transformer import ( + MultiheadAttention, + StandardMultiheadAttention, + TransformerNormOrder, + create_default_sdpa, +) +from fairseq2.typing import DataType, Device +from torch.nn import Conv1d + +from seamless_communication.models.generator.ecapa_tdnn_builder import ( + EcapaTDNNBuilder, + EcapaTDNNConfig, + ecapa_tdnn_archs, +) +from seamless_communication.models.generator.vocoder import ( + PretsselDecoderFrontend, + PretsselEncoderFrontend, + PretsselVocoder, +) +from seamless_communication.models.unity.fft_decoder import FeedForwardTransformer +from seamless_communication.models.unity.fft_decoder_layer import ( + Conv1dBlock, + FeedForwardTransformerLayer, +) +from seamless_communication.models.unity.length_regulator import ( + VarianceAdaptor, + VariancePredictor, +) +from seamless_communication.models.unity.t2u_builder import VariancePredictorConfig + + +@dataclass +class PretsselEncoderFrontendConfig: + prosody_encoder_config: EcapaTDNNConfig + dropout: float + lang_embed_dim: Optional[int] = None + + +@dataclass +class FFTLayerConfig: + attention_heads: int + hidden_dim: int + kernel_size: int + dropout: float + conv1d_dropout: float + film_cond_dim: int + use_film: bool = False + + +@dataclass +class PretsselDecoderFrontendConfig: + upsampling_type: Literal["gaussian", "hard"] + variance_predictor_config: VariancePredictorConfig + add_variance_parallel: bool + + +@dataclass +class VocoderConfig: + """Holds the configuration of a Vocoder model.""" + + encoder_frontend_config: PretsselEncoderFrontendConfig + fft_layer_config: FFTLayerConfig + decoder_frontend_config: PretsselDecoderFrontendConfig + pn_conv_dim: int + pn_layers: int + pn_conv_kernel_size: int + pn_dropout: float + vocab_info: VocabularyInfo + model_dim: int + max_seq_len: int + encoder_layers: int + decoder_layers: int + mel_dim: int + langs: List # type: ignore[type-arg] + upsample_rates: List[int] + upsample_kernel_sizes: List[int] + upsample_initial_channel: int + resblock_kernel_sizes: List[int] + resblock_dilation_sizes: List[List[int]] + channels: int + dimension: int + n_filters: int + ratios: List[int] + norm: Literal["none", "weight_norm", "spectral_norm", "time_group_norm"] + norm_params: Dict[str, Any] + kernel_size: int + last_kernel_size: int + residual_kernel_size: int + causal: bool + pad_mode: str + true_skip: bool + compress: int + lstm: int + disable_norm_outer_blocks: int + trim_right_ratio: float + gcmvn_stats: Dict[str, List] # type: ignore[type-arg] + + +vocoder_archs = ArchitectureRegistry[VocoderConfig]("vocoder_pretssel") + + +vocoder_arch = 
vocoder_archs.decorator + + +def pretssel_config() -> ( + Tuple[PretsselEncoderFrontendConfig, FFTLayerConfig, PretsselDecoderFrontendConfig] +): + prosody_encoder_config = ecapa_tdnn_archs.get_config("base") + + encoder_frontend_config = PretsselEncoderFrontendConfig( + prosody_encoder_config=prosody_encoder_config, + dropout=0.2, + lang_embed_dim=64, + ) + + fft_layer_config = FFTLayerConfig( + attention_heads=2, + hidden_dim=1024, + kernel_size=9, + dropout=0.0, + conv1d_dropout=0.2, + use_film=True, + film_cond_dim=576, + ) + + variance_predictor_config = VariancePredictorConfig( + var_pred_hidden_dim=512, + var_pred_kernel_size=5, + var_pred_dropout=0.5, + use_film=True, + film_cond_dim=576, + ) + + decoder_frontend_config = PretsselDecoderFrontendConfig( + upsampling_type="gaussian", + variance_predictor_config=variance_predictor_config, + add_variance_parallel=True, + ) + return ( + encoder_frontend_config, + fft_layer_config, + decoder_frontend_config, + ) + + +@vocoder_arch("16khz") +def _16khz_vocoder() -> VocoderConfig: + ( + encoder_frontend_config, + fft_layer_config, + decoder_frontend_config, + ) = pretssel_config() + + return VocoderConfig( + encoder_frontend_config=encoder_frontend_config, + fft_layer_config=fft_layer_config, + decoder_frontend_config=decoder_frontend_config, + pn_conv_dim=512, + pn_layers=5, + pn_conv_kernel_size=5, + pn_dropout=0.5, + vocab_info=VocabularyInfo( + size=10004, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1 + ), + model_dim=256, + max_seq_len=10000, + encoder_layers=4, + decoder_layers=4, + mel_dim=80, + langs=[], + upsample_rates=[5, 4, 4, 2], + upsample_kernel_sizes=[10, 8, 8, 4], + upsample_initial_channel=512, + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + channels=1, + dimension=128, + n_filters=32, + ratios=[8, 5, 4, 2], + norm="weight_norm", + norm_params={}, + kernel_size=7, + last_kernel_size=7, + residual_kernel_size=3, + causal=False, + pad_mode="constant", + true_skip=True, + compress=2, + lstm=2, + disable_norm_outer_blocks=0, + trim_right_ratio=1.0, + gcmvn_stats={}, + ) + + +@vocoder_arch("24khz") +def _24khz_vocoder() -> VocoderConfig: + ( + encoder_frontend_config, + fft_layer_config, + decoder_frontend_config, + ) = pretssel_config() + + return VocoderConfig( + encoder_frontend_config=encoder_frontend_config, + fft_layer_config=fft_layer_config, + decoder_frontend_config=decoder_frontend_config, + pn_conv_dim=512, + pn_layers=5, + pn_conv_kernel_size=5, + pn_dropout=0.5, + vocab_info=VocabularyInfo( + size=10004, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1 + ), + model_dim=256, + max_seq_len=10000, + encoder_layers=4, + decoder_layers=4, + mel_dim=80, + langs=[], + upsample_rates=[5, 4, 4, 3], + upsample_kernel_sizes=[10, 8, 8, 6], + upsample_initial_channel=512, + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + channels=1, + dimension=128, + n_filters=32, + ratios=[8, 5, 4, 2], + norm="weight_norm", + norm_params={}, + kernel_size=7, + last_kernel_size=7, + residual_kernel_size=3, + causal=False, + pad_mode="constant", + true_skip=True, + compress=2, + lstm=2, + disable_norm_outer_blocks=0, + trim_right_ratio=1.0, + gcmvn_stats={}, + ) + + +class PretsselVocoderBuilder: + config: VocoderConfig + prosody_encoder_builder: EcapaTDNNBuilder + device: Optional[Device] = None + dtype: Optional[DataType] = None + + def __init__( + self, + config: VocoderConfig, + prosody_encoder_builder: EcapaTDNNBuilder, + *, + device: 
Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + self.config = config + self.prosody_encoder_builder = prosody_encoder_builder + self.device, self.dtype = device, dtype + + def build_embed_tokens(self) -> StandardEmbedding: + """Build a unit embedding table.""" + + return StandardEmbedding( + num_embeddings=self.config.vocab_info.size, + embedding_dim=self.config.model_dim, + init_fn=init_scaled_embedding, + device=self.device, + dtype=self.dtype, + ) + + def build_fft(self, num_layers: int) -> FeedForwardTransformer: + """Build a Transformer encoder.""" + + layers = [self.build_fft_layer() for _ in range(num_layers)] + + return FeedForwardTransformer( + layers, + norm_order=TransformerNormOrder.POST, + device=self.device, + dtype=self.dtype, + ) + + def build_fft_layer(self) -> FeedForwardTransformerLayer: + """Build a Transformer decoder layer.""" + + self_attn = self.build_attention(self.config.fft_layer_config.attention_heads) + + conv1d = Conv1dBlock( + self.config.model_dim, + self.config.fft_layer_config.hidden_dim, + self.config.fft_layer_config.kernel_size, + bias=True, + device=self.device, + dtype=self.dtype, + ) + + return FeedForwardTransformerLayer( + self_attn, + conv1d, + dropout_p=0.0, # fairseq1 doesn't have this + conv1d_dropout_p=self.config.fft_layer_config.conv1d_dropout, + use_film=self.config.fft_layer_config.use_film, + film_cond_dim=self.config.fft_layer_config.film_cond_dim, + device=self.device, + dtype=self.dtype, + ) + + def build_attention(self, num_heads: int) -> MultiheadAttention: + """Build a Transformer multi-head attention layer.""" + + sdpa = create_default_sdpa(attn_dropout_p=self.config.fft_layer_config.dropout) + + return StandardMultiheadAttention( + self.config.model_dim, + num_heads, + sdpa=sdpa, + device=self.device, + dtype=self.dtype, + ) + + def build_variance_adaptor( + self, + decoder_frontend_config: PretsselDecoderFrontendConfig, + ) -> VarianceAdaptor: + """Build a variance adaptor module.""" + + variance_predictor_config = decoder_frontend_config.variance_predictor_config + + pitch_predictor = VariancePredictor( + self.config.model_dim, + variance_predictor_config.var_pred_hidden_dim, + variance_predictor_config.var_pred_kernel_size, + variance_predictor_config.var_pred_dropout, + use_film=variance_predictor_config.use_film, + film_cond_dim=variance_predictor_config.film_cond_dim, + device=self.device, + dtype=self.dtype, + ) + + embed_pitch = Conv1d(1, self.config.model_dim, kernel_size=1) + + vuv_predictor = VariancePredictor( + self.config.model_dim, + variance_predictor_config.var_pred_hidden_dim, + variance_predictor_config.var_pred_kernel_size, + variance_predictor_config.var_pred_dropout, + use_film=variance_predictor_config.use_film, + film_cond_dim=variance_predictor_config.film_cond_dim, + device=self.device, + dtype=self.dtype, + ) + + energy_predictor = VariancePredictor( + self.config.model_dim, + variance_predictor_config.var_pred_hidden_dim, + variance_predictor_config.var_pred_kernel_size, + variance_predictor_config.var_pred_dropout, + use_film=variance_predictor_config.use_film, + film_cond_dim=variance_predictor_config.film_cond_dim, + device=self.device, + dtype=self.dtype, + ) + + embed_energy = Conv1d(1, self.config.model_dim, kernel_size=1) + + variance_adaptor = VarianceAdaptor( + duration_predictor=None, 
+ pitch_predictor=pitch_predictor, + embed_pitch=embed_pitch, + vuv_predictor=vuv_predictor, + energy_predictor=energy_predictor, + embed_energy=embed_energy, + add_variance_parallel=decoder_frontend_config.add_variance_parallel, + upsampling_type=decoder_frontend_config.upsampling_type, + ) + + return variance_adaptor + + def build_model(self) -> PretsselVocoder: + """build the pretssel vocoder.""" + prosody_encoder = self.prosody_encoder_builder.build_model() + embed_tokens = self.build_embed_tokens() + + embed_positions = SinusoidalPositionEncoder( + self.config.model_dim, + self.config.max_seq_len, + _legacy_pad_idx=self.config.vocab_info.pad_idx, + device=self.device, + ) + lang_to_index = {l: i for i, l in enumerate(self.config.langs)} + encoder_frontend = PretsselEncoderFrontend( + prosody_encoder, + embed_tokens, + embed_positions, + lang_to_index, + lang_embed_dim=self.config.encoder_frontend_config.lang_embed_dim, + dropout_p=self.config.encoder_frontend_config.dropout, + device=self.device, + dtype=self.dtype, + ) + + encoder = self.build_fft(self.config.encoder_layers) + + variance_adaptor = self.build_variance_adaptor( + self.config.decoder_frontend_config + ) + + decoder_frontend = PretsselDecoderFrontend( + variance_adaptor, + embed_positions, + device=self.device, + dtype=self.dtype, + ) + + decoder = self.build_fft(self.config.decoder_layers) + + final_proj = Linear( + self.config.model_dim, + self.config.mel_dim, + bias=True, + device=self.device, + dtype=self.dtype, + ) + + gcmvn_mean = gcmvn_std = None + if self.config.gcmvn_stats is not None: + gcmvn_mean = self.config.gcmvn_stats["mean"] + gcmvn_std = self.config.gcmvn_stats["std"] + + vocoder = PretsselVocoder( + encoder_frontend=encoder_frontend, + encoder=encoder, + decoder_frontend=decoder_frontend, + decoder=decoder, + final_proj=final_proj, + pn_n_channels=self.config.pn_conv_dim, + pn_kernel_size=self.config.pn_conv_kernel_size, + pn_layers=self.config.pn_layers, + pn_dropout=self.config.pn_dropout, + upsample_rates=self.config.upsample_rates, + upsample_kernel_sizes=self.config.upsample_kernel_sizes, + upsample_initial_channel=self.config.upsample_initial_channel, + resblock_kernel_sizes=self.config.resblock_kernel_sizes, + resblock_dilation_sizes=self.config.resblock_dilation_sizes, + channels=self.config.channels, + dimension=self.config.dimension, + n_filters=self.config.n_filters, + ratios=self.config.ratios, + norm=self.config.norm, + norm_params=self.config.norm_params, + kernel_size=self.config.kernel_size, + last_kernel_size=self.config.last_kernel_size, + residual_kernel_size=self.config.residual_kernel_size, + causal=self.config.causal, + pad_mode=self.config.pad_mode, + true_skip=self.config.true_skip, + compress=self.config.compress, + lstm=self.config.lstm, + disable_norm_outer_blocks=self.config.disable_norm_outer_blocks, + trim_right_ratio=self.config.trim_right_ratio, + gcmvn_mean=gcmvn_mean, + gcmvn_std=gcmvn_std, + ) + vocoder.to(dtype=self.dtype, device=self.device) + return vocoder + + +def create_vocoder_model( + config: VocoderConfig, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, +) -> PretsselVocoder: + prosody_encoder_builder = EcapaTDNNBuilder( + config.encoder_frontend_config.prosody_encoder_config, + device=device, + dtype=dtype, + ) + return PretsselVocoderBuilder( + config, prosody_encoder_builder, device=device, dtype=dtype + ).build_model() diff --git a/seamless_communication/src/seamless_communication/models/generator/ecapa_tdnn.py 
b/seamless_communication/src/seamless_communication/models/generator/ecapa_tdnn.py new file mode 100644 index 0000000..fa464f5 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/generator/ecapa_tdnn.py @@ -0,0 +1,474 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import List, Optional, Tuple + +import torch +import torch.nn.functional as F +from fairseq2.nn.padding import PaddingMask, to_padding_mask +from torch import Tensor +from torch.nn import Conv1d, LayerNorm, Module, ModuleList, ReLU, Sigmoid, Tanh, init + + +class ECAPA_TDNN(Module): + """ + Represents the ECAPA-TDNN model described in paper: + :cite:t`https://doi.org/10.48550/arxiv.2005.07143`. + + Arguments + --------- + :param channels: + Output channels for TDNN/SERes2Net layer. + :param kernel_sizes: + List of kernel sizes for each layer. + :param dilations: + List of dilations for kernels in each layer. + :param groups: + List of groups for kernels in each layer. + """ + + def __init__( + self, + channels: List[int], + kernel_sizes: List[int], + dilations: List[int], + attention_channels: int, + res2net_scale: int, + se_channels: int, + global_context: bool, + groups: List[int], + embed_dim: int, + input_dim: int, + ): + super().__init__() + assert len(channels) == len(kernel_sizes) == len(dilations) + self.channels = channels + self.embed_dim = embed_dim + self.blocks = ModuleList() + + self.blocks.append( + TDNNBlock( + input_dim, + channels[0], + kernel_sizes[0], + dilations[0], + groups[0], + ) + ) + + # SE-Res2Net layers + for i in range(1, len(channels) - 1): + self.blocks.append( + SERes2NetBlock( + channels[i - 1], + channels[i], + res2net_scale=res2net_scale, + se_channels=se_channels, + kernel_size=kernel_sizes[i], + dilation=dilations[i], + groups=groups[i], + ) + ) + + # Multi-layer feature aggregation + self.mfa = TDNNBlock( + channels[-1], + channels[-1], + kernel_sizes[-1], + dilations[-1], + groups=groups[-1], + ) + + # Attentive Statistical Pooling + self.asp = AttentiveStatisticsPooling( + channels[-1], + attention_channels=attention_channels, + global_context=global_context, + ) + self.asp_norm = LayerNorm(channels[-1] * 2, eps=1e-12) + + # Final linear transformation + self.fc = Conv1d( + in_channels=channels[-1] * 2, + out_channels=embed_dim, + kernel_size=1, + ) + + self.reset_parameters() + + def reset_parameters(self) -> None: + """Reset the parameters and buffers of the module.""" + + def encoder_init(m: Module) -> None: + if isinstance(m, Conv1d): + init.xavier_uniform_(m.weight, init.calculate_gain("relu")) + + self.apply(encoder_init) + + def forward( + self, + x: Tensor, + padding_mask: Optional[PaddingMask] = None, + ) -> Tensor: + """Returns the embedding vector. + + Arguments + --------- + x : torch.Tensor + Tensor of shape (batch, time, channel). 
+ """ + # Minimize transpose for efficiency + x = x.transpose(1, 2) + + xl = [] + for layer in self.blocks: + x = layer(x, padding_mask=padding_mask) + xl.append(x) + + # Multi-layer feature aggregation + x = torch.cat(xl[1:], dim=1) + x = self.mfa(x) + + # Attentive Statistical Pooling + x = self.asp(x, padding_mask=padding_mask) + x = self.asp_norm(x.transpose(1, 2)).transpose(1, 2) + + # Final linear transformation + x = self.fc(x) + + x = x.transpose(1, 2).squeeze(1) # B x C + return F.normalize(x, dim=-1) + + +class TDNNBlock(Module): + """An implementation of TDNN. + + Arguments + ---------- + :param in_channels : int + Number of input channels. + :param out_channels : int + The number of output channels. + :param kernel_size : int + The kernel size of the TDNN blocks. + :param dilation : int + The dilation of the TDNN block. + :param groups: int + The groups size of the TDNN blocks. + + Example + ------- + >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2) + >>> layer = TDNNBlock(64, 64, kernel_size=3, dilation=1) + >>> out_tensor = layer(inp_tensor).transpose(1, 2) + >>> out_tensor.shape + torch.Size([8, 120, 64]) + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + dilation: int, + groups: int = 1, + ): + super().__init__() + self.conv = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + dilation=dilation, + padding=dilation * (kernel_size - 1) // 2, + groups=groups, + ) + self.activation = ReLU() + self.norm = LayerNorm(out_channels, eps=1e-12) + + def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor: + """Processes the input tensor x and returns an output tensor.""" + x = self.activation(self.conv(x)) + + return self.norm(x.transpose(1, 2)).transpose(1, 2) # type: ignore[no-any-return] + + +class Res2NetBlock(Module): + """An implementation of Res2NetBlock w/ dilation. + + Arguments + --------- + :param in_channels : int + The number of channels expected in the input. + :param out_channels : int + The number of output channels. + :param scale : int + The scale of the Res2Net block. + :param kernel_size: int + The kernel size of the Res2Net block. + :param dilation : int + The dilation of the Res2Net block. + + Example + ------- + >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2) + >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3) + >>> out_tensor = layer(inp_tensor).transpose(1, 2) + >>> out_tensor.shape + torch.Size([8, 120, 64]) + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + scale: int = 8, + kernel_size: int = 3, + dilation: int = 1, + ): + super().__init__() + assert in_channels % scale == 0 + assert out_channels % scale == 0 + + in_channel = in_channels // scale + hidden_channel = out_channels // scale + self.blocks = ModuleList( + [ + TDNNBlock( + in_channel, + hidden_channel, + kernel_size=kernel_size, + dilation=dilation, + ) + for i in range(scale - 1) + ] + ) + self.scale = scale + + def forward(self, x: Tensor) -> Tensor: + """Processes the input tensor x and returns an output tensor.""" + y = [] + for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)): + if i == 0: + y_i = x_i + elif i == 1: + y_i = self.blocks[i - 1](x_i) + else: + y_i = self.blocks[i - 1](x_i + y_i) + y.append(y_i) + + y_tensor = torch.cat(y, dim=1) + return y_tensor + + +class SEBlock(Module): + """An implementation of squeeze-and-excitation block. + + Arguments + --------- + in_channels : int + The number of input channels. 
+ se_channels : int + The number of output channels after squeeze. + out_channels : int + The number of output channels. + """ + + def __init__( + self, + in_channels: int, + se_channels: int, + out_channels: int, + ): + super().__init__() + + self.conv1 = Conv1d( + in_channels=in_channels, out_channels=se_channels, kernel_size=1 + ) + self.relu = ReLU(inplace=True) + self.conv2 = Conv1d( + in_channels=se_channels, out_channels=out_channels, kernel_size=1 + ) + self.sigmoid = Sigmoid() + + def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor: + """Processes the input tensor x and returns an output tensor.""" + if padding_mask is not None: + mask = padding_mask.materialize().unsqueeze(1) + s = (x * mask).sum(dim=2, keepdim=True) / padding_mask.seq_lens[ + :, None, None + ] + else: + s = x.mean(dim=2, keepdim=True) + + s = self.relu(self.conv1(s)) + s = self.sigmoid(self.conv2(s)) + + return s * x + + +class AttentiveStatisticsPooling(Module): + """This class implements an attentive statistic pooling layer for each channel. + It returns the concatenated mean and std of the input tensor. + + Arguments + --------- + channels: int + The number of input channels. + attention_channels: int + The number of attention channels. + """ + + def __init__( + self, channels: int, attention_channels: int = 128, global_context: bool = True + ): + super().__init__() + + self.eps = 1e-12 + self.global_context = global_context + if global_context: + self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1) + else: + self.tdnn = TDNNBlock(channels, attention_channels, 1, 1) + + self.tanh = Tanh() + self.conv = Conv1d( + in_channels=attention_channels, out_channels=channels, kernel_size=1 + ) + + def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor: + """Calculates mean and std for a batch (input tensor). + + Arguments + --------- + x : torch.Tensor + Tensor of shape [N, C, L]. + """ + L = x.shape[-1] + + def _compute_statistics( + x: Tensor, m: Tensor, dim: int = 2, eps: float = self.eps + ) -> Tuple[Tensor, Tensor]: + mean = (m * x).sum(dim) + std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)) + return mean, std + + # Make binary mask of shape [N, 1, L] + # mask = to_padding_mask(lengths, max(lengths)) + if padding_mask is not None: + mask = padding_mask.materialize() + else: + mask = to_padding_mask(torch.IntTensor([L]), L).repeat(x.shape[0], 1).to(x) + mask = mask.unsqueeze(1) + + # Expand the temporal context of the pooling layer by allowing the + # self-attention to look at global properties of the utterance. + if self.global_context: + # torch.std is unstable for backward computation + # https://github.com/pytorch/pytorch/issues/4320 + total = mask.sum(dim=2, keepdim=True).to(x) + mean, std = _compute_statistics(x, mask / total) + mean = mean.unsqueeze(2).repeat(1, 1, L) + std = std.unsqueeze(2).repeat(1, 1, L) + attn = torch.cat([x, mean, std], dim=1) + else: + attn = x + + # Apply layers + attn = self.conv(self.tanh(self.tdnn(attn))) + + # Filter out zero-paddings + attn = attn.masked_fill(mask == 0, float("-inf")) + + attn = F.softmax(attn, dim=2) + mean, std = _compute_statistics(x, attn) + # Append mean and std of the batch + pooled_stats = torch.cat((mean, std), dim=1) + pooled_stats = pooled_stats.unsqueeze(2) + + return pooled_stats + + +class SERes2NetBlock(Module): + """An implementation of building block in ECAPA-TDNN, i.e., + TDNN-Res2Net-TDNN-SEBlock. 
+ + Arguments + ---------- + out_channels: int + The number of output channels. + res2net_scale: int + The scale of the Res2Net block. + kernel_size: int + The kernel size of the TDNN blocks. + dilation: int + The dilation of the Res2Net block. + groups: int + Number of blocked connections from input channels to output channels. + + Example + ------- + >>> x = torch.rand(8, 120, 64).transpose(1, 2) + >>> conv = SERes2NetBlock(64, 64, res2net_scale=4) + >>> out = conv(x).transpose(1, 2) + >>> out.shape + torch.Size([8, 120, 64]) + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + res2net_scale: int = 8, + se_channels: int = 128, + kernel_size: int = 1, + dilation: int = 1, + groups: int = 1, + ): + super().__init__() + self.out_channels = out_channels + self.tdnn1 = TDNNBlock( + in_channels, + out_channels, + kernel_size=1, + dilation=1, + groups=groups, + ) + self.res2net_block = Res2NetBlock( + out_channels, + out_channels, + res2net_scale, + kernel_size, + dilation, + ) + self.tdnn2 = TDNNBlock( + out_channels, + out_channels, + kernel_size=1, + dilation=1, + groups=groups, + ) + self.se_block = SEBlock(out_channels, se_channels, out_channels) + + self.shortcut = None + if in_channels != out_channels: + self.shortcut = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + ) + + def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor: + """Processes the input tensor x and returns an output tensor.""" + residual = x + if self.shortcut: + residual = self.shortcut(x) + + x = self.tdnn1(x) + x = self.res2net_block(x) + x = self.tdnn2(x) + x = self.se_block(x, padding_mask=padding_mask) + + return x + residual diff --git a/seamless_communication/src/seamless_communication/models/generator/ecapa_tdnn_builder.py b/seamless_communication/src/seamless_communication/models/generator/ecapa_tdnn_builder.py new file mode 100644 index 0000000..1c02be0 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/generator/ecapa_tdnn_builder.py @@ -0,0 +1,112 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
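AttentiveStatisticsPooling above collapses a variable-length [batch, channels, time] feature map into per-channel weighted mean and standard deviation, with the weights produced by a small attention network. The statistics themselves amount to the following (toy tensors with arbitrary values; attn stands in for the learned, softmax-normalized attention weights):

import torch

x = torch.randn(2, 4, 10)                               # [batch, channels, time]
attn = torch.softmax(torch.randn(2, 4, 10), dim=2)      # stand-in for the attention weights

mean = (attn * x).sum(dim=2)                            # [batch, channels]
var = (attn * (x - mean.unsqueeze(2)).pow(2)).sum(dim=2)
std = var.clamp(min=1e-12).sqrt()

pooled = torch.cat((mean, std), dim=1)                  # [batch, 2 * channels]; the layer keeps a trailing time dim of 1
print(pooled.shape)                                     # torch.Size([2, 8])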
+ +from dataclasses import dataclass +from typing import List, Optional + +from fairseq2.models.utils.arch_registry import ArchitectureRegistry +from fairseq2.typing import DataType, Device + +from seamless_communication.models.generator.ecapa_tdnn import ECAPA_TDNN + + +@dataclass +class EcapaTDNNConfig: + channels: List[int] + kernel_sizes: List[int] + dilations: List[int] + attention_channels: int + res2net_scale: int + se_channels: int + global_context: bool + groups: List[int] + embed_dim: int + input_dim: int + + +ecapa_tdnn_archs = ArchitectureRegistry[EcapaTDNNConfig]("ecapa_tdnn") + +ecapa_tdnn_arch = ecapa_tdnn_archs.decorator + + +@ecapa_tdnn_arch("base") +def _base_ecapa_tdnn() -> EcapaTDNNConfig: + return EcapaTDNNConfig( + channels=[512, 512, 512, 512, 1536], + kernel_sizes=[5, 3, 3, 3, 1], + dilations=[1, 2, 3, 4, 1], + attention_channels=128, + res2net_scale=8, + se_channels=128, + global_context=True, + groups=[1, 1, 1, 1, 1], + embed_dim=512, + input_dim=80, + ) + + +class EcapaTDNNBuilder: + """ + Builder module for ECAPA_TDNN model + """ + + config: EcapaTDNNConfig + device: Optional[Device] + dtype: Optional[DataType] + + def __init__( + self, + config: EcapaTDNNConfig, + *, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param config: + The configuration to use. + :param devicev: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + self.config = config + + self.device, self.dtype = device, dtype + + def build_model(self) -> ECAPA_TDNN: + """Build a model.""" + model = ECAPA_TDNN( + self.config.channels, + self.config.kernel_sizes, + self.config.dilations, + self.config.attention_channels, + self.config.res2net_scale, + self.config.se_channels, + self.config.global_context, + self.config.groups, + self.config.embed_dim, + self.config.input_dim, + ) + model.to(device=self.device, dtype=self.dtype) + return model + + +def create_ecapa_tdnn_model( + config: EcapaTDNNConfig, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, +) -> ECAPA_TDNN: + """Create a ECAPA_TDNN model. + + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + + return EcapaTDNNBuilder(config, device=device, dtype=dtype).build_model() diff --git a/seamless_communication/src/seamless_communication/models/generator/loader.py b/seamless_communication/src/seamless_communication/models/generator/loader.py new file mode 100644 index 0000000..e718122 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/generator/loader.py @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
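For reference, this is roughly how the "base" ECAPA-TDNN registered above would be instantiated and applied. It assumes the seamless_communication package is importable; the dummy shapes simply follow the config (input_dim=80, embed_dim=512):

import torch

from seamless_communication.models.generator.ecapa_tdnn_builder import (
    create_ecapa_tdnn_model,
    ecapa_tdnn_archs,
)

config = ecapa_tdnn_archs.get_config("base")
model = create_ecapa_tdnn_model(config).eval()

feats = torch.randn(2, 200, config.input_dim)  # (batch, time, channel), e.g. 80-dim filterbank features
with torch.no_grad():
    embeddings = model(feats)                  # L2-normalized utterance embeddings

print(embeddings.shape)                        # torch.Size([2, 512])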
+ + +from typing import Any, Mapping + +from fairseq2.assets import asset_store, download_manager +from fairseq2.models.utils import ConfigLoader, ModelLoader + +from seamless_communication.models.generator.builder import ( + VocoderConfig, + create_vocoder_model, + vocoder_archs, +) +from seamless_communication.models.generator.vocoder import PretsselVocoder + +load_pretssel_vocoder_config = ConfigLoader[VocoderConfig](asset_store, vocoder_archs) + + +load_pretssel_vocoder_model = ModelLoader[PretsselVocoder, VocoderConfig]( + asset_store, + download_manager, + load_pretssel_vocoder_config, + create_vocoder_model, + restrict_checkpoints=False, +) diff --git a/seamless_communication/src/seamless_communication/models/generator/streamable.py b/seamless_communication/src/seamless_communication/models/generator/streamable.py new file mode 100644 index 0000000..a9df68b --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/generator/streamable.py @@ -0,0 +1,452 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import math +import warnings +from typing import Any, Dict, List, Literal, Optional, Tuple, TypeVar + +import torch +from fairseq2.typing import DataType, Device +from torch.nn import ( + ELU, + LSTM, + Conv1d, + ConvTranspose1d, + GroupNorm, + Identity, + Module, + Sequential, +) +from torch.nn import functional as F +from torch.nn.utils import spectral_norm, weight_norm # type: ignore[attr-defined] + +CONV_NORMALIZATIONS = frozenset( + ["none", "weight_norm", "spectral_norm", "time_group_norm"] +) + + +def apply_parametrization_norm( + module: Module, + norm: Literal["none", "weight_norm", "spectral_norm", "time_group_norm"] = "none", +) -> Module: + if norm == "weight_norm": + return weight_norm(module) + elif norm == "spectral_norm": + return spectral_norm(module) + else: + # We already check was in CONV_NORMALIZATION, so any other choice + # doesn't need reparametrization. + return module + + +def get_norm_module( # type: ignore[no-untyped-def] + module: Module, + causal: bool = False, + norm: Literal["none", "weight_norm", "spectral_norm", "time_group_norm"] = "none", + **norm_kwargs, +) -> Module: + """Return the proper normalization module. If causal is True, this will ensure the returned + module is causal, or return an error if the normalization doesn't support causal evaluation. + """ + assert norm in CONV_NORMALIZATIONS + if norm == "time_group_norm": + if causal: + raise ValueError("GroupNorm doesn't support causal evaluation.") + assert isinstance(module, torch.nn.modules.conv._ConvNd) + return GroupNorm(1, module.out_channels, **norm_kwargs) + else: + return Identity() + + +def get_extra_padding_for_conv1d( + x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0 +) -> int: + """See `pad_for_conv1d`.""" + length = x.shape[-1] + n_frames = (length - kernel_size + padding_total) / stride + 1 + ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total) + return ideal_length - length + + +def pad_for_conv1d( + x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0 +) -> torch.Tensor: + """Pad for a convolution to make sure that the last window is full. + Extra padding is added at the end. This is required to ensure that we can rebuild + an output of the same length, as otherwise, even with padding, some time steps + might get removed. 
+ For instance, with total padding = 4, kernel size = 4, stride = 2: + 0 0 1 2 3 4 5 0 0 # (0s are padding) + 1 2 3 # (output frames of a convolution, last 0 is never used) + 0 0 1 2 3 4 5 0 # (output of tr. conv., but pos. 5 is going to get removed as padding) + 1 2 3 4 # once you removed padding, we are missing one time step ! + """ + extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total) + return F.pad(x, (0, extra_padding)) # noqa + + +def pad1d( + x: torch.Tensor, + paddings: Tuple[int, int], + mode: str = "constant", + value: float = 0.0, +) -> torch.Tensor: + """Tiny wrapper around F.pad, just to allow for reflect padding on small input. + If this is the case, we insert extra 0 padding to the right before the reflection happen. + """ + length = x.shape[-1] + padding_left, padding_right = paddings + assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) + if mode == "reflect": + max_pad = max(padding_left, padding_right) + extra_pad = 0 + if length <= max_pad: + extra_pad = max_pad - length + 1 + x = F.pad(x, (0, extra_pad)) + padded = F.pad(x, paddings, mode, value) + end = padded.shape[-1] - extra_pad + return padded[..., :end] + else: + return F.pad(x, paddings, mode, value) + + +def unpad1d(x: torch.Tensor, paddings: Tuple[int, int]) -> torch.Tensor: + """Remove padding from x, handling properly zero padding. Only for 1d!""" + padding_left, padding_right = paddings + assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) + assert (padding_left + padding_right) <= x.shape[-1] + end = x.shape[-1] - padding_right + return x[..., padding_left:end] + + +class NormConv1d(Module): + """Wrapper around Conv1d and normalization applied to this conv + to provide a uniform interface across normalization approaches. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + groups: int = 1, + bias: bool = True, + causal: bool = False, + norm: Literal[ + "none", "weight_norm", "spectral_norm", "time_group_norm" + ] = "none", + norm_kwargs: Dict[str, Any] = {}, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + super().__init__() + self.conv: Module = apply_parametrization_norm( + Conv1d( + in_channels, + out_channels, + kernel_size, + stride, + dilation=dilation, + groups=groups, + bias=bias, + device=device, + dtype=dtype, + ), + norm, + ) + self.norm: Module = get_norm_module(self.conv, causal, norm, **norm_kwargs) + self.norm_type = norm + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x) + x = self.norm(x) + return x + + +class NormConvTranspose1d(Module): + """Wrapper around ConvTranspose1d and normalization applied to this conv + to provide a uniform interface across normalization approaches. 
+ """ + + def __init__( # type: ignore[no-untyped-def] + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + causal: bool = False, + norm: Literal[ + "none", "weight_norm", "spectral_norm", "time_group_norm" + ] = "none", + norm_kwargs: Dict[str, Any] = {}, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + super().__init__() + self.convtr = apply_parametrization_norm( + ConvTranspose1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + device=device, + dtype=dtype, + ), + norm, + ) + self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs) + self.norm_type = norm + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.convtr(x) + x = self.norm(x) + return x + + +class StreamableConv1d(Module): + """Conv1d with some builtin handling of asymmetric or causal padding + and normalization. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + groups: int = 1, + bias: bool = True, + causal: bool = False, + norm: Literal[ + "none", "weight_norm", "spectral_norm", "time_group_norm" + ] = "none", + norm_kwargs: Dict[str, Any] = {}, + pad_mode: str = "reflect", + activation: Optional[Module] = None, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + super().__init__() + # warn user on unusual setup between dilation and stride + if stride > 1 and dilation > 1: + warnings.warn( + "StreamableConv1d has been initialized with stride > 1 and dilation > 1" + f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})." + ) + self.activation = activation + self.conv = NormConv1d( + in_channels, + out_channels, + kernel_size, + stride, + dilation=dilation, + groups=groups, + bias=bias, + causal=causal, + norm=norm, + norm_kwargs=norm_kwargs, + device=device, + dtype=dtype, + ) + self.causal = causal + self.pad_mode = pad_mode + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.activation: + x = self.activation(x) + kernel_size: int = self.conv.conv.kernel_size[0] # type: ignore[index,assignment] + stride: int = self.conv.conv.stride[0] # type: ignore[index,assignment] + dilation = self.conv.conv.dilation[0] # type: ignore[index] + kernel_size = ( # type: ignore[assignment] + kernel_size - 1 + ) * dilation + 1 # effective kernel size with dilations + padding_total = kernel_size - stride + extra_padding = get_extra_padding_for_conv1d( + x, kernel_size, stride, padding_total + ) + if self.causal: + # Left padding for causal + x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode) + else: + # Asymmetric padding required for odd strides + padding_right = padding_total // 2 + padding_left = padding_total - padding_right + x = pad1d( + x, (padding_left, padding_right + extra_padding), mode=self.pad_mode + ) + return self.conv(x) # type: ignore[no-any-return] + + +class StreamableConvTranspose1d(Module): + """ConvTranspose1d with some builtin handling of asymmetric or causal padding + and normalization. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + causal: bool = False, + norm: Literal[ + "none", "weight_norm", "spectral_norm", "time_group_norm" + ] = "none", + trim_right_ratio: float = 1.0, + norm_kwargs: Dict[str, Any] = {}, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + super().__init__() + self.convtr = NormConvTranspose1d( + in_channels, + out_channels, + kernel_size, + stride, + causal=causal, + norm=norm, + norm_kwargs=norm_kwargs, + device=device, + dtype=dtype, + ) + self.causal = causal + self.trim_right_ratio = trim_right_ratio + assert ( + self.causal or self.trim_right_ratio == 1.0 + ), "`trim_right_ratio` != 1.0 only makes sense for causal convolutions" + assert self.trim_right_ratio >= 0.0 and self.trim_right_ratio <= 1.0 + + def forward(self, x: torch.Tensor) -> torch.Tensor: + kernel_size: int = self.convtr.convtr.kernel_size[0] # type: ignore[index,assignment] + stride: int = self.convtr.convtr.stride[0] # type: ignore[index,assignment] + padding_total = kernel_size - stride + + y: torch.Tensor = self.convtr(x) + + # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be + # removed at the very end, when keeping only the right length for the output, + # as removing it here would require also passing the length at the matching layer + # in the encoder. + if self.causal: + # Trim the padding on the right according to the specified ratio + # if trim_right_ratio = 1.0, trim everything from right + padding_right = math.ceil(padding_total * self.trim_right_ratio) + padding_left = padding_total - padding_right + y = unpad1d(y, (padding_left, padding_right)) + else: + # Asymmetric padding required for odd strides + padding_right = padding_total // 2 + padding_left = padding_total - padding_right + y = unpad1d(y, (padding_left, padding_right)) + return y + + +class StreamableLSTM(Module): + """LSTM without worrying about the hidden state, nor the layout of the data. + Expects input as convolutional layout. + """ + + def __init__( + self, + dimension: int, + num_layers: int = 2, + skip: bool = True, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + super().__init__() + self.skip = skip + self.lstm = LSTM(dimension, dimension, num_layers, device=device, dtype=dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.permute(2, 0, 1) + y, _ = self.lstm(x) + if self.skip: + y = y + x + y = y.permute(1, 2, 0) + return y # type: ignore[no-any-return] + + +class StreamableResnetBlock(Module): + """custom Residual block model with streamable convnet. + + Args: + dim (int): Dimension of the input/output. + kernel_sizes (list): List of kernel sizes for the convolutions. + dilations (list): List of dilations for the convolutions. + activation_params (dict): Parameters to provide to the (ELU) activation function. + norm (str): Normalization method. + norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution. + causal (bool): Whether to use fully causal convolution. + pad_mode (str): Padding mode for the convolutions. + compress (int): Reduced dimensionality in residual branches (from Demucs v3). + true_skip (bool): Whether to use true skip connection or a simple + (streamable) convolution as the skip connection. 
+ """ + + def __init__( + self, + dim: int, + kernel_sizes: List[int] = [3, 1], + dilations: List[int] = [1, 1], + activation_params: Dict[str, Any] = {"alpha": 1.0}, + norm: Literal[ + "none", "weight_norm", "spectral_norm", "time_group_norm" + ] = "none", + norm_params: Dict[str, Any] = {}, + causal: bool = False, + pad_mode: str = "reflect", + compress: int = 2, + true_skip: bool = True, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + super().__init__() + assert len(kernel_sizes) == len( + dilations + ), "Number of kernel sizes should match number of dilations" + hidden = dim // compress + block = [] + for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)): + in_chs = dim if i == 0 else hidden + out_chs = dim if i == len(kernel_sizes) - 1 else hidden + block += [ + ELU(**activation_params), + StreamableConv1d( + in_chs, + out_chs, + kernel_size=kernel_size, + dilation=dilation, + norm=norm, + norm_kwargs=norm_params, + causal=causal, + pad_mode=pad_mode, + device=device, + dtype=dtype, + ), + ] + self.block = Sequential(*block) + self.shortcut: Module + if true_skip: + self.shortcut = Identity() + else: + self.shortcut = StreamableConv1d( + dim, + dim, + kernel_size=1, + norm=norm, + norm_kwargs=norm_params, + causal=causal, + pad_mode=pad_mode, + device=device, + dtype=dtype, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.shortcut(x) + self.block(x) # type: ignore[no-any-return] diff --git a/seamless_communication/src/seamless_communication/models/generator/vocoder.py b/seamless_communication/src/seamless_communication/models/generator/vocoder.py new file mode 100644 index 0000000..4f4ef4b --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/generator/vocoder.py @@ -0,0 +1,586 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+ +from typing import Any, Dict, List, Literal, Optional, Tuple + +import torch +import torch.nn.functional as F +from fairseq2.nn.embedding import Embedding, StandardEmbedding +from fairseq2.nn.padding import PaddingMask +from fairseq2.nn.position_encoder import PositionEncoder +from fairseq2.nn.projection import Projection +from fairseq2.typing import DataType, Device +from torch.nn import ( + ELU, + BatchNorm1d, + Conv1d, + ConvTranspose1d, + Dropout, + Module, + ModuleList, + Parameter, + Sequential, + Tanh, + init, +) +from torch.nn.utils.weight_norm import remove_weight_norm, weight_norm + +from seamless_communication.models.generator.ecapa_tdnn import ECAPA_TDNN +from seamless_communication.models.unity.fft_decoder import FeedForwardTransformer +from seamless_communication.models.unity.length_regulator import VarianceAdaptor +from seamless_communication.models.vocoder.hifigan import ( + LRELU_SLOPE, + ResBlock, + init_weights, +) + +from .streamable import ( + StreamableConv1d, + StreamableConvTranspose1d, + StreamableLSTM, + StreamableResnetBlock, +) + +ELU_PARAMS: Dict[str, Any] = {"alpha": 1.0} + + +class PretsselEncoderFrontend(Module): + """ + Represent Encoder frontend, including the prosody encoder and language embedding + """ + + prosody_encoder: ECAPA_TDNN + embed_tokens: Embedding + embed_positions: PositionEncoder + pos_emb_alpha: Parameter + embed_lang: Embedding + dropout: Dropout + + def __init__( + self, + prosody_encoder: ECAPA_TDNN, + embed_tokens: Embedding, + embed_positions: PositionEncoder, + lang_to_index: Dict[str, int], + lang_embed_dim: Optional[int], + dropout_p: float, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + super().__init__() + + self.prosody_encoder = prosody_encoder + + self.embed_tokens = embed_tokens + + self.embed_positions = embed_positions + self.pos_emb_alpha = Parameter(torch.ones(1, device=device, dtype=dtype)) + + self.lang_to_index = lang_to_index + + if lang_embed_dim is not None: + self.embed_lang = StandardEmbedding( + len(lang_to_index), lang_embed_dim, device=device, dtype=dtype + ) + else: + self.register_module("embed_lang", None) + + self.dropout = Dropout(dropout_p) + + self.device = device + self.dtype = dtype + + def forward( + self, + seqs: torch.Tensor, + padding_mask: Optional[PaddingMask], + prosody_input_seqs: torch.Tensor, + prosody_padding_mask: Optional[PaddingMask], + tgt_lang: str, + ) -> Tuple[torch.Tensor, torch.Tensor]: + prosody_embs = self.prosody_encoder( + prosody_input_seqs, + prosody_padding_mask, + ).unsqueeze(1) + + if self.embed_lang is not None: + lang_index = self.lang_to_index[tgt_lang] + lang_index_tensor = ( + torch.Tensor([lang_index]).to(seqs).repeat(seqs.size(0), 1) + ) + lang_embeds = self.embed_lang(lang_index_tensor) + prosody_embs = torch.cat([prosody_embs, lang_embeds], dim=-1) + + seqs = self.embed_tokens(seqs) + seqs += self.pos_emb_alpha * (self.embed_positions(seqs, padding_mask) - seqs) + seqs = self.dropout(seqs) + + return seqs, prosody_embs + + +class PretsselDecoderFrontend(Module): + """Represent Decoder frontend, including VarianceAdaptor & Positional embedding""" + + variance_adaptor: VarianceAdaptor + embed_positions: PositionEncoder + pos_emb_alpha: Parameter + + def __init__( + self, + variance_adaptor: VarianceAdaptor, + embed_positions: PositionEncoder, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + super().__init__() + + self.variance_adaptor = variance_adaptor + self.embed_positions = embed_positions + 
self.pos_emb_alpha = Parameter(torch.ones(1, device=device, dtype=dtype)) + + self.device = device + self.dtype = dtype + + def forward( + self, + seqs: torch.Tensor, + padding_mask: PaddingMask, + durations: Optional[torch.Tensor] = None, + duration_factor: float = 1.0, + min_duration: int = 0, + film_cond_emb: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, PaddingMask]: + seqs, padding_mask, _ = self.variance_adaptor( + seqs, padding_mask, durations, duration_factor, min_duration, film_cond_emb + ) + + seqs += self.pos_emb_alpha * (self.embed_positions(seqs, padding_mask) - seqs) + + return seqs, padding_mask + + +class PretsselVocoder(Module): + """The expressivity-preserving vocoder""" + + encoder_frontend: PretsselEncoderFrontend + encoder: FeedForwardTransformer + decoder_frontend: PretsselDecoderFrontend + decoder: FeedForwardTransformer + final_proj: Projection + + def __init__( # type: ignore[no-untyped-def] + self, + encoder_frontend: PretsselEncoderFrontend, + encoder: FeedForwardTransformer, + decoder_frontend: PretsselDecoderFrontend, + decoder: FeedForwardTransformer, + final_proj: Projection, + pn_n_channels: int, + pn_kernel_size: int, + pn_layers: int, + pn_dropout: float, + upsample_rates: List[int], + upsample_kernel_sizes: List[int], + upsample_initial_channel: int, + resblock_kernel_sizes: List[int], + resblock_dilation_sizes: List[List[int]], + mel_dim: int = 80, + add_ups_out_pad: bool = True, + channels: int = 1, + dimension: int = 128, + n_filters: int = 32, + ratios: List[int] = [8, 5, 4, 2], + norm: Literal[ + "none", "weight_norm", "spectral_norm", "time_group_norm" + ] = "none", + norm_params: Dict[str, Any] = {}, + kernel_size: int = 7, + last_kernel_size: int = 7, + residual_kernel_size: int = 3, + causal: bool = False, + pad_mode: str = "constant", + true_skip: bool = True, + compress: int = 2, + lstm: int = 0, + disable_norm_outer_blocks: int = 0, + trim_right_ratio: float = 1.0, + gcmvn_mean: Optional[List[float]] = None, + gcmvn_std: Optional[List[float]] = None, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + super().__init__() + self.encoder_frontend = encoder_frontend + self.encoder = encoder + self.decoder_frontend = decoder_frontend + self.decoder = decoder + self.final_proj = final_proj + mult = 1 + stream_layers: List[Module] = [ + StreamableConv1d( + channels, + mult * n_filters, + kernel_size, + norm="none" if disable_norm_outer_blocks >= 1 else norm, + norm_kwargs=norm_params, + causal=causal, + pad_mode=pad_mode, + activation=Tanh(), + device=device, + dtype=dtype, + ) + ] + # Downsample to from audio scale + for i, ratio in enumerate(list(reversed(ratios))): + block_norm = "none" if disable_norm_outer_blocks >= i + 2 else norm + stream_layers.append( + StreamableResnetBlock( + mult * n_filters, + kernel_sizes=[residual_kernel_size, 1], + dilations=[1, 1], + norm=block_norm, + norm_params=norm_params, + causal=causal, + pad_mode=pad_mode, + compress=compress, + true_skip=true_skip, + device=device, + dtype=dtype, + ) + ) + stream_layers.append(ELU(**ELU_PARAMS)) + stream_layers.append( + StreamableConv1d( + mult * n_filters, + mult * n_filters * 2, + kernel_size=ratio * 2, + stride=ratio, + norm=block_norm, + norm_kwargs=norm_params, + causal=causal, + pad_mode=pad_mode, + device=device, + dtype=dtype, + ) + ) + mult *= 2 + + stream_layers.append(StreamableLSTM(mult * n_filters, num_layers=lstm)) + stream_layers.append(ELU(**ELU_PARAMS)) + n_blocks = len(ratios) + 2 + stream_layers.append( + 
StreamableConv1d( + mult * n_filters, + dimension, + last_kernel_size, + norm="none" if disable_norm_outer_blocks == n_blocks else norm, + norm_kwargs=norm_params, + causal=causal, + pad_mode=pad_mode, + device=device, + dtype=dtype, + ) + ) + stream_layers.append( + StreamableConv1d( + dimension, + mult * n_filters, + kernel_size, + norm="none" if disable_norm_outer_blocks == n_blocks else norm, + norm_kwargs=norm_params, + causal=causal, + pad_mode=pad_mode, + device=device, + dtype=dtype, + ) + ) + stream_layers.append( + StreamableLSTM( + mult * n_filters, num_layers=lstm, device=device, dtype=dtype + ) + ) + + # resample back to raw audio scale + for i, ratio in enumerate(ratios): + block_norm = ( + "none" if disable_norm_outer_blocks >= n_blocks - (i + 1) else norm + ) + stream_layers.append(ELU(**ELU_PARAMS)) + stream_layers.append( + StreamableConvTranspose1d( + mult * n_filters, + mult * n_filters // 2, + kernel_size=ratio * 2, + stride=ratio, + norm=block_norm, + norm_kwargs=norm_params, + causal=causal, + trim_right_ratio=trim_right_ratio, + device=device, + dtype=dtype, + ) + ) + stream_layers.append( + StreamableResnetBlock( + mult * n_filters // 2, + kernel_sizes=[residual_kernel_size, 1], + dilations=[1, 1], + norm=block_norm, + norm_params=norm_params, + activation_params=ELU_PARAMS, + causal=causal, + pad_mode=pad_mode, + compress=compress, + true_skip=true_skip, + device=device, + dtype=dtype, + ) + ) + mult //= 2 + + stream_layers.append(ELU(**ELU_PARAMS)) + stream_layers.append( + StreamableConv1d( + n_filters, + channels, + last_kernel_size, + norm="none" if disable_norm_outer_blocks >= 1 else norm, + norm_kwargs=norm_params, + causal=causal, + pad_mode=pad_mode, + device=device, + dtype=dtype, + ) + ) + self.n_streams = len(stream_layers) + chunk_size = self.n_streams // 4 + stream_idx = 0 + + self.pn_layers = pn_layers + self.layers = ModuleList() + assert pn_kernel_size % 2 == 1 + for i in range(pn_layers): + cur_layers = ( + [ + Conv1d( + mel_dim if i == 0 else pn_n_channels, + pn_n_channels if i < pn_layers - 1 else mel_dim, + kernel_size=pn_kernel_size, + padding="same", + device=device, + dtype=dtype, + ), + BatchNorm1d( + pn_n_channels if i < pn_layers - 1 else mel_dim, + device=device, + dtype=dtype, + ), + ] + + ([Tanh()] if i < pn_layers - 1 else []) + + [Dropout(pn_dropout)] + ) + self.layers.append(Sequential(*cur_layers)) + self.reset_parameters() + self.layers.extend(stream_layers[:chunk_size]) + stream_idx += chunk_size + self.layers.append( + weight_norm( + Conv1d( + mel_dim if mel_dim is not None else 80, + upsample_initial_channel, + 7, + 1, + padding="same", + device=device, + dtype=dtype, + ) + ) + ) + self.layers.extend(stream_layers[stream_idx : stream_idx + chunk_size]) # noqa + stream_idx += chunk_size + + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + ups = ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + out_pad = u % 2 if add_ups_out_pad else 0 + ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2 + out_pad, + output_padding=out_pad, + device=device, + dtype=dtype, + ) + ) + ) + ups.apply(init_weights) + self.layers.extend(ups) + self.layers.extend(stream_layers[stream_idx : stream_idx + chunk_size]) # noqa + stream_idx += chunk_size + + for i in range(self.num_upsamples): + ch = upsample_initial_channel // (2 ** (i + 1)) + for k, d in 
zip(resblock_kernel_sizes, resblock_dilation_sizes): + self.layers.append( + ResBlock( + ch, + k, + d, + ).to(device, dtype=dtype) + ) + self.layers.extend(stream_layers[stream_idx:]) + + conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + conv_post.apply(init_weights) + self.layers.append(conv_post) + for u, k in zip(upsample_rates, upsample_kernel_sizes): + assert k == 2 * u, (k, u) + + mean = torch.zeros((mel_dim,), dtype=torch.float) + scale = torch.zeros((mel_dim,), dtype=torch.float) + self.register_buffer("mean", mean) + self.register_buffer("scale", scale) + + self.gcmvn_mean = torch.tensor(gcmvn_mean, device=device, dtype=dtype) + self.gcmvn_std = torch.tensor(gcmvn_std, device=device, dtype=dtype) + + def reset_parameters(self) -> None: + for i in range(self.pn_layers): + init.xavier_uniform_( + self.layers[i][0].weight, + init.calculate_gain("tanh" if i < self.pn_layers - 1 else "linear"), + ) + + def gcmvn_denormalize(self, x: torch.Tensor) -> torch.Tensor: + if self.gcmvn_mean is None or self.gcmvn_std is None: + raise ValueError("gcmvn_mean is not set") + + assert ( + x.ndim == 3 + and x.shape[2] == self.gcmvn_mean.shape[0] + and x.shape[2] == self.gcmvn_std.shape[0] + ) + gcmvn_mean = self.gcmvn_mean.to(x) + gcmvn_std = self.gcmvn_std.to(x) + x = x * gcmvn_std.view(1, 1, -1).expand_as(x) # type: ignore[attr-defined] + return x + gcmvn_mean.view(1, 1, -1).expand_as(x) # type: ignore[attr-defined,no-any-return] + + def forward( + self, + seqs: torch.Tensor, + tgt_lang: str, + prosody_input_seqs: torch.Tensor, + padding_mask: Optional[PaddingMask] = None, + prosody_padding_mask: Optional[PaddingMask] = None, + durations: Optional[torch.Tensor] = None, + duration_factor: float = 1.0, + min_duration: int = 0, + normalize_before: bool = True, + ) -> List[torch.Tensor]: + # Here we are adding batch dimension for the pretssel + if seqs.ndim < 2: + seqs = seqs.unsqueeze(0) + if prosody_input_seqs.ndim < 3: + prosody_input_seqs = prosody_input_seqs.unsqueeze(0) + seqs, cond_embs = self.encoder_frontend( + seqs, + padding_mask, + prosody_input_seqs, + prosody_padding_mask, + tgt_lang, + ) + seqs, padding_mask = self.encoder(seqs, padding_mask, cond_embs) + seqs, padding_mask = self.decoder_frontend( + seqs, padding_mask, durations, duration_factor, min_duration, cond_embs + ) + seqs, padding_mask = self.decoder(seqs, padding_mask, cond_embs) + seqs = self.final_proj(seqs) + + pn = seqs.transpose(1, 2) # B x T x C -> B x C x T + for i in range(self.pn_layers): + pn = self.layers[i](pn) + pn = pn.transpose(1, 2) + + x = seqs + pn + x = self.gcmvn_denormalize(x) + + wavs = [] + for idx, _x in enumerate(x): + _x = _x[: durations[idx].sum()] # type: ignore[index] + if normalize_before: + _x = (_x - self.mean) / self.scale + + _x = _x.transpose(1, 0).unsqueeze(0) + chunk_size = self.n_streams // 4 + _x = self.layers[self.pn_layers + chunk_size](_x) + for i in range(self.num_upsamples): + _x = F.leaky_relu(_x, LRELU_SLOPE) + _x = self.layers[i + self.pn_layers + 1 + 2 * chunk_size](_x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.layers[ + i * self.num_kernels + + j + + self.pn_layers + + 3 * chunk_size + + self.num_upsamples + + 1 + ](_x) + else: + xs += self.layers[ + i * self.num_kernels + + j + + self.pn_layers + + 3 * chunk_size + + self.num_upsamples + + 1 + ](_x) + _x = xs / self.num_kernels # type: ignore + _x = F.leaky_relu(_x) + _x = self.layers[ + self.pn_layers + + self.n_streams + + self.num_upsamples * (1 + self.num_kernels) + + 1 + ](_x) + 
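            # `_x` now holds the output of the HiFi-GAN branch's final post-convolution.
            # The index arithmetic below walks the four streamable-codec chunks, which
            # are stored interleaved with the HiFi-GAN layers in `self.layers`
            # (postnet | chunk 1 | mel conv | chunk 2 | upsamplers | chunk 3 |
            # res-blocks | chunk 4 | post conv). The `+ 2`, `+ num_upsamples + 1` and
            # `+ num_upsamples * num_kernels + 1` jumps skip those interleaved layers,
            # and the final waveform blends the streamable branch with a tanh of the
            # HiFi-GAN branch output.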
skip_output = _x + h = skip_output + + for i1 in range(self.pn_layers, self.pn_layers + chunk_size): + h = self.layers[i1](h) + i1 += 2 + for i2 in range(i1, i1 + chunk_size): + h = self.layers[i2](h) + i2 = i2 + self.num_upsamples + 1 + + for i3 in range(i2, i2 + chunk_size): + h = self.layers[i3](h) + i3 = i3 + (self.num_upsamples * self.num_kernels) + 1 + for i4 in range(i3, i3 + chunk_size): + h = self.layers[i4](h) + h = h[:, :, : _x.size(-1)] + + wavs.append(0.8 * h + torch.tanh(skip_output).squeeze(0)) + return wavs + + def remove_weight_norm(self) -> None: + i = self.pn_layers + 1 + for j in range(self.num_upsamples): + remove_weight_norm(self.layers[i + j]) + for k in range(self.num_upsamples * self.num_kernels): + self.layers[i + j + k + 1].remove_weight_norm() + remove_weight_norm(self.layers[self.pn_layers]) + remove_weight_norm( + self.layers[ + self.pn_layers + 1 + self.num_upsamples * (1 + self.num_kernels) + ] + ) diff --git a/seamless_communication/src/seamless_communication/models/monotonic_decoder/__init__.py b/seamless_communication/src/seamless_communication/models/monotonic_decoder/__init__.py new file mode 100644 index 0000000..951dda0 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/monotonic_decoder/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. +from seamless_communication.models.monotonic_decoder.builder import ( + MonotonicDecoderBuilder as MonotonicDecoderBuilder, +) +from seamless_communication.models.monotonic_decoder.builder import ( + MonotonicDecoderConfig as MonotonicDecoderConfig, +) +from seamless_communication.models.monotonic_decoder.model import ( + MonotonicDecoderModel as MonotonicDecoderModel, +) +from seamless_communication.models.monotonic_decoder.builder import ( + create_monotonic_decoder_model as create_monotonic_decoder_model, +) +from seamless_communication.models.monotonic_decoder.builder import ( + monotonic_decoder_archs as monotonic_decoder_archs, +) +from seamless_communication.models.monotonic_decoder.loader import ( + load_monotonic_decoder_config as load_monotonic_decoder_config, +) +from seamless_communication.models.monotonic_decoder.loader import ( + load_monotonic_decoder_model as load_monotonic_decoder_model, +) diff --git a/seamless_communication/src/seamless_communication/models/monotonic_decoder/builder.py b/seamless_communication/src/seamless_communication/models/monotonic_decoder/builder.py new file mode 100644 index 0000000..a664c21 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/monotonic_decoder/builder.py @@ -0,0 +1,263 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
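For orientation, a minimal sketch of how the configuration and builder defined in this module can be driven end to end; the tiny hyper-parameters, the default device, and the batch shapes are illustrative assumptions rather than the shipped "dense_1b" setup.

    import torch
    from fairseq2.data import VocabularyInfo

    from seamless_communication.models.monotonic_decoder import (
        MonotonicDecoderConfig,
        create_monotonic_decoder_model,
    )

    # Deliberately tiny configuration (the registered "dense_1b" arch uses
    # model_dim=1024 and 24 layers); every value here is illustrative.
    config = MonotonicDecoderConfig(
        model_dim=256,
        max_seq_len=1024,
        vocab_info=VocabularyInfo(size=1000, unk_idx=1, bos_idx=2, eos_idx=3, pad_idx=0),
        num_decoder_layers=2,
        num_decoder_attn_heads=4,
        ffn_inner_dim=1024,
        dropout_p=0.1,
        energy_bias_value=-0.5,
        monotonic_temperature=0.2,
        num_monotonic_energy_layers=4,
        pre_decision_ratio=2,
    )

    model = create_monotonic_decoder_model(config)
    model.eval()

    # Decode a dummy batch of 2 target prefixes of length 8 against a 50-step
    # encoder output; no padding masks are needed for equal-length sequences.
    target_seqs = torch.randint(4, config.vocab_info.size, (2, 8))
    encoder_output = torch.randn(2, 50, config.model_dim)

    with torch.inference_mode():
        decoder_output, _, p_choose = model.decode(
            target_seqs, None, encoder_output, None
        )
        logits = model.project(decoder_output)  # (2, 8, vocab size)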
+ +from dataclasses import dataclass +from typing import Optional + +from fairseq2.data import VocabularyInfo +from fairseq2.models.transformer import ( + TransformerEmbeddingFrontend, + TransformerFrontend, +) +from fairseq2.models.utils.arch_registry import ArchitectureRegistry +from fairseq2.nn.embedding import Embedding, StandardEmbedding, init_scaled_embedding +from fairseq2.nn.position_encoder import SinusoidalPositionEncoder +from fairseq2.nn.projection import TiedProjection +from fairseq2.nn.transformer import ( + FeedForwardNetwork, + MultiheadAttention, + StandardFeedForwardNetwork, + StandardMultiheadAttention, + TransformerNormOrder, + create_default_sdpa, +) +from fairseq2.typing import DataType, Device + +from seamless_communication.models.monotonic_decoder.model import MonotonicDecoderModel +from seamless_communication.models.monotonic_decoder.monotonic_decoder import ( + MonotonicTransformerDecoder, +) +from seamless_communication.models.monotonic_decoder.monotonic_decoder_layer import ( + MonotonicTransformerDecoderLayer, +) +from seamless_communication.models.monotonic_decoder.p_choose import PChooseLayer + + +@dataclass +class MonotonicDecoderConfig: + """Holds the configuration of an Monotonic Decoder model.""" + + model_dim: int + """The dimensionality of the model.""" + + max_seq_len: int + """The expected maximum sequence length.""" + + vocab_info: VocabularyInfo + """The vocabulary information.""" + + num_decoder_layers: int + """The number of Transformer decoder layers.""" + + num_decoder_attn_heads: int + """The number of attention heads in Transformer decoder layers.""" + + ffn_inner_dim: int + """The inner dimensionality of Transformer feed-forward networks.""" + + dropout_p: float + """The dropout probability in Transformer layers.""" + + energy_bias_value: float + """The value of the energy bias parameter to be added to the + monotonic energy in the PChooseLayer.""" + + monotonic_temperature: float + """The parameter with which to divide the monotonic energy + to compute p_choose.""" + + num_monotonic_energy_layers: int + """The number of layers in the EnergyProjection module.""" + + pre_decision_ratio: int + """The kernel size and stride of the average pooling + in the PChooseLayer.""" + + +monotonic_decoder_archs = ArchitectureRegistry[MonotonicDecoderConfig]( + "monotonic_decoder" +) + +monotonic_decoder_arch = monotonic_decoder_archs.decorator + + +@monotonic_decoder_arch("dense_1b") +def _dense_1b() -> MonotonicDecoderConfig: + return MonotonicDecoderConfig( + model_dim=1024, + max_seq_len=4096, + vocab_info=VocabularyInfo( + size=256102, unk_idx=1, bos_idx=2, eos_idx=3, pad_idx=0 + ), + num_decoder_layers=24, + num_decoder_attn_heads=16, + ffn_inner_dim=1024 * 8, + dropout_p=0.1, + energy_bias_value=-0.5, + monotonic_temperature=0.2, + num_monotonic_energy_layers=4, + pre_decision_ratio=2, + ) + + +class MonotonicDecoderBuilder: + """Builds modules of a Monotonic Decoder. + + To tweak the architecture, you can derive from this class and override the + corresponding methods. + """ + + config: MonotonicDecoderConfig + device: Optional[Device] + dtype: Optional[DataType] + + def __init__( + self, + config: MonotonicDecoderConfig, + *, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. 
+ """ + self.config = config + + self.device, self.dtype = device, dtype + + def build_model(self) -> MonotonicDecoderModel: + text_embed = self.build_embedding() + + text_decoder_frontend = self.build_frontend(text_embed) + + text_decoder = self.build_decoder() + + final_proj = TiedProjection(text_embed.weight, bias=None) + + return MonotonicDecoderModel( + text_decoder_frontend, + text_decoder, + final_proj, + ) + + def build_embedding(self) -> StandardEmbedding: + """Build an embedding table.""" + return StandardEmbedding( + num_embeddings=self.config.vocab_info.size, + embedding_dim=self.config.model_dim, + pad_idx=self.config.vocab_info.pad_idx, + init_fn=init_scaled_embedding, + device=self.device, + dtype=self.dtype, + ) + + def build_frontend(self, embed: Embedding) -> TransformerFrontend: + """Build a Transformer decoder front-end.""" + pos_encoder = SinusoidalPositionEncoder( + self.config.model_dim, + self.config.max_seq_len, + _legacy_pad_idx=1, + device=self.device, + ) + + return TransformerEmbeddingFrontend( + embed, + pos_encoder, + dropout_p=self.config.dropout_p, + device=self.device, + dtype=self.dtype, + ) + + def build_decoder(self) -> MonotonicTransformerDecoder: + """Build a Transformer decoder.""" + num_layers = self.config.num_decoder_layers + + layers = [self.build_decoder_layer() for _ in range(num_layers)] + + return MonotonicTransformerDecoder( + layers, + device=self.device, + dtype=self.dtype, + ) + + def build_decoder_layer(self) -> MonotonicTransformerDecoderLayer: + """Build a Transformer decoder layer.""" + self_attn = self.build_attention(self.config.num_decoder_attn_heads) + + encoder_decoder_attn = self.build_attention(self.config.num_decoder_attn_heads) + + p_choose_layer = self.build_p_choose_layer(self.config.num_decoder_attn_heads) + + ffn = self.build_ffn() + + return MonotonicTransformerDecoderLayer( + self_attn, + encoder_decoder_attn, + p_choose_layer, + ffn, + dropout_p=self.config.dropout_p, + device=self.device, + dtype=self.dtype, + ) + + def build_attention(self, num_heads: int) -> MultiheadAttention: + """Build a Transformer multi-head attention layer.""" + sdpa = create_default_sdpa(attn_dropout_p=self.config.dropout_p) + + return StandardMultiheadAttention( + self.config.model_dim, + num_heads, + sdpa=sdpa, + device=self.device, + dtype=self.dtype, + ) + + def build_p_choose_layer(self, num_heads: int) -> PChooseLayer: + """Build a PChoose layer.""" + return PChooseLayer( + self.config.model_dim, + num_heads, + self.config.energy_bias_value, + self.config.monotonic_temperature, + self.config.num_monotonic_energy_layers, + self.config.pre_decision_ratio, + device=self.device, + dtype=self.dtype, + ) + + def build_ffn(self) -> FeedForwardNetwork: + """Build a Transformer feed-forward network.""" + return StandardFeedForwardNetwork( + self.config.model_dim, + self.config.ffn_inner_dim, + bias=True, + norm_order=TransformerNormOrder.PRE, + device=self.device, + dtype=self.dtype, + ) + + +def create_monotonic_decoder_model( + config: MonotonicDecoderConfig, + *, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, +) -> MonotonicDecoderModel: + """Create an Monotonic Decoder model. + + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. 
+ """ + return MonotonicDecoderBuilder(config, device=device, dtype=dtype).build_model() diff --git a/seamless_communication/src/seamless_communication/models/monotonic_decoder/loader.py b/seamless_communication/src/seamless_communication/models/monotonic_decoder/loader.py new file mode 100644 index 0000000..b2c0fc7 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/monotonic_decoder/loader.py @@ -0,0 +1,92 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Any, Mapping + +import torch +from fairseq2.assets import asset_store, download_manager +from fairseq2.models.utils import ConfigLoader, ModelLoader +from fairseq2.models.utils.checkpoint import convert_fairseq_checkpoint + +from seamless_communication.models.monotonic_decoder.builder import ( + MonotonicDecoderConfig, + create_monotonic_decoder_model, + monotonic_decoder_archs, +) +from seamless_communication.models.monotonic_decoder.model import MonotonicDecoderModel + + +def convert_monotonic_checkpoint( + checkpoint: Mapping[str, Any], config: MonotonicDecoderConfig +) -> Mapping[str, Any]: + state_dict = checkpoint["model"] + + # Check if we have a fairseq2 checkpoint. + if "text_decoder.layers.0.self_attn.k_proj.weight" in state_dict: + return checkpoint + + key_map = { + # fmt: off + r"^decoder\.embed_tokens\.": r"text_decoder_frontend.embed.", + r"^decoder\.layers\.([0-9]+)\.self_attn\.out_proj\.": r"text_decoder.layers.\1.self_attn.output_proj.", + r"^decoder\.layers\.([0-9]+)\.self_attn\.": r"text_decoder.layers.\1.self_attn.", + r"^decoder\.layers\.([0-9]+)\.self_attn_layer_norm\.": r"text_decoder.layers.\1.self_attn_layer_norm.", + r"^decoder\.layers\.([0-9]+)\.encoder_attn\.out_proj\.": r"text_decoder.layers.\1.encoder_decoder_attn.output_proj.", + r"^decoder\.layers\.([0-9]+)\.encoder_attn\.energy_bias": r"text_decoder.layers.\1.p_choose_layer.energy_bias", + r"^decoder\.layers\.([0-9]+)\.encoder_attn\.source_energy_layer\.": r"text_decoder.layers.\1.p_choose_layer.k_energy_proj.", + r"^decoder\.layers\.([0-9]+)\.encoder_attn\.target_energy_layer\.": r"text_decoder.layers.\1.p_choose_layer.q_energy_proj.", + r"^decoder\.layers\.([0-9]+)\.encoder_attn\.": r"text_decoder.layers.\1.encoder_decoder_attn.", + r"^decoder\.layers\.([0-9]+)\.encoder_attn_layer_norm\.": r"text_decoder.layers.\1.encoder_decoder_attn_layer_norm.", + r"^decoder\.layers\.([0-9]+)\.fc1\.": r"text_decoder.layers.\1.ffn.inner_proj.", + r"^decoder\.layers\.([0-9]+)\.fc2\.": r"text_decoder.layers.\1.ffn.output_proj.", + r"^decoder\.layers\.([0-9]+)\.final_layer_norm\.": r"text_decoder.layers.\1.ffn_layer_norm.", + r"^decoder\.layer_norm\.": r"text_decoder.layer_norm.", + r"^decoder\.output_projection\.": r"final_proj.", + # fmt: on + } + + # Convert to fairseq2. + checkpoint = convert_fairseq_checkpoint(checkpoint, key_map) + + state_dict = checkpoint["model"] + + embeds = state_dict["final_proj.weight"] + + # fairseq had a bug that accidentally introduced a dummy token in the + # embedding table of NLLB-100. We just discard it. + if embeds.size(0) == 256103: # means NLLB-100 + embeds = embeds[:-1] + + state_dict["final_proj.weight"] = embeds + + # fairseq checkpoints have duplicate embedding weights. Ensure that we + # use a single embedding table in fairseq2. 
+ state_dict["text_decoder_frontend.embed.weight"] = embeds + + # The embedding positions of the control symbols in fairseq's dict do + # not match the SentencePiece model of the tokenizer. + with torch.inference_mode(): + # (BOS, PAD, EOS, UNK) -> (PAD, UNK, BOS, EOS) + embeds[[0, 1, 2, 3]] = embeds[[1, 3, 0, 2]] + + return checkpoint + + +load_monotonic_decoder_config = ConfigLoader[MonotonicDecoderConfig]( + asset_store, monotonic_decoder_archs +) + + +load_monotonic_decoder_model = ModelLoader[ + MonotonicDecoderModel, MonotonicDecoderConfig +]( + asset_store, + download_manager, + load_monotonic_decoder_config, + create_monotonic_decoder_model, + convert_monotonic_checkpoint, + restrict_checkpoints=False, +) diff --git a/seamless_communication/src/seamless_communication/models/monotonic_decoder/model.py b/seamless_communication/src/seamless_communication/models/monotonic_decoder/model.py new file mode 100644 index 0000000..3913c82 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/monotonic_decoder/model.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Optional, Tuple, final + +from fairseq2.models.transformer.frontend import TransformerFrontend +from fairseq2.nn.incremental_state import IncrementalStateBag +from fairseq2.nn.padding import PaddingMask +from fairseq2.nn.projection import Projection +from overrides import final as finaloverride +from torch import Tensor +from torch.nn import Module + +from seamless_communication.models.monotonic_decoder.monotonic_decoder import ( + MonotonicTransformerDecoder, +) + + +@final +class MonotonicDecoderModel(Module): + text_decoder_frontend: TransformerFrontend + text_decoder: MonotonicTransformerDecoder + final_proj: Projection + + def __init__( + self, + text_decoder_frontend: TransformerFrontend, + text_decoder: MonotonicTransformerDecoder, + final_proj: Projection, + ) -> None: + super().__init__() + + self.text_decoder_frontend = text_decoder_frontend + self.text_decoder = text_decoder + self.final_proj = final_proj + + @finaloverride + def decode( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + encoder_output: Tensor, + encoder_padding_mask: Optional[PaddingMask], + *, + state_bag: Optional[IncrementalStateBag] = None, + ) -> Tuple[Tensor, Optional[PaddingMask], Tensor]: + seqs, padding_mask = self.text_decoder_frontend( + seqs, padding_mask, state_bag=state_bag + ) + + return self.text_decoder( # type: ignore[no-any-return] + seqs, + padding_mask, + encoder_output, + encoder_padding_mask, + state_bag=state_bag, + ) + + @finaloverride + def project(self, decoder_output: Tensor) -> Tensor: + logits = self.final_proj(decoder_output) + + return logits # type: ignore[no-any-return] diff --git a/seamless_communication/src/seamless_communication/models/monotonic_decoder/monotonic_decoder.py b/seamless_communication/src/seamless_communication/models/monotonic_decoder/monotonic_decoder.py new file mode 100644 index 0000000..413a8f5 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/monotonic_decoder/monotonic_decoder.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+ +from typing import Iterable, List, Optional, Tuple, final + +import torch +from fairseq2.nn.incremental_state import IncrementalStateBag +from fairseq2.nn.module_list import ModuleList +from fairseq2.nn.normalization import LayerNorm +from fairseq2.nn.padding import PaddingMask +from fairseq2.nn.transformer import ( + AttentionMaskFactory, + CausalAttentionMaskFactory, + create_standard_layer_norm, +) +from fairseq2.typing import DataType, Device, finaloverride +from torch import Tensor +from torch.nn import Module + +from seamless_communication.models.monotonic_decoder.monotonic_decoder_layer import ( + MonotonicTransformerDecoderLayer, +) + + +@final +class MonotonicTransformerDecoder(Module): + """Represents a Monotonic Transformer decoder.""" + + model_dim: int + self_attn_mask_factory: AttentionMaskFactory + layers: ModuleList + layer_norm: LayerNorm + + def __init__( + self, + layers: Iterable[MonotonicTransformerDecoderLayer], + *, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param layers: + The decoder layers. + """ + super().__init__() + + layer_list = ModuleList(layers) + + if not layer_list: + raise ValueError("`layers` must be non-empty.") + + self.model_dim = layer_list[0].model_dim + + self.self_attn_mask_factory = CausalAttentionMaskFactory() + + self.layers = layer_list + + self.layer_norm = create_standard_layer_norm( + self.model_dim, device=device, dtype=dtype + ) + + @finaloverride + def forward( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + encoder_output: Optional[Tensor] = None, + encoder_padding_mask: Optional[PaddingMask] = None, + *, + state_bag: Optional[IncrementalStateBag] = None, + ) -> Tuple[Tensor, Optional[PaddingMask], Tensor]: + self_attn_mask = self.self_attn_mask_factory( + seqs, keys=seqs, training=self.training, state_bag=state_bag + ) + + p_choose_list: List[Tensor] = [] + + for layer in self.layers.drop_iter(): + seqs, padding_mask, p_choose = layer( + seqs, + padding_mask, + self_attn_mask, + encoder_output, + encoder_padding_mask, + state_bag=state_bag, + ) + p_choose_list.append(p_choose) + + seqs = self.layer_norm(seqs) + + p_choose = torch.cat(p_choose_list, dim=0) + + p_choose = p_choose.flatten(0, 1) + + return seqs, padding_mask, p_choose diff --git a/seamless_communication/src/seamless_communication/models/monotonic_decoder/monotonic_decoder_layer.py b/seamless_communication/src/seamless_communication/models/monotonic_decoder/monotonic_decoder_layer.py new file mode 100644 index 0000000..079b300 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/monotonic_decoder/monotonic_decoder_layer.py @@ -0,0 +1,201 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
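A shape-only sketch of how the decoder's forward pass above aggregates the per-layer `p_choose` tensors; the dimension sizes are arbitrary and only the bookkeeping matters.

    import torch

    num_layers, batch, heads, tgt_len, pooled_src_len = 2, 3, 4, 5, 6

    # One (batch, heads, tgt_len, pooled_src_len) tensor per decoder layer.
    p_choose_list = [
        torch.rand(batch, heads, tgt_len, pooled_src_len) for _ in range(num_layers)
    ]

    p_choose = torch.cat(p_choose_list, dim=0)  # (num_layers * batch, heads, ...)
    p_choose = p_choose.flatten(0, 1)           # (num_layers * batch * heads, ...)

    assert p_choose.shape == (num_layers * batch * heads, tgt_len, pooled_src_len)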
+ +from typing import Optional, Tuple, final + +from fairseq2.nn.incremental_state import IncrementalStateBag +from fairseq2.nn.normalization import LayerNorm +from fairseq2.nn.padding import PaddingMask +from fairseq2.nn.transformer import ( + AttentionMask, + FeedForwardNetwork, + MultiheadAttention, + create_standard_layer_norm, +) +from fairseq2.typing import DataType, Device, finaloverride +from torch import Tensor +from torch.nn import Dropout, Module + +from seamless_communication.models.monotonic_decoder.p_choose import PChooseLayer + + +@final +class MonotonicTransformerDecoderLayer(Module): + """Represents a Monotonic Transformer decoder layer.""" + + self_attn: MultiheadAttention + self_attn_dropout: Optional[Dropout] + self_attn_layer_norm: LayerNorm + encoder_decoder_attn: MultiheadAttention + encoder_decoder_attn_dropout: Optional[Dropout] + encoder_decoder_attn_layer_norm: LayerNorm + p_choose_layer: PChooseLayer + ffn: FeedForwardNetwork + ffn_dropout: Optional[Dropout] + ffn_layer_norm: LayerNorm + + def __init__( + self, + self_attn: MultiheadAttention, + encoder_decoder_attn: MultiheadAttention, + p_choose_layer: PChooseLayer, + ffn: FeedForwardNetwork, + *, + dropout_p: float = 0.1, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param self_attn: + The self attention layer. + :param encoder_decoder_attn: + The encoder-decoder attention layer. + :param ffn: + The feed-forward network. + :param dropout_p: + The dropout probability on outputs of the attention layers and the + feed-forward network. + """ + super().__init__() + + self.model_dim = self_attn.model_dim + + self_attn_layer_norm = create_standard_layer_norm( + self.model_dim, device=device, dtype=dtype + ) + + self.self_attn_layer_norm = self_attn_layer_norm + + self.self_attn = self_attn + + if dropout_p > 0.0: + self.self_attn_dropout = Dropout(dropout_p) + else: + self.register_module("self_attn_dropout", None) + + encoder_decoder_attn_layer_norm = create_standard_layer_norm( + self.model_dim, device=device, dtype=dtype + ) + + self.encoder_decoder_attn_layer_norm = encoder_decoder_attn_layer_norm + + self.encoder_decoder_attn = encoder_decoder_attn + + if dropout_p > 0.0: + self.encoder_decoder_attn_dropout = Dropout(dropout_p) + else: + self.register_module("encoder_decoder_attn_dropout", None) + + self.p_choose_layer = p_choose_layer + + ffn_layer_norm = create_standard_layer_norm( + self.model_dim, device=device, dtype=dtype + ) + + self.ffn_layer_norm = ffn_layer_norm + + self.ffn = ffn + + if dropout_p > 0.0: + self.ffn_dropout = Dropout(dropout_p) + else: + self.register_module("ffn_dropout", None) + + @finaloverride + def forward( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + self_attn_mask: Optional[AttentionMask] = None, + encoder_output: Optional[Tensor] = None, + encoder_padding_mask: Optional[PaddingMask] = None, + *, + state_bag: Optional[IncrementalStateBag] = None, + ) -> Tuple[Tensor, Optional[PaddingMask], Tensor]: + seqs = self._forward_self_attn(seqs, padding_mask, self_attn_mask, state_bag) + + seqs, p_choose = self._forward_encoder_decoder_attn( + seqs, padding_mask, encoder_output, encoder_padding_mask + ) + + seqs = self._forward_ffn(seqs) + + return seqs, padding_mask, p_choose + + def _forward_self_attn( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + self_attn_mask: Optional[AttentionMask], + state_bag: Optional[IncrementalStateBag], + ) -> Tensor: + residual = seqs + + seqs = 
self.self_attn_layer_norm(seqs) + + seqs = self.self_attn( + seqs, + padding_mask, + keys=seqs, + key_padding_mask=padding_mask, + values=seqs, + attn_mask=self_attn_mask, + state_bag=state_bag, + ) + + if self.self_attn_dropout is not None: + seqs = self.self_attn_dropout(seqs) + + seqs = seqs + residual + + return seqs + + def _forward_encoder_decoder_attn( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + encoder_output: Optional[Tensor], + encoder_padding_mask: Optional[PaddingMask], + ) -> Tuple[Tensor, Tensor]: + if encoder_output is None: + raise ValueError( + "`encoder_output` must not be `None` for encoder-decoder attention." + ) + + residual = seqs + + seqs = self.encoder_decoder_attn_layer_norm(seqs) + + p_choose = self.p_choose_layer(seqs, encoder_output) + + seqs = self.encoder_decoder_attn( + seqs, + padding_mask, + encoder_output, + encoder_padding_mask, + encoder_output, + ) + + if self.encoder_decoder_attn_dropout is not None: + seqs = self.encoder_decoder_attn_dropout(seqs) + + seqs = seqs + residual + + return seqs, p_choose + + def _forward_ffn(self, seqs: Tensor) -> Tensor: + residual = seqs + + seqs = self.ffn_layer_norm(seqs) + + seqs = self.ffn(seqs) + + if self.ffn_dropout is not None: + seqs = self.ffn_dropout(seqs) + + seqs = seqs + residual + + return seqs diff --git a/seamless_communication/src/seamless_communication/models/monotonic_decoder/p_choose.py b/seamless_communication/src/seamless_communication/models/monotonic_decoder/p_choose.py new file mode 100644 index 0000000..1b5bf67 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/monotonic_decoder/p_choose.py @@ -0,0 +1,148 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Optional, final + +import torch +from fairseq2.nn.projection import Linear +from fairseq2.typing import DataType, Device, finaloverride +from torch import Tensor +from torch.nn import AvgPool1d, Module, ModuleList, ReLU +from torch.nn.parameter import Parameter + + +class EnergyProjection(Module): + def __init__( + self, + model_dim: int, + num_layers: int, + bias: bool = True, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + super().__init__() + + if num_layers < 1: + raise ValueError( + f"Invalid `num_layers`: {num_layers} for EnergyProjectionLayer." + ) + + self.layers = ModuleList() + + for _ in range(num_layers): + self.layers.append( + Linear(model_dim, model_dim, bias, device=device, dtype=dtype) + ) + self.layers.append(ReLU()) + + def forward(self, seqs: Tensor) -> Tensor: + for layer in self.layers: + seqs = layer(seqs) + return seqs + + +@final +class PChooseLayer(Module): + """Represents a PChoose layer.""" + + model_dim: int + num_heads: int + energy_bias: Parameter + monotonic_temperature: float + q_energy_proj: EnergyProjection + k_energy_proj: EnergyProjection + keys_pooling: AvgPool1d + + def __init__( + self, + model_dim: int, + num_heads: int, + energy_bias_value: float, + monotonic_temperature: float, + num_monotonic_energy_layers: int, + pre_decision_ratio: int, + *, + bias: bool = True, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param model_dim: + The dimensionality of the model. + :param num_heads: + The number of attention heads. 
+ :param bias: + If ``True``, query, key energy projection layers learn an + additive bias. + """ + super().__init__() + + self.model_dim = model_dim + self.num_heads = num_heads + + if energy_bias_value != 0.0: + self.energy_bias = Parameter( + torch.full([1], energy_bias_value, device=device, dtype=dtype) + ) + else: + self.register_module("energy_bias", None) + + self.monotonic_temperature = monotonic_temperature + + if num_monotonic_energy_layers <= 0: + raise ValueError("Number of monotonic energy layers must be > 0.") + + self.q_energy_proj = EnergyProjection( + self.model_dim, + num_monotonic_energy_layers, + bias, + device=device, + dtype=dtype, + ) + self.k_energy_proj = EnergyProjection( + self.model_dim, + num_monotonic_energy_layers, + bias, + device=device, + dtype=dtype, + ) + + self.keys_pooling = AvgPool1d( + kernel_size=pre_decision_ratio, + stride=pre_decision_ratio, + ceil_mode=True, + ) + + @finaloverride + def forward(self, seqs: Tensor, keys: Tensor) -> Tensor: + q = self.q_energy_proj(seqs) + + # (N, S, M) -> (N, H, S, K) + q = q.unflatten(-1, (self.num_heads, -1)).transpose(1, 2) + + # (N, S_kv, M) -> (N, M, S_kv) -> (N, M, S_p) + pooled_keys = self.keys_pooling(keys.transpose(1, 2)) + + # (N, M, S_p) -> (N, S_p, M) + pooled_keys = pooled_keys.transpose(1, 2) + + k = self.k_energy_proj(pooled_keys) + + # (N, S_p, M) -> (N, H, S_p, K) + k = k.unflatten(-1, (self.num_heads, -1)).transpose(1, 2) + + # (N, H, S, K) @ (N, H, K, S_p) = (N, H, S, S_p) + monotonic_energy = torch.matmul(q, k.transpose(-1, -2)) + + monotonic_energy = monotonic_energy * (q.size(-1) ** -0.5) + + if self.energy_bias is not None: + monotonic_energy += self.energy_bias + + # p_choose: (N, H, S, S_p) + p_choose = torch.sigmoid(monotonic_energy / self.monotonic_temperature) + + return p_choose diff --git a/seamless_communication/src/seamless_communication/models/pretssel/__init__.py b/seamless_communication/src/seamless_communication/models/pretssel/__init__.py new file mode 100644 index 0000000..789ee77 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/pretssel/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from seamless_communication.models.pretssel.ecapa_tdnn import ECAPA_TDNN as ECAPA_TDNN +from seamless_communication.models.pretssel.ecapa_tdnn_builder import ( + EcapaTDNNBuilder as EcapaTDNNBuilder, +) +from seamless_communication.models.pretssel.ecapa_tdnn_builder import ( + EcapaTDNNConfig as EcapaTDNNConfig, +) +from seamless_communication.models.pretssel.ecapa_tdnn_builder import ( + ecapa_tdnn_archs as ecapa_tdnn_archs, +) diff --git a/seamless_communication/src/seamless_communication/models/pretssel/ecapa_tdnn.py b/seamless_communication/src/seamless_communication/models/pretssel/ecapa_tdnn.py new file mode 100644 index 0000000..a1fb2f2 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/pretssel/ecapa_tdnn.py @@ -0,0 +1,477 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
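A minimal sketch of building the prosody encoder defined below through its registered "base" architecture; it assumes the architecture registry's `get_config` accessor, and the batch shapes are illustrative only.

    import torch

    from seamless_communication.models.pretssel import EcapaTDNNBuilder, ecapa_tdnn_archs

    # Build the "base" ECAPA-TDNN prosody/speaker encoder on the default device.
    config = ecapa_tdnn_archs.get_config("base")
    prosody_encoder = EcapaTDNNBuilder(config, dtype=torch.float32).build_model()
    prosody_encoder.eval()

    # Two utterances of 200 fbank frames with `input_dim` (80) channels.
    fbanks = torch.randn(2, 200, config.input_dim)

    with torch.inference_mode():
        embedding = prosody_encoder(fbanks)  # (2, embed_dim), L2-normalized

    assert embedding.shape == (2, config.embed_dim)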
+ +from typing import List, Optional, Tuple + +import torch +import torch.nn.functional as F +from fairseq2.nn.padding import PaddingMask, to_padding_mask +from torch import Tensor +from torch.nn import Conv1d, LayerNorm, Module, ModuleList, ReLU, Sigmoid, Tanh, init + + +class ECAPA_TDNN(Module): + """ + Represents the ECAPA-TDNN model described in paper: + :cite:t`https://doi.org/10.48550/arxiv.2005.07143`. + + Arguments + --------- + :param channels: + Output channels for TDNN/SERes2Net layer. + :param kernel_sizes: + List of kernel sizes for each layer. + :param dilations: + List of dilations for kernels in each layer. + :param groups: + List of groups for kernels in each layer. + """ + + def __init__( + self, + channels: List[int], + kernel_sizes: List[int], + dilations: List[int], + attention_channels: int, + res2net_scale: int, + se_channels: int, + global_context: bool, + groups: List[int], + embed_dim: int, + input_dim: int, + ): + super().__init__() + assert len(channels) == len(kernel_sizes) == len(dilations) + self.channels = channels + self.embed_dim = embed_dim + self.blocks = ModuleList() + + self.blocks.append( + TDNNBlock( + input_dim, + channels[0], + kernel_sizes[0], + dilations[0], + groups[0], + ) + ) + + # SE-Res2Net layers + for i in range(1, len(channels) - 1): + self.blocks.append( + SERes2NetBlock( + channels[i - 1], + channels[i], + res2net_scale=res2net_scale, + se_channels=se_channels, + kernel_size=kernel_sizes[i], + dilation=dilations[i], + groups=groups[i], + ) + ) + + # Multi-layer feature aggregation + self.mfa = TDNNBlock( + channels[-1], + channels[-1], + kernel_sizes[-1], + dilations[-1], + groups=groups[-1], + ) + + # Attentive Statistical Pooling + self.asp = AttentiveStatisticsPooling( + channels[-1], + attention_channels=attention_channels, + global_context=global_context, + ) + self.asp_norm = LayerNorm(channels[-1] * 2, eps=1e-12) + + # Final linear transformation + self.fc = Conv1d( + in_channels=channels[-1] * 2, + out_channels=embed_dim, + kernel_size=1, + ) + + self.reset_parameters() + + def reset_parameters(self) -> None: + """Reset the parameters and buffers of the module.""" + + def encoder_init(m: Module) -> None: + if isinstance(m, Conv1d): + init.xavier_uniform_(m.weight, init.calculate_gain("relu")) + + self.apply(encoder_init) + + def forward( + self, + x: Tensor, + padding_mask: Optional[PaddingMask] = None, + ) -> Tensor: + """Returns the embedding vector. + + Arguments + --------- + x : torch.Tensor + Tensor of shape (batch, time, channel). + """ + # Minimize transpose for efficiency + x = x.transpose(1, 2) + + xl = [] + for layer in self.blocks: + x = layer(x, padding_mask=padding_mask) + xl.append(x) + + # Multi-layer feature aggregation + x = torch.cat(xl[1:], dim=1) + x = self.mfa(x) + + # Attentive Statistical Pooling + x = self.asp(x, padding_mask=padding_mask) + x = self.asp_norm(x.transpose(1, 2)).transpose(1, 2) + + # Final linear transformation + x = self.fc(x) + + x = x.transpose(1, 2).squeeze(1) # B x C + return F.normalize(x, dim=-1) + + +class TDNNBlock(Module): + """An implementation of TDNN. + + Arguments + ---------- + :param in_channels : int + Number of input channels. + :param out_channels : int + The number of output channels. + :param kernel_size : int + The kernel size of the TDNN blocks. + :param dilation : int + The dilation of the TDNN block. + :param groups: int + The groups size of the TDNN blocks. 
+ + Example + ------- + >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2) + >>> layer = TDNNBlock(64, 64, kernel_size=3, dilation=1) + >>> out_tensor = layer(inp_tensor).transpose(1, 2) + >>> out_tensor.shape + torch.Size([8, 120, 64]) + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + dilation: int, + groups: int = 1, + ): + super().__init__() + self.conv = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + dilation=dilation, + padding=dilation * (kernel_size - 1) // 2, + groups=groups, + ) + self.activation = ReLU() + self.norm = LayerNorm(out_channels, eps=1e-12) + + def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor: + """Processes the input tensor x and returns an output tensor.""" + x = self.activation(self.conv(x)) + + return self.norm(x.transpose(1, 2)).transpose(1, 2) # type: ignore[no-any-return] + + +class Res2NetBlock(Module): + """An implementation of Res2NetBlock w/ dilation. + + Arguments + --------- + :param in_channels : int + The number of channels expected in the input. + :param out_channels : int + The number of output channels. + :param scale : int + The scale of the Res2Net block. + :param kernel_size: int + The kernel size of the Res2Net block. + :param dilation : int + The dilation of the Res2Net block. + + Example + ------- + >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2) + >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3) + >>> out_tensor = layer(inp_tensor).transpose(1, 2) + >>> out_tensor.shape + torch.Size([8, 120, 64]) + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + scale: int = 8, + kernel_size: int = 3, + dilation: int = 1, + ): + super().__init__() + assert in_channels % scale == 0 + assert out_channels % scale == 0 + + in_channel = in_channels // scale + hidden_channel = out_channels // scale + self.blocks = ModuleList( + [ + TDNNBlock( + in_channel, + hidden_channel, + kernel_size=kernel_size, + dilation=dilation, + ) + for i in range(scale - 1) + ] + ) + self.scale = scale + + def forward(self, x: Tensor) -> Tensor: + """Processes the input tensor x and returns an output tensor.""" + y = [] + for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)): + if i == 0: + y_i = x_i + elif i == 1: + y_i = self.blocks[i - 1](x_i) + else: + y_i = self.blocks[i - 1](x_i + y_i) + y.append(y_i) + + y_tensor = torch.cat(y, dim=1) + return y_tensor + + +class SEBlock(Module): + """An implementation of squeeze-and-excitation block. + + Arguments + --------- + in_channels : int + The number of input channels. + se_channels : int + The number of output channels after squeeze. + out_channels : int + The number of output channels. 
+ """ + + def __init__( + self, + in_channels: int, + se_channels: int, + out_channels: int, + ): + super().__init__() + + self.conv1 = Conv1d( + in_channels=in_channels, out_channels=se_channels, kernel_size=1 + ) + self.relu = ReLU(inplace=True) + self.conv2 = Conv1d( + in_channels=se_channels, out_channels=out_channels, kernel_size=1 + ) + self.sigmoid = Sigmoid() + + def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor: + """Processes the input tensor x and returns an output tensor.""" + if padding_mask is not None: + mask = padding_mask.materialize().unsqueeze(1) + s = (x * mask).sum(dim=2, keepdim=True) / padding_mask.seq_lens[ + :, None, None + ] + else: + s = x.mean(dim=2, keepdim=True) + + s = self.relu(self.conv1(s)) + s = self.sigmoid(self.conv2(s)) + + return s * x + + +class AttentiveStatisticsPooling(Module): + """This class implements an attentive statistic pooling layer for each channel. + It returns the concatenated mean and std of the input tensor. + + Arguments + --------- + channels: int + The number of input channels. + attention_channels: int + The number of attention channels. + """ + + def __init__( + self, channels: int, attention_channels: int = 128, global_context: bool = True + ): + super().__init__() + + self.eps = 1e-12 + self.global_context = global_context + if global_context: + self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1) + else: + self.tdnn = TDNNBlock(channels, attention_channels, 1, 1) + + self.tanh = Tanh() + self.conv = Conv1d( + in_channels=attention_channels, out_channels=channels, kernel_size=1 + ) + + def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor: + """Calculates mean and std for a batch (input tensor). + + Arguments + --------- + x : torch.Tensor + Tensor of shape [N, C, L]. + """ + L = x.shape[-1] + + def _compute_statistics( + x: Tensor, m: Tensor, dim: int = 2, eps: float = self.eps + ) -> Tuple[Tensor, Tensor]: + mean = (m * x).sum(dim) + std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)) + return mean, std + + # if lengths is None: + # lengths = [x.shape[0]] + + # Make binary mask of shape [N, 1, L] + # mask = to_padding_mask(lengths, max(lengths)) + if padding_mask is not None: + mask = padding_mask.materialize() + else: + mask = to_padding_mask(torch.IntTensor([L]), L).repeat(x.shape[0], 1).to(x) + mask = mask.unsqueeze(1) + + # Expand the temporal context of the pooling layer by allowing the + # self-attention to look at global properties of the utterance. + if self.global_context: + # torch.std is unstable for backward computation + # https://github.com/pytorch/pytorch/issues/4320 + total = mask.sum(dim=2, keepdim=True).to(x) + mean, std = _compute_statistics(x, mask / total) + mean = mean.unsqueeze(2).repeat(1, 1, L) + std = std.unsqueeze(2).repeat(1, 1, L) + attn = torch.cat([x, mean, std], dim=1) + else: + attn = x + + # Apply layers + attn = self.conv(self.tanh(self.tdnn(attn))) + + # Filter out zero-paddings + attn = attn.masked_fill(mask == 0, float("-inf")) + + attn = F.softmax(attn, dim=2) + mean, std = _compute_statistics(x, attn) + # Append mean and std of the batch + pooled_stats = torch.cat((mean, std), dim=1) + pooled_stats = pooled_stats.unsqueeze(2) + + return pooled_stats + + +class SERes2NetBlock(Module): + """An implementation of building block in ECAPA-TDNN, i.e., + TDNN-Res2Net-TDNN-SEBlock. + + Arguments + ---------- + out_channels: int + The number of output channels. 
+ res2net_scale: int + The scale of the Res2Net block. + kernel_size: int + The kernel size of the TDNN blocks. + dilation: int + The dilation of the Res2Net block. + groups: int + Number of blocked connections from input channels to output channels. + + Example + ------- + >>> x = torch.rand(8, 120, 64).transpose(1, 2) + >>> conv = SERes2NetBlock(64, 64, res2net_scale=4) + >>> out = conv(x).transpose(1, 2) + >>> out.shape + torch.Size([8, 120, 64]) + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + res2net_scale: int = 8, + se_channels: int = 128, + kernel_size: int = 1, + dilation: int = 1, + groups: int = 1, + ): + super().__init__() + self.out_channels = out_channels + self.tdnn1 = TDNNBlock( + in_channels, + out_channels, + kernel_size=1, + dilation=1, + groups=groups, + ) + self.res2net_block = Res2NetBlock( + out_channels, + out_channels, + res2net_scale, + kernel_size, + dilation, + ) + self.tdnn2 = TDNNBlock( + out_channels, + out_channels, + kernel_size=1, + dilation=1, + groups=groups, + ) + self.se_block = SEBlock(out_channels, se_channels, out_channels) + + self.shortcut = None + if in_channels != out_channels: + self.shortcut = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + ) + + def forward(self, x: Tensor, padding_mask: Optional[PaddingMask] = None) -> Tensor: + """Processes the input tensor x and returns an output tensor.""" + residual = x + if self.shortcut: + residual = self.shortcut(x) + + x = self.tdnn1(x) + x = self.res2net_block(x) + x = self.tdnn2(x) + x = self.se_block(x, padding_mask=padding_mask) + + return x + residual diff --git a/seamless_communication/src/seamless_communication/models/pretssel/ecapa_tdnn_builder.py b/seamless_communication/src/seamless_communication/models/pretssel/ecapa_tdnn_builder.py new file mode 100644 index 0000000..3a4b677 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/pretssel/ecapa_tdnn_builder.py @@ -0,0 +1,112 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass +from typing import List, Optional + +from fairseq2.models.utils.arch_registry import ArchitectureRegistry +from fairseq2.typing import DataType, Device + +from seamless_communication.models.pretssel.ecapa_tdnn import ECAPA_TDNN + + +@dataclass +class EcapaTDNNConfig: + channels: List[int] + kernel_sizes: List[int] + dilations: List[int] + attention_channels: int + res2net_scale: int + se_channels: int + global_context: bool + groups: List[int] + embed_dim: int + input_dim: int + + +ecapa_tdnn_archs = ArchitectureRegistry[EcapaTDNNConfig]("ecapa_tdnn") + +ecapa_tdnn_arch = ecapa_tdnn_archs.decorator + + +@ecapa_tdnn_arch("base") +def _base_ecapa_tdnn() -> EcapaTDNNConfig: + return EcapaTDNNConfig( + channels=[512, 512, 512, 512, 1536], + kernel_sizes=[5, 3, 3, 3, 1], + dilations=[1, 2, 3, 4, 1], + attention_channels=128, + res2net_scale=8, + se_channels=128, + global_context=True, + groups=[1, 1, 1, 1, 1], + embed_dim=512, + input_dim=80, + ) + + +class EcapaTDNNBuilder: + """ + Builder module for ECAPA_TDNN model + """ + + config: EcapaTDNNConfig + device: Optional[Device] + dtype: Optional[DataType] + + def __init__( + self, + config: EcapaTDNNConfig, + *, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param config: + The configuration to use. 
+ :param devicev: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + self.config = config + + self.device, self.dtype = device, dtype + + def build_model(self) -> ECAPA_TDNN: + """Build a model.""" + model = ECAPA_TDNN( + self.config.channels, + self.config.kernel_sizes, + self.config.dilations, + self.config.attention_channels, + self.config.res2net_scale, + self.config.se_channels, + self.config.global_context, + self.config.groups, + self.config.embed_dim, + self.config.input_dim, + ) + model.to(device=self.device, dtype=self.dtype) + return model + + +def create_ecapa_tdnn_model( + config: EcapaTDNNConfig, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, +) -> ECAPA_TDNN: + """Create a ECAPA_TDNN model. + + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + + return EcapaTDNNBuilder(config, device=device, dtype=dtype).build_model() diff --git a/seamless_communication/src/seamless_communication/models/tokenizer.py b/seamless_communication/src/seamless_communication/models/tokenizer.py new file mode 100644 index 0000000..7cf4162 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/tokenizer.py @@ -0,0 +1,122 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Optional, Sequence, Set, final + +from fairseq2.data.text import ( + SentencePieceDecoder, + SentencePieceEncoder, + SentencePieceModel, + TextTokenDecoder, + TextTokenEncoder, + TextTokenizer, + vocab_info_from_sentencepiece, +) +from fairseq2.data.typing import PathLike +from fairseq2.typing import Device, finaloverride + + +@final +class SPMTokenizer(TextTokenizer): + """Represents standard SPM-based tokenizer used in MT tasks""" + + model: SentencePieceModel + langs: Set[str] + prepend_target_langtok_to_target: bool + + def __init__( + self, + pathname: PathLike, + langs: Sequence[str], + prepend_target_langtok_to_target: bool = True, + ) -> None: + """ + :param pathname: + The pathname of the SentencePiece model file. + :param langs: + The list of supported languages. + :param default_lang: + The fall-back language if no language is specified. + """ + self.langs = set(langs) + self.prepend_target_langtok_to_target = prepend_target_langtok_to_target + + # Each language is represented by a `__lang__` control symbol. + control_symbols = [self._lang_tok_to_internal(lang) for lang in sorted(langs)] + self.model = SentencePieceModel(pathname, control_symbols) + vocab_info = vocab_info_from_sentencepiece(self.model) + super().__init__(vocab_info) + + @classmethod + def _lang_tok_to_internal(cls, lang: str) -> str: + return f"__{lang}__" + + @finaloverride + def create_encoder( + self, + *, + task: Optional[str] = None, + lang: Optional[str] = None, + mode: Optional[str] = None, + device: Optional[Device] = None, + pin_memory: bool = False, + ) -> TextTokenEncoder: + """Create a token encoder. + + :param task: + Must be 'translation'. If ``None``, defaults to 'translation'. + :param lang: + A language from :attr:`langs`. If ``None``, defaults to + :attr:`default_lang`. + :param mode: + Must be 'source' or 'target'. + :param device: + The device on which to construct tensors. 
+ :param pin_memory: + If ``True``, uses pinned memory while constructing tensors. + """ + if task is not None and task != "translation": + raise ValueError(f"`task` must be 'translation', but is '{task}' instead.") + + assert lang is not None + + if lang not in self.langs: + raise ValueError( + f"`lang` must be a supported language, but is '{lang}' instead." + ) + + if mode is None or mode == "source": + prefix_tokens = [] + suffix_tokens = [""] + elif mode == "target": + prefix_tokens = ( + [""] + [self._lang_tok_to_internal(lang)] + if self.prepend_target_langtok_to_target + else [] + ) + suffix_tokens = [""] + else: + raise ValueError( + f"`mode` must be 'source' or 'target', but is '{mode}' instead." + ) + + return SentencePieceEncoder( + self.model, + prefix_tokens=prefix_tokens, + suffix_tokens=suffix_tokens, + device=device, + pin_memory=pin_memory, + ) + + @finaloverride + def create_raw_encoder( + self, *, device: Optional[Device] = None, pin_memory: bool = False + ) -> TextTokenEncoder: + return SentencePieceEncoder(self.model, device=device, pin_memory=pin_memory) + + @finaloverride + def create_decoder(self) -> TextTokenDecoder: + return SentencePieceDecoder(self.model) diff --git a/seamless_communication/src/seamless_communication/models/unit_extractor/__init__.py b/seamless_communication/src/seamless_communication/models/unit_extractor/__init__.py new file mode 100644 index 0000000..2997286 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unit_extractor/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from seamless_communication.models.unit_extractor.kmeans import ( + KmeansModel as KmeansModel, +) +from seamless_communication.models.unit_extractor.unit_extractor import ( + UnitExtractor as UnitExtractor, +) +from seamless_communication.models.unit_extractor.wav2vec2_layer_output import ( + Wav2Vec2LayerOutputModel as Wav2Vec2LayerOutputModel, +) diff --git a/seamless_communication/src/seamless_communication/models/unit_extractor/kmeans.py b/seamless_communication/src/seamless_communication/models/unit_extractor/kmeans.py new file mode 100644 index 0000000..8e9d55e --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unit_extractor/kmeans.py @@ -0,0 +1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
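A short sketch of the unit-extraction flow implemented below; the wav2vec2 asset name, the k-means checkpoint URI, the audio path, and the output layer index are placeholders taken as assumptions, not verified asset names.

    import torch

    from seamless_communication.models.unit_extractor import UnitExtractor

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Both asset identifiers below are placeholders: substitute the wav2vec2
    # model card and k-means centroid file you actually use.
    unit_extractor = UnitExtractor(
        "xlsr2_1b_v2",                          # wav2vec2 encoder card (assumed name)
        "https://example.com/kmeans_10k.npy",   # k-means centroids (placeholder URI)
        device=device,
    )

    # Extract discrete units from a 16 kHz waveform file, reading one of the
    # upper encoder layers (index 34 here is an assumption).
    units = unit_extractor.predict("input_16khz.wav", out_layer_idx=34)

    # Optionally resynthesize audio from the units with the default vocoder.
    wav = UnitExtractor.resynthesize_audio(
        units, src_lang="eng", device=device, dtype=torch.float32
    )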
+ +import numpy as np +import torch +from fairseq2.assets import download_manager +from fairseq2.typing import DataType, Device +from torch import Tensor, nn + + +class KmeansModel(nn.Module): + def __init__(self, kmeans_uri: str, device: Device, dtype: DataType): + super().__init__() + km_path = download_manager.download_checkpoint(kmeans_uri, kmeans_uri) + km_model = np.load(km_path) + centroids_numpy = km_model.transpose() + centroids = torch.from_numpy(centroids_numpy) + self.centroids = centroids.to(device=device, dtype=dtype) + self.centroid_norm = (self.centroids**2).sum(0, keepdims=True) + + def forward(self, x: Tensor) -> Tensor: + dist: Tensor = ( + x.pow(2).sum(1, keepdim=True) + - 2 * torch.matmul(x, self.centroids) + + self.centroid_norm + ) + return dist.argmin(dim=-1) diff --git a/seamless_communication/src/seamless_communication/models/unit_extractor/unit_extractor.py b/seamless_communication/src/seamless_communication/models/unit_extractor/unit_extractor.py new file mode 100644 index 0000000..81f7b37 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unit_extractor/unit_extractor.py @@ -0,0 +1,112 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import logging +from pathlib import Path +from typing import Union + +import torch +import torch.nn.functional as F +from fairseq2.assets.card import AssetCard +from fairseq2.data import Collater +from fairseq2.data.audio import AudioDecoder +from fairseq2.memory import MemoryBlock +from fairseq2.models.sequence import SequenceBatch +from fairseq2.models.wav2vec2 import Wav2Vec2Model, load_wav2vec2_model +from fairseq2.nn.padding import get_seqs_and_padding_mask +from fairseq2.typing import DataType, Device +from torch import Tensor, nn + +from seamless_communication.models.unit_extractor.kmeans import KmeansModel +from seamless_communication.models.unit_extractor.wav2vec2_layer_output import ( + Wav2Vec2LayerOutputModel, +) +from seamless_communication.models.vocoder import Vocoder, load_vocoder_model + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +class UnitExtractor(nn.Module): + """Unit Extractor which converts raw audio into units.""" + + def __init__( + self, + model_name_or_card: Union[str, AssetCard], + kmeans_uri: str, + device: Device, + dtype: DataType = torch.float32, + ): + super().__init__() + + wav2vec2_model = load_wav2vec2_model( + model_name_or_card, device=device, dtype=dtype + ) + wav2vec2_model.eval() + assert isinstance(wav2vec2_model, Wav2Vec2Model) + self.model = Wav2Vec2LayerOutputModel(wav2vec2_model) + self.decode_audio = AudioDecoder(dtype=torch.float32, device=device) + self.collate = Collater(pad_value=1, pad_to_multiple=2) + self.kmeans_model = KmeansModel(kmeans_uri, device, dtype) + self.device = device + self.dtype = dtype + + @torch.inference_mode() + def predict( + self, + audio: Union[str, Tensor], + out_layer_idx: int, + sample_rate: int = 16000, + ) -> Tensor: + if isinstance(audio, str): + with Path(audio).open("rb") as fb: + block = MemoryBlock(fb.read()) + decoded_audio = self.decode_audio(block) + assert ( + sample_rate == decoded_audio["sample_rate"] + ), f"Input audio must have {sample_rate} sampling rate" + + else: + assert audio.dim() <= 2, "The audio tensor can't be more than 2 dimensions." 
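            # Raw tensor input: arrange the waveform as (seq_len, batch) before
            # collating -- promote 1-D audio to a trailing batch dimension and
            # transpose input that arrives as (batch, seq_len).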
+ if audio.dim() == 1: + audio = audio.unsqueeze(1) + elif audio.dim() == 2 and audio.size(0) < audio.size(1): + logger.warning( + "Transposing audio tensor from (bsz, seq_len) -> (seq_len, bsz)." + ) + audio = audio.transpose(0, 1) + + decoded_audio = { + "waveform": audio.to(dtype=self.dtype), + "sample_rate": sample_rate, + "format": -1, + } + src = self.collate(decoded_audio)["waveform"] + seqs, padding_mask = get_seqs_and_padding_mask(src) + seqs = seqs.view(1, -1) + seqs = F.layer_norm(seqs, seqs.shape) + batch = SequenceBatch(seqs=seqs, padding_mask=padding_mask) + features = self.model(batch, out_layer_idx).squeeze(0) + units = self.kmeans_model(features) + return units # type: ignore[no-any-return] + + @staticmethod + def resynthesize_audio( + units: Tensor, + src_lang: str, + device: Device, + dtype: DataType, + vocoder_name: str = "vocoder_v2", + ) -> Tensor: + vocoder = load_vocoder_model(vocoder_name, device=device, dtype=dtype) + vocoder.eval() + assert isinstance(vocoder, Vocoder) + wav = vocoder(units, src_lang, spkr=-1, dur_prediction=True) + return wav # type: ignore[no-any-return] diff --git a/seamless_communication/src/seamless_communication/models/unit_extractor/wav2vec2_layer_output.py b/seamless_communication/src/seamless_communication/models/unit_extractor/wav2vec2_layer_output.py new file mode 100644 index 0000000..5d3daff --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unit_extractor/wav2vec2_layer_output.py @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. +from typing import Optional + +import torch +import torch.nn as nn +from fairseq2.models.sequence import SequenceBatch +from fairseq2.models.wav2vec2 import ( + Wav2Vec2Config, + Wav2Vec2EncoderConfig, + Wav2Vec2Frontend, + Wav2Vec2Model, + wav2vec2_arch, +) +from fairseq2.nn.padding import PaddingMask +from fairseq2.nn.transformer import TransformerEncoder, TransformerNormOrder +from torch import Tensor + + +def _encoder_xlsr2_1b_v2() -> Wav2Vec2EncoderConfig: + layer_descs = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2 + + return Wav2Vec2EncoderConfig( + model_dim=1280, + max_seq_len=4096, + feature_dim=512, + use_fbank=False, + first_pass_dropout_p=0.0, + layer_norm_features=False, + feature_extractor_layer_descs=layer_descs, + feature_extractor_bias=True, + feature_extractor_layer_norm_convs=True, + feature_grad_scale=1.0, + num_fbank_channels=0, + fbank_stride=0, + sample_fbank_every_k=0, + pos_encoder_type="conv", + pos_encoder_depth=1, + pos_conv_kernel_size=128, + num_pos_conv_groups=16, + use_conformer=False, + num_encoder_layers=48, + num_encoder_attn_heads=16, + ffn_inner_dim=5120, + dropout_p=0.1, + attn_dropout_p=0.1, + layer_drop_p=0.0, + norm_order=TransformerNormOrder.PRE, + depthwise_conv_kernel_size=0, + ) + + +@wav2vec2_arch("xlsr2_1b_v2") +def _xlsr2_1b_v2() -> Wav2Vec2Config: + encoder_config = _encoder_xlsr2_1b_v2() + + return Wav2Vec2Config( + encoder_config, + final_dim=1024, + final_proj_bias=True, + temporal_mask_span_len=10, + max_temporal_mask_prob=0.65, + spatial_mask_span_len=10, + max_spatial_mask_prob=0.0, + quantized_dim=1024, + num_codebooks=2, + num_codebook_entries=320, + codebook_sampling_temperature=(2, 0.1, 0.999995), + num_distractors=100, + logit_temp=0.1, + diversity_loss_weight=0.1, + ) + + +class Wav2Vec2LayerOutputModel(nn.Module): + encoder_frontend: 
Wav2Vec2Frontend + encoder: TransformerEncoder + + def __init__(self, w2v2: Wav2Vec2Model): + super().__init__() + + self.encoder_frontend = w2v2.encoder_frontend + self.encoder = w2v2.encoder + + @torch.inference_mode() + def forward(self, batch: SequenceBatch, out_layer_idx: int) -> Tensor: + """ + :param batch: + The batch of sequences to process. + """ + seqs, padding_mask = self.encoder_frontend(batch.seqs, batch.padding_mask) + + w2v2_layer_output = None + + def hook( + layer_idx: int, + layer_output: Tensor, + layer_padding_mask: Optional[PaddingMask], + num_layers: int, + ) -> bool: + nonlocal w2v2_layer_output + + if layer_idx == out_layer_idx: + w2v2_layer_output = layer_output + + # We don't need to execute the remaining layers. + return False + + return True + + with self.encoder.register_layer_output_hook(hook): + _, _ = self.encoder(seqs, padding_mask) + + assert w2v2_layer_output is not None + + return w2v2_layer_output diff --git a/seamless_communication/src/seamless_communication/models/unity/__init__.py b/seamless_communication/src/seamless_communication/models/unity/__init__.py new file mode 100644 index 0000000..eb6f138 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/__init__.py @@ -0,0 +1,90 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from seamless_communication.models.unity.builder import UnitYBuilder as UnitYBuilder +from seamless_communication.models.unity.builder import UnitYConfig as UnitYConfig +from seamless_communication.models.unity.builder import ( + create_unity_model as create_unity_model, +) +from seamless_communication.models.unity.builder import unity_arch as unity_arch +from seamless_communication.models.unity.builder import unity_archs as unity_archs +from seamless_communication.models.unity.char_tokenizer import ( + CharTokenizer as CharTokenizer, +) +from seamless_communication.models.unity.char_tokenizer import ( + UnitYCharTokenizerLoader as UnitYCharTokenizerLoader, +) +from seamless_communication.models.unity.char_tokenizer import ( + load_unity_char_tokenizer as load_unity_char_tokenizer, +) +from seamless_communication.models.unity.fft_decoder import ( + FeedForwardTransformer as FeedForwardTransformer, +) +from seamless_communication.models.unity.fft_decoder_layer import ( + FeedForwardTransformerLayer as FeedForwardTransformerLayer, +) +from seamless_communication.models.unity.film import FiLM +from seamless_communication.models.unity.length_regulator import ( + HardUpsampling as HardUpsampling, +) +from seamless_communication.models.unity.length_regulator import ( + VarianceAdaptor as VarianceAdaptor, +) +from seamless_communication.models.unity.length_regulator import ( + VariancePredictor as VariancePredictor, +) +from seamless_communication.models.unity.loader import ( + load_gcmvn_stats as load_gcmvn_stats, +) +from seamless_communication.models.unity.loader import ( + load_unity_config as load_unity_config, +) +from seamless_communication.models.unity.loader import ( + load_unity_model as load_unity_model, +) +from seamless_communication.models.unity.loader import ( + load_unity_text_tokenizer as load_unity_text_tokenizer, +) +from seamless_communication.models.unity.loader import ( + load_unity_unit_tokenizer as load_unity_unit_tokenizer, +) +from seamless_communication.models.unity.model import UnitYModel as UnitYModel +from 
seamless_communication.models.unity.model import ( + UnitYNART2UModel as UnitYNART2UModel, +) +from seamless_communication.models.unity.model import UnitYOutput as UnitYOutput +from seamless_communication.models.unity.model import UnitYT2UModel as UnitYT2UModel +from seamless_communication.models.unity.model import UnitYX2TModel as UnitYX2TModel +from seamless_communication.models.unity.nar_decoder_frontend import ( + NARDecoderFrontend as NARDecoderFrontend, +) +from seamless_communication.models.unity.t2u_builder import ( + UnitYNART2UBuilder as UnitYNART2UBuilder, +) +from seamless_communication.models.unity.t2u_builder import ( + UnitYT2UBuilder as UnitYT2UBuilder, +) +from seamless_communication.models.unity.t2u_builder import ( + UnitYT2UConfig as UnitYT2UConfig, +) +from seamless_communication.models.unity.t2u_builder import ( + create_unity_t2u_model as create_unity_t2u_model, +) +from seamless_communication.models.unity.t2u_builder import ( + unity_t2u_arch as unity_t2u_arch, +) +from seamless_communication.models.unity.t2u_builder import ( + unity_t2u_archs as unity_t2u_archs, +) +from seamless_communication.models.unity.unit_tokenizer import ( + UnitTokenDecoder as UnitTokenDecoder, +) +from seamless_communication.models.unity.unit_tokenizer import ( + UnitTokenEncoder as UnitTokenEncoder, +) +from seamless_communication.models.unity.unit_tokenizer import ( + UnitTokenizer as UnitTokenizer, +) diff --git a/seamless_communication/src/seamless_communication/models/unity/adaptor_block.py b/seamless_communication/src/seamless_communication/models/unity/adaptor_block.py new file mode 100644 index 0000000..ba96a68 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/adaptor_block.py @@ -0,0 +1,438 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Iterable, Optional, Tuple, final + +import torch +from fairseq2.models.conformer import ConformerBlock +from fairseq2.nn.module_list import ModuleList +from fairseq2.nn.normalization import LayerNorm +from fairseq2.nn.padding import PaddingMask +from fairseq2.nn.projection import Linear +from fairseq2.nn.transformer import ( + AttentionMask, + FeedForwardNetwork, + LayerNormFactory, + MultiheadAttention, + TransformerEncoder, + TransformerEncoderLayer, + create_standard_layer_norm, +) +from fairseq2.typing import DataType, Device +from overrides import final as finaloverride +from torch import Tensor +from torch.nn import GLU, Conv1d, Dropout, ReLU + + +@final +class UnitYEncoderAdaptor(TransformerEncoder): + """Represents a Transformer encoder that wraps a speech encoder and adapts + it to be used with the UnitY architecture.""" + + inner: TransformerEncoder + inner_layer_norm: Optional[LayerNorm] + proj1: Linear + activation: ReLU + proj2: Linear + adaptor_layers: ModuleList + layer_norm: LayerNorm + + def __init__( + self, + inner: TransformerEncoder, + adaptor_layers: Iterable[TransformerEncoderLayer], + *, + inner_layer_norm: bool = False, + layer_norm_factory: Optional[LayerNormFactory] = None, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param inner: + The speech encoder to wrap. + :param adaptor_layers: + The adaptor layers to stack on top of ``inner``. + :param inner_layer_norm: + If ``True``, applies Layer Normalization to outputs of ``inner``. 
+ :param layer_norm_factory: + The factory to use to construct the Layer Normalization modules. + """ + model_dim = inner.model_dim + + super().__init__(model_dim) + + if layer_norm_factory is None: + layer_norm_factory = create_standard_layer_norm + + self.inner = inner + + if inner_layer_norm: + self.inner_layer_norm = layer_norm_factory( + model_dim, device=device, dtype=dtype + ) + else: + self.register_module("inner_layer_norm", None) + + self.proj1 = Linear( + model_dim, model_dim * 4, bias=True, device=device, dtype=dtype + ) + + self.activation = ReLU() + + self.proj2 = Linear( + model_dim * 4, model_dim, bias=True, device=device, dtype=dtype + ) + + layer_list = ModuleList(adaptor_layers) + if not layer_list: + raise ValueError("`adaptor_layers` must be non-empty.") + + self.adaptor_layers = layer_list + + self.layer_norm = layer_norm_factory(model_dim, device=device, dtype=dtype) + + @finaloverride + def forward( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + ) -> Tuple[Tensor, Optional[PaddingMask]]: + seqs, padding_mask = self.inner(seqs, padding_mask) + + if self.inner_layer_norm is not None: + seqs = self.inner_layer_norm(seqs) + + # Only difference compared to a vanilla Transformer encoder. + seqs = seqs + 0.5 * self._expand_contract(seqs) + + for layer in self.adaptor_layers: + seqs, padding_mask = layer(seqs, padding_mask) + + seqs = self.layer_norm(seqs) + + return seqs, padding_mask + + def _expand_contract(self, seqs: Tensor) -> Tensor: + seqs = self.proj1(seqs) + + seqs = self.activation(seqs) + + seqs = self.proj2(seqs) + + return seqs + + +@final +class UnitYTransformerAdaptorLayer(TransformerEncoderLayer): + """Represents a variant of M-Adaptor layer described in + :cite:t`https://doi.org/10.48550/arxiv.2207.00952`. + + The main difference from the paper is that pooling is applied to multi-head + attention input rather than projected Q, K, V. + """ + + kernel_size: int + stride: int + residual_layer_norm: LayerNorm + residual_conv: Conv1d + residual_activation: GLU + self_attn_layer_norm: LayerNorm + self_attn_conv: Conv1d + self_attn_activation: GLU + self_attn: MultiheadAttention + self_attn_dropout: Optional[Dropout] + ffn_layer_norm: LayerNorm + ffn: FeedForwardNetwork + ffn_dropout: Optional[Dropout] + + def __init__( + self, + self_attn: MultiheadAttention, + ffn: FeedForwardNetwork, + kernel_size: int, + stride: int, + *, + dropout_p: float = 0.1, + layer_norm_factory: Optional[LayerNormFactory] = None, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param self_attn: + The self attention layer. + :param ffn: + The feed-forward network. + :param kernel_size: + The kernel size for 1D pooling convolutions. + :param stride: + The stride for 1D pooling convolutions. + :param dropout_p: + The dropout probability on outputs of the self attention layer and + the feed-forward network. + :param layer_norm_factory: + The factory to use to construct the Layer Normalization modules. 
+ """ + model_dim = self_attn.model_dim + + super().__init__(model_dim) + + if layer_norm_factory is None: + layer_norm_factory = create_standard_layer_norm + + self.kernel_size = kernel_size + self.stride = stride + + self.residual_layer_norm = layer_norm_factory( + model_dim, device=device, dtype=dtype + ) + + self.residual_conv = Conv1d( + model_dim, + model_dim * 2, + kernel_size, + stride, + padding=kernel_size // 2, + device=device, + dtype=dtype, + ) + + self.residual_activation = GLU(dim=1) + + self.self_attn_layer_norm = layer_norm_factory( + model_dim, device=device, dtype=dtype + ) + + self.self_attn_conv = Conv1d( + model_dim, + model_dim * 2, + kernel_size, + stride, + padding=kernel_size // 2, + device=device, + dtype=dtype, + ) + + self.self_attn_activation = GLU(dim=1) + + self.self_attn = self_attn + + if dropout_p > 0.0: + self.self_attn_dropout = Dropout(dropout_p) + else: + self.register_module("self_attn_dropout", None) + + self.ffn_layer_norm = layer_norm_factory(model_dim, device=device, dtype=dtype) + + self.ffn = ffn + + if dropout_p > 0.0: + self.ffn_dropout = Dropout(dropout_p) + else: + self.register_module("ffn_dropout", None) + + @finaloverride + def forward( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + self_attn_mask: Optional[AttentionMask] = None, + ) -> Tuple[Tensor, Optional[PaddingMask]]: + seqs, padding_mask = self._forward_self_attn(seqs, padding_mask, self_attn_mask) + + seqs = self._forward_ffn(seqs) + + return seqs, padding_mask + + def _forward_self_attn( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + self_attn_mask: Optional[AttentionMask], + ) -> Tuple[Tensor, Optional[PaddingMask]]: + residual = self.residual_layer_norm(seqs) + + # Apply pooling to the residual to match the sequence length of the + # multi-head attention output. + # (N, S, M) -> (N, M, S) + residual = residual.transpose(1, 2) + + residual = self.residual_conv(residual) + + residual = self.residual_activation(residual) + + # (N, M, S) -> (N, S, M) + residual = residual.transpose(1, 2) + + seqs = self.self_attn_layer_norm(seqs) + + # Apply pooling before feeding to the multihead-attention layer. + # (N, S, M) -> (N, M, S) + seqs = seqs.transpose(1, 2) + + seqs = self.self_attn_conv(seqs) + + seqs = self.self_attn_activation(seqs) + + # (N, M, S) -> (N, S, M) + seqs = seqs.transpose(1, 2) + + padding_mask = _compute_new_padding_mask( + seqs, padding_mask, self.kernel_size, self.stride + ) + + # The rest of the computation is identical to a vanilla Transformer + # encoder layer. + seqs = self.self_attn( + seqs, + padding_mask, + keys=seqs, + key_padding_mask=padding_mask, + values=seqs, + attn_mask=self_attn_mask, + ) + + if self.self_attn_dropout is not None: + seqs = self.self_attn_dropout(seqs) + + seqs = seqs + residual + + return seqs, padding_mask + + def _forward_ffn(self, seqs: Tensor) -> Tensor: + residual = seqs + + seqs = self.ffn_layer_norm(seqs) + + seqs = self.ffn(seqs) + + if self.ffn_dropout is not None: + seqs = self.ffn_dropout(seqs) + + return seqs + residual + + def extra_repr(self) -> str: + """:meta private:""" + s = super().extra_repr() + + return s + f", kernel_size={self.kernel_size}, stride={self.stride}" + + +@final +class UnitYConformerAdaptorLayer(TransformerEncoderLayer): + """Represents a variant of M-Adaptor layer described in + :cite:t`https://doi.org/10.48550/arxiv.2207.00952`. 
+ + The main difference from the paper is that this variant uses a Conformer + block which empirically showed better performance when used with Conformer- + based speech encoder architectures such as w2v-BERT. + """ + + kernel_size: int + stride: int + layer_norm: Optional[LayerNorm] + conv: Conv1d + activation: GLU + block: ConformerBlock + + def __init__( + self, + block: ConformerBlock, + kernel_size: int, + stride: int, + *, + layer_norm: bool = False, + layer_norm_factory: Optional[LayerNormFactory] = None, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param block: + The Conformer block to use. + :param kernel_size: + The kernel size for 1D pooling convolutions. + :param stride: + The stride for 1D pooling convolutions. + :param layer_norm: + If ``True``, applies Layer Normalization to inputs before pooling. + :param layer_norm_factory: + The factory to use to construct the Layer Normalization modules. + """ + super().__init__(block.model_dim) + + if layer_norm_factory is None: + layer_norm_factory = create_standard_layer_norm + + self.kernel_size = kernel_size + self.stride = stride + + if layer_norm: + self.layer_norm = layer_norm_factory( + self.model_dim, device=device, dtype=dtype + ) + else: + self.register_module("layer_norm", None) + + self.conv = Conv1d( + self.model_dim, + self.model_dim * 2, + kernel_size, + stride, + padding=kernel_size // 2, + device=device, + dtype=dtype, + ) + + self.activation = GLU(dim=1) + + self.block = block + + @finaloverride + def forward( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + self_attn_mask: Optional[AttentionMask] = None, + ) -> Tuple[Tensor, Optional[PaddingMask]]: + if self.layer_norm is not None: + seqs = self.layer_norm(seqs) + + # Apply pooling before feeding to the Conformer block. + # (N, S, M) -> (N, M, S) + seqs = seqs.transpose(1, 2) + + seqs = self.conv(seqs) + + seqs = self.activation(seqs) + + # (N, M, S) -> (N, S, M) + seqs = seqs.transpose(1, 2) + + padding_mask = _compute_new_padding_mask( + seqs, padding_mask, self.kernel_size, self.stride + ) + + return self.block(seqs, padding_mask, self_attn_mask) # type: ignore[no-any-return] + + def extra_repr(self) -> str: + """:meta private:""" + s = super().extra_repr() + + return s + f", kernel_size={self.kernel_size}, stride={self.stride}" + + +def _compute_new_padding_mask( + seqs: Tensor, padding_mask: Optional[PaddingMask], kernel_size: int, stride: int +) -> Optional[PaddingMask]: + if padding_mask is None: + return padding_mask + + pad = kernel_size // 2 + + seq_lens = ((padding_mask.seq_lens + 2 * pad - kernel_size) / stride) + 1 + + seq_lens = seq_lens.floor().to(torch.int64) + + return PaddingMask(seq_lens, batch_seq_len=seqs.size(1)) diff --git a/seamless_communication/src/seamless_communication/models/unity/builder.py b/seamless_communication/src/seamless_communication/models/unity/builder.py new file mode 100644 index 0000000..f7d6b6d --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/builder.py @@ -0,0 +1,517 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
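The file below registers the UnitY architectures in `unity_archs` and exposes `create_unity_model`, which wires the wav2vec 2.0 speech encoder, the NLLB text encoder/decoder, the T2U model and the optional prosody encoder together. A minimal construction sketch, assuming the "base_v2" arch registered further down and illustrative device/dtype values (pretrained weights would normally be obtained through `load_unity_model` instead):

import torch

from seamless_communication.models.unity import create_unity_model, unity_archs

# Look up a registered architecture and build randomly initialized modules for it.
config = unity_archs.get_config("base_v2")
model = create_unity_model(config, device=torch.device("cpu"), dtype=torch.float32)
model.eval()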
+ +from dataclasses import dataclass +from typing import Optional, Union + +from fairseq2.models.conformer import ConformerBlock, ConformerConvolution +from fairseq2.models.nllb import NllbBuilder, NllbConfig, nllb_archs +from fairseq2.models.utils.arch_registry import ArchitectureRegistry +from fairseq2.models.w2vbert import w2vbert_archs +from fairseq2.models.wav2vec2 import Wav2Vec2EncoderBuilder, Wav2Vec2EncoderConfig +from fairseq2.nn.projection import TiedProjection +from fairseq2.nn.transformer import ( + FeedForwardNetwork, + MultiheadAttention, + StandardFeedForwardNetwork, + StandardMultiheadAttention, + TransformerEncoder, + TransformerEncoderLayer, + TransformerNormOrder, + create_default_sdpa, +) +from fairseq2.typing import DataType, Device, override +from torch.nn import GELU, ReLU + +from seamless_communication.models.generator.ecapa_tdnn_builder import ( + EcapaTDNNBuilder, + EcapaTDNNConfig, + ecapa_tdnn_archs, +) +from seamless_communication.models.unity.adaptor_block import ( + UnitYConformerAdaptorLayer, + UnitYEncoderAdaptor, + UnitYTransformerAdaptorLayer, +) +from seamless_communication.models.unity.model import UnitYModel +from seamless_communication.models.unity.t2u_builder import ( + UnitYNART2UBuilder, + UnitYT2UBuilder, + UnitYT2UConfig, + unity_t2u_archs, +) +from seamless_communication.models.conformer_shaw import ( + ConformerShawEncoderBuilder, + ConformerShawEncoderConfig, + conformer_shaw_archs, +) + + +@dataclass +class UnitYConfig: + """Holds the configuration of a UnitY model as described in + :cite:t`https://doi.org/10.48550/arxiv.2212.08055`""" + + model_dim: int + """The dimensionality of the model.""" + + w2v2_encoder_config: Wav2Vec2EncoderConfig + """The configuration of the underlying wav2vec 2.0 encoder.""" + + mt_model_config: NllbConfig + """The configuration of the underlying MT text encoder-decoder.""" + + t2u_config: Optional[UnitYT2UConfig] + """The configuration of the UnitY T2U sub-model.""" + + prosody_encoder_config: Optional[EcapaTDNNConfig] + """The configuration of the expressive prosody encoder.""" + + use_text_encoder: bool + """If ``True``, uses an aligned MT encoder for the MT task.""" + + use_text_decoder: bool + """If ``False``, skips loading a text decoder, to be used with a Monotonic decoder.""" + + use_conformer_adaptor: bool + """If ``True``, uses a Conformer-based adaptor block.""" + + use_gelu: bool + """If ``True``, uses GELU activation function in feed-forward networks of + adaptor blocks and decoder layers.""" + + num_adaptor_layers: int + """The number of Transformer encoder layers in the adaptor block.""" + + adaptor_kernel_size: int + """The kernel size of 1D convolutions in the adaptor block.""" + + adaptor_stride: int + """The stride of 1D convolutions in the adaptor block.""" + + adaptor_layer_norm: bool + """If ``True``, applies Layer Normalization to outputs of the underlying + encoder in the adaptor block.""" + + adaptor_dropout_p: float + """The dropout probability in Transformer layers of the adaptor block.""" + + +unity_archs = ArchitectureRegistry[UnitYConfig]("unity") + +unity_arch = unity_archs.decorator + + +@unity_arch("base") +def _base() -> UnitYConfig: + w2vbert_config = w2vbert_archs.get_config("600m") + + mt_model_config: NllbConfig = nllb_archs.get_config("dense_1b") + + mt_model_config.vocab_info.size = 256102 # NLLB-100 + + t2u_config = unity_t2u_archs.get_config("base") + + return UnitYConfig( + model_dim=1024, + w2v2_encoder_config=w2vbert_config.w2v2_config.encoder_config, + 
mt_model_config=mt_model_config, + t2u_config=t2u_config, + prosody_encoder_config=None, + use_text_encoder=True, + use_text_decoder=True, + use_conformer_adaptor=False, + use_gelu=False, + num_adaptor_layers=1, + adaptor_kernel_size=8, + adaptor_stride=8, + adaptor_layer_norm=True, + adaptor_dropout_p=0.1, + ) + + +@unity_arch("medium") +def _medium() -> UnitYConfig: + w2vbert_config = w2vbert_archs.get_config("300m") + + mt_model_config: NllbConfig = nllb_archs.get_config("dense_600m") + + mt_model_config.vocab_info.size = 256206 # NLLB-200 + + t2u_config = unity_t2u_archs.get_config("medium") + + return UnitYConfig( + model_dim=1024, + w2v2_encoder_config=w2vbert_config.w2v2_config.encoder_config, + mt_model_config=mt_model_config, + t2u_config=t2u_config, + prosody_encoder_config=None, + use_text_encoder=True, + use_text_decoder=True, + use_conformer_adaptor=False, + use_gelu=False, + num_adaptor_layers=1, + adaptor_kernel_size=8, + adaptor_stride=8, + adaptor_layer_norm=True, + adaptor_dropout_p=0.1, + ) + + +@unity_arch("base_v2") +def _base_v2() -> UnitYConfig: + conformer_shaw_encoder_config = conformer_shaw_archs.get_config("600m") + + mt_model_config: NllbConfig = nllb_archs.get_config("dense_1b") + + mt_model_config.vocab_info.size = 256102 # NLLB-100 + + mt_model_config.max_seq_len = 4096 + + t2u_config = unity_t2u_archs.get_config("base_nar") + + return UnitYConfig( + model_dim=1024, + w2v2_encoder_config=conformer_shaw_encoder_config, + mt_model_config=mt_model_config, + t2u_config=t2u_config, + prosody_encoder_config=None, + use_text_encoder=True, + use_text_decoder=True, + use_conformer_adaptor=False, + use_gelu=False, + num_adaptor_layers=1, + adaptor_kernel_size=8, + adaptor_stride=8, + adaptor_layer_norm=True, + adaptor_dropout_p=0.1, + ) + + +@unity_arch("expressivity_v2") +def _expressivity_v2() -> UnitYConfig: + conformer_shaw_encoder_config = conformer_shaw_archs.get_config("600m") + + mt_model_config: NllbConfig = nllb_archs.get_config("dense_1b") + + mt_model_config.vocab_info.size = 256102 # NLLB-100 + + mt_model_config.max_seq_len = 10000 + + t2u_config = unity_t2u_archs.get_config("expressivity_nar") + + prosody_encoder_config = ecapa_tdnn_archs.get_config("base") + + return UnitYConfig( + model_dim=1024, + w2v2_encoder_config=conformer_shaw_encoder_config, + mt_model_config=mt_model_config, + t2u_config=t2u_config, + prosody_encoder_config=prosody_encoder_config, + use_text_encoder=False, + use_text_decoder=True, + use_conformer_adaptor=False, + use_gelu=True, + num_adaptor_layers=1, + adaptor_kernel_size=8, + adaptor_stride=8, + adaptor_layer_norm=True, + adaptor_dropout_p=0.1, + ) + + +class UnitYBuilder: + """Builds modules of a UnitY model. + + To tweak the architecture, you can derive from this class and override the + corresponding methods. + """ + + config: UnitYConfig + w2v2_encoder_builder: Wav2Vec2EncoderBuilder + mt_model_builder: NllbBuilder + t2u_builder: Union[UnitYT2UBuilder, UnitYNART2UBuilder, None] + prosody_encoder_builder: Optional[EcapaTDNNBuilder] + device: Optional[Device] + dtype: Optional[DataType] + + def __init__( + self, + config: UnitYConfig, + w2v2_encoder_builder: Wav2Vec2EncoderBuilder, + mt_model_builder: NllbBuilder, + t2u_builder: Union[UnitYT2UBuilder, UnitYNART2UBuilder, None], + prosody_encoder_builder: Optional[EcapaTDNNBuilder], + *, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param config: + The configuration to use. 
+ :param w2v2_encoder_builder: + The wav2vec 2.0 encoder builder. + :param mt_model_builder: + The MT model builder. + :param t2u_builder: + The UnitY T2U model builder. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + if w2v2_encoder_builder.config.model_dim != config.model_dim: + raise ValueError( + f"`model_dim` and `model_dim` of `w2v2_encoder_builder.config` must be equal, but are {config.model_dim} and {w2v2_encoder_builder.config.model_dim} instead." + ) + + if mt_model_builder.config.model_dim != config.model_dim: + raise ValueError( + f"`model_dim` and `model_dim` of `mt_model_builder.config` must be equal, but are {config.model_dim} and {mt_model_builder.config.model_dim} instead." + ) + + if t2u_builder is not None and t2u_builder.config.model_dim != config.model_dim: + raise ValueError( + f"`model_dim` and `model_dim` of `t2u_builder.config` must be equal, but are {config.model_dim} and {t2u_builder.config.model_dim} instead." + ) + + self.config = config + + self.w2v2_encoder_builder = w2v2_encoder_builder + self.mt_model_builder = mt_model_builder + self.t2u_builder = t2u_builder + self.prosody_encoder_builder = prosody_encoder_builder + + self.device, self.dtype = device, dtype + + def build_model(self) -> UnitYModel: + """Build a model.""" + speech_encoder_frontend = self.w2v2_encoder_builder.build_frontend() + speech_encoder = self.build_speech_encoder() + + if self.config.use_text_encoder: + text_embed = self.mt_model_builder.build_embedding() + text_encoder_frontend = self.mt_model_builder.build_frontend(text_embed) + text_encoder = self.mt_model_builder.build_encoder() + else: + text_embed = None + text_encoder_frontend = None + text_encoder = None + + if self.config.use_text_decoder: + if text_embed is None: + text_embed = self.mt_model_builder.build_embedding() + + if text_encoder_frontend is not None: + # We use shared embedding as in NLLB. + text_decoder_frontend = text_encoder_frontend + else: + text_decoder_frontend = self.mt_model_builder.build_frontend(text_embed) + + text_decoder = self.mt_model_builder.build_decoder() + final_proj = TiedProjection(text_embed.weight, bias=None) + else: + text_decoder_frontend = None + text_decoder = None + final_proj = None + + if self.t2u_builder is None: + t2u_model = None + else: + t2u_model = self.t2u_builder.build_model() + + if self.prosody_encoder_builder is None: + prosody_encoder_model = None + else: + prosody_encoder_model = self.prosody_encoder_builder.build_model() + + return UnitYModel( + speech_encoder_frontend, + speech_encoder, + text_encoder_frontend, + text_encoder, + text_decoder_frontend, + text_decoder, + final_proj, + t2u_model, + self.config.mt_model_config.vocab_info, + prosody_encoder_model, + ) + + def build_speech_encoder(self) -> TransformerEncoder: + """Build a speech Transformer encoder.""" + w2v2_encoder = self.w2v2_encoder_builder.build_encoder() + + # For Conformer-based wav2vec 2.0 architectures (e.g. w2v-BERT), we + # typically use a special type of adaptor layer. 
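+        # `build_adaptor_layer` returns a UnitYTransformerAdaptorLayer (pooled MHA
+        # plus FFN), while `build_conformer_adaptor_layer` wraps a ConformerBlock;
+        # both downsample the encoder output with stride `adaptor_stride`.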
+ if not self.config.use_conformer_adaptor: + build_adaptor_layer = self.build_adaptor_layer + else: + build_adaptor_layer = self.build_conformer_adaptor_layer + + num_layers = self.config.num_adaptor_layers + + layers = [build_adaptor_layer(i) for i in range(num_layers)] + + return UnitYEncoderAdaptor( + w2v2_encoder, + layers, + inner_layer_norm=self.config.adaptor_layer_norm, + device=self.device, + dtype=self.dtype, + ) + + def build_adaptor_layer(self, idx: int) -> TransformerEncoderLayer: + """Build a Transformer-based encoder adaptor layer.""" + self_attn = self.build_adaptor_attention( + self.w2v2_encoder_builder.config.num_encoder_attn_heads + ) + + ffn = StandardFeedForwardNetwork( + self.config.model_dim, + self.w2v2_encoder_builder.config.ffn_inner_dim, + inner_activation=GELU() if self.config.use_gelu else ReLU(), + bias=True, + device=self.device, + dtype=self.dtype, + ) + + return UnitYTransformerAdaptorLayer( + self_attn, + ffn, + self.config.adaptor_kernel_size, + self.config.adaptor_stride, + dropout_p=self.config.adaptor_dropout_p, + device=self.device, + dtype=self.dtype, + ) + + def build_conformer_adaptor_layer(self, idx: int) -> TransformerEncoderLayer: + """Build a Conformer-based encoder adaptor layer.""" + ffn1 = self.w2v2_encoder_builder.build_ffn(use_swish=True) + + # Empirically shown that, in adaptor layers, vanilla MHA performs better + # than MHA with relative positional encoding. + self_attn = self.build_adaptor_attention( + self.w2v2_encoder_builder.config.num_encoder_attn_heads + ) + + conv = ConformerConvolution( + self.w2v2_encoder_builder.config.model_dim, + self.w2v2_encoder_builder.config.depthwise_conv_kernel_size, + device=self.device, + dtype=self.dtype, + ) + + ffn2 = self.w2v2_encoder_builder.build_ffn(use_swish=True) + + block = ConformerBlock( + ffn1, + self_attn, + conv, + ffn2, + dropout_p=self.config.adaptor_dropout_p, + device=self.device, + dtype=self.dtype, + ) + + layer_norm = idx == 0 + + return UnitYConformerAdaptorLayer( + block, + self.config.adaptor_kernel_size, + self.config.adaptor_stride, + layer_norm=layer_norm, + device=self.device, + dtype=self.dtype, + ) + + def build_adaptor_attention(self, num_heads: int) -> MultiheadAttention: + """Build a Transformer multi-head attention layer in adaptor block.""" + sdpa = create_default_sdpa(attn_dropout_p=self.config.adaptor_dropout_p) + + return StandardMultiheadAttention( + self.config.model_dim, + num_heads, + sdpa=sdpa, + device=self.device, + dtype=self.dtype, + ) + + +class NllbWithGELUBuilder(NllbBuilder): + @override + def build_ffn(self) -> FeedForwardNetwork: + return StandardFeedForwardNetwork( + self.config.model_dim, + self.config.ffn_inner_dim, + bias=True, + inner_activation=GELU(), + norm_order=TransformerNormOrder.PRE, + device=self.device, + dtype=self.dtype, + ) + + +def create_unity_model( + config: UnitYConfig, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, +) -> UnitYModel: + """Create a UnitY model. + + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. 
+ """ + if isinstance(config.w2v2_encoder_config, ConformerShawEncoderConfig): + w2v2_encoder_builder: Wav2Vec2EncoderBuilder = ConformerShawEncoderBuilder( + config.w2v2_encoder_config, device=device, dtype=dtype + ) + else: + w2v2_encoder_builder = Wav2Vec2EncoderBuilder( + config.w2v2_encoder_config, device=device, dtype=dtype + ) + + t2u_builder: Union[UnitYT2UBuilder, UnitYNART2UBuilder, None] + + if config.t2u_config is None: + t2u_builder = None + elif config.t2u_config.nar_decoder_config is None: + t2u_builder = UnitYT2UBuilder(config.t2u_config, device=device, dtype=dtype) + else: + t2u_builder = UnitYNART2UBuilder(config.t2u_config, device=device, dtype=dtype) + + if config.prosody_encoder_config is None: + prosody_encoder_builder = None + else: + prosody_encoder_builder = EcapaTDNNBuilder( + config.prosody_encoder_config, device=device, dtype=dtype + ) + + if config.use_gelu: + mt_model_builder: NllbBuilder = NllbWithGELUBuilder( + config.mt_model_config, device=device, dtype=dtype + ) + else: + mt_model_builder = NllbBuilder( + config.mt_model_config, device=device, dtype=dtype + ) + + unity_builder = UnitYBuilder( + config, + w2v2_encoder_builder, + mt_model_builder, + t2u_builder, + prosody_encoder_builder, + device=device, + dtype=dtype, + ) + + return unity_builder.build_model() diff --git a/seamless_communication/src/seamless_communication/models/unity/char_tokenizer.py b/seamless_communication/src/seamless_communication/models/unity/char_tokenizer.py new file mode 100644 index 0000000..2256a77 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/char_tokenizer.py @@ -0,0 +1,113 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Optional, Union, final + +from fairseq2.assets import ( + AssetDownloadManager, + AssetStore, + asset_store, + download_manager, +) +from fairseq2.assets.card import AssetCard +from fairseq2.data.text import ( + SentencePieceDecoder, + SentencePieceEncoder, + SentencePieceModel, + TextTokenDecoder, + TextTokenEncoder, + TextTokenizer, + vocab_info_from_sentencepiece, +) +from fairseq2.data.typing import PathLike +from fairseq2.typing import Device, finaloverride + + +@final +class CharTokenizer(TextTokenizer): + """A character-level tokenizer used during non-autoregressive T2U decoding.""" + + model: SentencePieceModel + + def __init__(self, pathname: PathLike) -> None: + """ + :param pathname: + The pathname of the SentencePiece model file. 
+ """ + self.model = SentencePieceModel(pathname) + + vocab_info = vocab_info_from_sentencepiece(self.model) + + super().__init__(vocab_info) + + @finaloverride + def create_encoder( + self, + task: Optional[str] = None, + lang: Optional[str] = None, + mode: Optional[str] = None, + device: Optional[Device] = None, + pin_memory: bool = False, + ) -> TextTokenEncoder: + """Creates a character level encoder.""" + return SentencePieceEncoder( + self.model, + device=device, + pin_memory=pin_memory, + ) + + @finaloverride + def create_raw_encoder( + self, *, device: Optional[Device] = None, pin_memory: bool = False + ) -> TextTokenEncoder: + return SentencePieceEncoder(self.model, device=device, pin_memory=pin_memory) + + @finaloverride + def create_decoder(self) -> TextTokenDecoder: + return SentencePieceDecoder(self.model) + + +class UnitYCharTokenizerLoader: + """Loads character-level tokenizers of UnitY models.""" + + def __init__( + self, asset_store: AssetStore, download_manager: AssetDownloadManager + ) -> None: + """ + :param asset_store: + The asset store to retrieve the model information. + :param download_manager: + The download manager to use. + """ + self.asset_store = asset_store + self.download_manager = download_manager + + def __call__( + self, + model_name_or_card: Union[str, AssetCard], + force: bool = False, + progress: bool = True, + ) -> CharTokenizer: + """ + :param model_name_or_card: + The name of the model or an already loaded AssetCard + """ + + if isinstance(model_name_or_card, AssetCard): + card = model_name_or_card + else: + card = self.asset_store.retrieve_card(model_name_or_card) + + uri = card.field("char_tokenizer").as_uri() + + pathname = self.download_manager.download_tokenizer( + uri, card.name, force=force, progress=progress + ) + + return CharTokenizer(pathname) + + +load_unity_char_tokenizer = UnitYCharTokenizerLoader(asset_store, download_manager) diff --git a/seamless_communication/src/seamless_communication/models/unity/fft_decoder.py b/seamless_communication/src/seamless_communication/models/unity/fft_decoder.py new file mode 100644 index 0000000..8b2c450 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/fft_decoder.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Iterable, Optional, Tuple, final + +from fairseq2.nn.module_list import ModuleList +from fairseq2.nn.normalization import LayerNorm +from fairseq2.nn.padding import PaddingMask +from fairseq2.nn.transformer import TransformerNormOrder, create_standard_layer_norm +from fairseq2.typing import DataType, Device, finaloverride +from torch import Tensor +from torch.nn import Module + +from seamless_communication.models.unity.fft_decoder_layer import ( + FeedForwardTransformerLayer, +) + + +@final +class FeedForwardTransformer(Module): + """Represents a Feedforward Transformer decoder.""" + + model_dim: int + layer_norm: Optional[LayerNorm] + norm_order: TransformerNormOrder + + def __init__( + self, + layers: Iterable[FeedForwardTransformerLayer], + *, + norm_order: TransformerNormOrder = TransformerNormOrder.POST, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param layers: + The decoder layers. + :param norm_order: + The Layer Normalization order to use. 
+ """ + super().__init__() + + layer_list = ModuleList(layers) + + if not layer_list: + raise ValueError("`layers` must be non-empty.") + + self.model_dim = layer_list[0].model_dim + + self.layers = layer_list + + if norm_order != TransformerNormOrder.POST: + self.layer_norm = create_standard_layer_norm( + self.model_dim, device=device, dtype=dtype + ) + else: + self.register_module("layer_norm", None) + + self.norm_order = norm_order + + @finaloverride + def forward( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + film_cond_emb: Optional[Tensor] = None, + ) -> Tuple[Tensor, Optional[PaddingMask]]: + for layer in self.layers.drop_iter(): + seqs, padding_mask = layer(seqs, padding_mask, film_cond_emb=film_cond_emb) + + if self.layer_norm is not None: + seqs = self.layer_norm(seqs) + + return seqs, padding_mask + + def extra_repr(self) -> str: + """:meta private:""" + s = super().extra_repr() + + return f"{s}, norm_order={self.norm_order}" diff --git a/seamless_communication/src/seamless_communication/models/unity/fft_decoder_layer.py b/seamless_communication/src/seamless_communication/models/unity/fft_decoder_layer.py new file mode 100644 index 0000000..39b6915 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/fft_decoder_layer.py @@ -0,0 +1,231 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Optional, Tuple, final + +from fairseq2.nn.normalization import LayerNorm +from fairseq2.nn.padding import PaddingMask, apply_padding_mask +from fairseq2.nn.transformer import MultiheadAttention, create_standard_layer_norm +from fairseq2.typing import DataType, Device, finaloverride +from torch import Tensor +from torch.nn import Conv1d, Dropout, Module, ReLU + +from seamless_communication.models.unity.film import FiLM + + +@final +class Conv1dBlock(Module): + """Represents the Conv1d block within the FFT Block as described in + :cite:t:`https://arxiv.org/pdf/1905.09263.pdf`.""" + + conv1: Conv1d + activation: ReLU + conv2: Conv1d + + def __init__( + self, + model_dim: int, + inner_dim: int, + kernel_size: int, + bias: bool = True, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param model_dim: + The dimensionality of the model. + :param inner_dim: + The inner dimensionality between the two convolutional layers. + :param kernel_size: + The kernel size of the Conv1d layers. + :param bias: + If ``True``, both the inner and output projections learn an additive + bias. + """ + super().__init__() + + self.conv1 = Conv1d( + model_dim, + inner_dim, + kernel_size, + stride=1, + padding="same", + bias=bias, + device=device, + dtype=dtype, + ) + + self.activation = ReLU() + + self.conv2 = Conv1d( + inner_dim, + model_dim, + kernel_size, + stride=1, + padding="same", + bias=bias, + device=device, + dtype=dtype, + ) + + @finaloverride + def forward(self, seqs: Tensor, padding_mask: Optional[PaddingMask]) -> Tensor: + # Ensure that we do not leak padded positions in the convolution layer. 
+ seqs = apply_padding_mask(seqs, padding_mask) + + # (N, S, M) -> (N, M, S) + seqs = seqs.transpose(1, 2) + + # (N, M, S) -> (N, inner_dim, S) + seqs = self.conv1(seqs) + + # (N, inner_dim, S) -> (N, S, inner_dim) + seqs = seqs.transpose(1, 2) + + seqs = apply_padding_mask(seqs, padding_mask) + + seqs = self.activation(seqs) + + # (N, S, inner_dim) -> (N, inner_dim, S) + seqs = seqs.transpose(1, 2) + + # (N, inner_dim, S) -> (N, M, S) + seqs = self.conv2(seqs) + + # (N, M, S) -> (N, S, M) + seqs = seqs.transpose(1, 2) + + return seqs + + +@final +class FeedForwardTransformerLayer(Module): + """Represents the FFT Block as described in + :cite:t:`https://arxiv.org/pdf/1905.09263.pdf`.""" + + model_dim: int + self_attn: MultiheadAttention + self_attn_dropout: Optional[Dropout] + self_attn_layer_norm: LayerNorm + conv1d: Conv1dBlock + conv1d_dropout: Optional[Dropout] + conv1d_layer_norm: LayerNorm + film: Optional[FiLM] + + def __init__( + self, + self_attn: MultiheadAttention, + conv1d: Conv1dBlock, + dropout_p: float = 0.1, + conv1d_dropout_p: float = 0.1, + use_film: bool = False, + film_cond_dim: int = 512, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param self_attn: + The self attention layer. + :param conv1d: + The conv1d block. + :param dropout_p: + The dropout probability on the outputs of the self attention layer. + :param conv1d_dropout_p: + The dropout probability on the outputs of the conv1d block. + :param use_film: + Whether to condition on a fixed-size vector through FiLM. + :param film_cond_dim: + The dim of fixed-size vector conditioned on during model forward. + """ + super().__init__() + + self.model_dim = self_attn.model_dim + + self.self_attn = self_attn + + if dropout_p > 0.0: + self.self_attn_dropout = Dropout(dropout_p) + else: + self.register_module("self_attn_dropout", None) + + layer_norm_factory = create_standard_layer_norm + + self.self_attn_layer_norm = layer_norm_factory( + self.model_dim, device=device, dtype=dtype + ) + + self.conv1d = conv1d + + if conv1d_dropout_p > 0.0: + self.conv1d_dropout = Dropout(conv1d_dropout_p) + else: + self.register_module("conv1d_dropout", None) + + self.conv1d_layer_norm = layer_norm_factory( + self.model_dim, device=device, dtype=dtype + ) + + if use_film: + self.film = FiLM(film_cond_dim, self.model_dim, device=device, dtype=dtype) + else: + self.register_module("film", None) + + @finaloverride + def forward( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + film_cond_emb: Optional[Tensor] = None, + ) -> Tuple[Tensor, Optional[PaddingMask]]: + seqs = self._forward_self_attn(seqs, padding_mask) + + seqs = self._forward_conv1d(seqs, padding_mask) + + if self.film is not None and film_cond_emb is not None: + seqs = self.film(seqs, film_cond_emb) + seqs = apply_padding_mask(seqs, padding_mask) + + return seqs, padding_mask + + def _forward_self_attn( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + ) -> Tensor: + residual = seqs + + seqs = self.self_attn( + seqs, + padding_mask, + keys=seqs, + key_padding_mask=padding_mask, + values=seqs, + ) + + if self.self_attn_dropout is not None: + seqs = self.self_attn_dropout(seqs) + + seqs = seqs + residual + + seqs = self.self_attn_layer_norm(seqs) + + return seqs + + def _forward_conv1d( + self, seqs: Tensor, padding_mask: Optional[PaddingMask] + ) -> Tensor: + residual = seqs + + seqs = self.conv1d(seqs, padding_mask) + + if self.conv1d_dropout is not None: + seqs = self.conv1d_dropout(seqs) + + seqs = seqs 
+ residual + + seqs = self.conv1d_layer_norm(seqs) + + return seqs diff --git a/seamless_communication/src/seamless_communication/models/unity/film.py b/seamless_communication/src/seamless_communication/models/unity/film.py new file mode 100644 index 0000000..f4f03dd --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/film.py @@ -0,0 +1,68 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. +from typing import Optional + +import torch +from fairseq2.nn.projection import Linear +from fairseq2.typing import DataType, Device +from torch import Tensor +from torch.nn import Module, Parameter + + +class FiLM(Module): + """ + A Feature-wise Linear Modulation Layer from + 'FiLM: Visual Reasoning with a General Conditioning Layer' + """ + + proj: Linear + s_gamma: Parameter + s_beta: Parameter + + def __init__( + self, + cond_dim: int, + embed_dim: int, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + super().__init__() + + self.proj = Linear( + cond_dim, 2 * embed_dim, bias=True, device=device, dtype=dtype + ) + + self.s_gamma = Parameter( + torch.ones( + 1, + device=device, + dtype=dtype, + ), + requires_grad=True, + ) + + self.s_beta = Parameter( + torch.ones( + 1, + device=device, + dtype=dtype, + ), + requires_grad=True, + ) + + def forward(self, x: Tensor, cond_embs: Tensor) -> Tensor: + """ + x -- [B, T, H] + cond_emb -- [B, 1, C] + """ + # get trainable gamma, beta + gammas, betas = self.proj(cond_embs).chunk(2, dim=-1) # B x 1 x H + + # apply film + gammas = self.s_gamma * gammas.expand_as(x) + betas = self.s_beta * betas.expand_as(x) + + return (gammas + 1.0) * x + betas # type: ignore[no-any-return] diff --git a/seamless_communication/src/seamless_communication/models/unity/length_regulator.py b/seamless_communication/src/seamless_communication/models/unity/length_regulator.py new file mode 100644 index 0000000..c06ca5e --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/length_regulator.py @@ -0,0 +1,321 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
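`FiLM` above applies feature-wise linear modulation: the conditioning vector is projected to per-channel scales and shifts, which are then applied to every time step of the input. A shape-level sketch with illustrative dimensions (512 matches the default `film_cond_dim` used in this patch, 1024 the model dim of the registered configs):

import torch

from seamless_communication.models.unity.film import FiLM

film = FiLM(cond_dim=512, embed_dim=1024)

x = torch.randn(2, 37, 1024)   # decoder states: (batch, time, embed_dim)
cond = torch.randn(2, 1, 512)  # conditioning vector, e.g. a prosody embedding
y = film(x, cond)              # modulated states, same shape as x: (2, 37, 1024)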
+from typing import Literal, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from fairseq2.nn.normalization import LayerNorm +from fairseq2.nn.padding import PaddingMask, apply_padding_mask, to_padding_mask +from fairseq2.nn.projection import Linear +from fairseq2.nn.transformer import create_standard_layer_norm +from fairseq2.typing import DataType, Device +from torch import Tensor +from torch.nn import Conv1d, Dropout, Module, ReLU, Sequential + +from seamless_communication.models.unity.film import FiLM + + +class HardUpsampling(Module): + """Upsamples sequences in a deterministic way as governed by durations.""" + + def forward(self, seqs: Tensor, durations: Tensor) -> Tuple[Tensor, Tensor]: + # seqs: (N, S, M), durations: (N, S) + if durations.dtype not in (torch.int16, torch.int32, torch.int64): + raise TypeError("The durations tensor should have an integer dtype.") + + upsampled_seq_lens = durations.sum(dim=1) + max_len = int(upsampled_seq_lens.max().item()) + N, _, M = seqs.shape + upsampled_seqs = seqs.new_zeros((N, max_len, M)) + + for b in range(N): + upsampled_seqs[b, : upsampled_seq_lens[b]] = seqs[b].repeat_interleave( + durations[b], dim=0 + ) + + return upsampled_seqs, upsampled_seq_lens + + +class GaussianUpsampling(Module): + """Gaussian upsampling with fixed temperature as in: + https://arxiv.org/abs/2010.04301 + """ + + def __init__(self, delta: float = 0.1): + super().__init__() + self.delta = delta + + def forward( + self, + x: Tensor, + durations: Tensor, + padding_mask: Optional[PaddingMask] = None, + ) -> Tuple[Tensor, Tensor]: + """Upsample hidden states according to durations. + Args: + x (Tensor): Batched hidden state to be expanded (B, T_text, C). + durations (Tensor): Batched token duration (B, T_text). + padding_mask (Tensor): Mask tensor (B, T_text). + Returns: + Tensor: Expanded hidden state (B, T_feat, C). + Tensor: Output lengths (B,). + """ + out_lens = durations.sum(dim=1) + y_mask = to_padding_mask(out_lens, max(out_lens)) + + B = durations.size(0) + if durations.sum() == 0: + # NOTE(kan-bayashi): This case must not be happened in teacher forcing. + # It will be happened in inference with a bad duration predictor. + # So we do not need to care the padded sequence case here. 
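+            # Rows whose predicted durations sum to zero get a duration of 1 for
+            # every token, so the upsampling still yields a non-empty output.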
+ durations[durations.sum(dim=1).eq(0)] = 1 + + if y_mask is None: + T_feat = durations.sum().int() + else: + T_feat = y_mask.size(-1) + + t = torch.arange(0, T_feat).unsqueeze(0).repeat(B, 1).to(x) + if y_mask is not None: + t = t * y_mask.float() + + c = durations.cumsum(dim=-1) - durations / 2 + energy = -1 * self.delta * (t.unsqueeze(-1) - c.unsqueeze(1)) ** 2 + + if padding_mask is not None: + energy = energy.masked_fill( + ~padding_mask.materialize().unsqueeze(1).repeat(1, T_feat, 1), + -float("inf"), + ) + + p_attn = F.softmax(energy, dim=2).to(x) # (B, T_feat, T_text) + x = torch.matmul(p_attn, x) + return x, out_lens + + +class VariancePredictor(Module): + """Represents the duration/pitch/energy predictor as described in + :cite:t:`https://arxiv.org/pdf/2006.04558.pdf`""" + + conv1: Sequential + ln1: LayerNorm + dropout_module: Dropout + conv2: Sequential + ln2: LayerNorm + proj: Linear + film: Optional[FiLM] + + def __init__( + self, + encoder_embed_dim: int, + var_pred_hidden_dim: int, + var_pred_kernel_size: int, + var_pred_dropout: float, + bias: bool = True, + use_film: bool = False, + film_cond_dim: int = 512, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + super().__init__() + + self.conv1 = Sequential( + Conv1d( + encoder_embed_dim, + var_pred_hidden_dim, + var_pred_kernel_size, + stride=1, + padding="same", + bias=bias, + device=device, + dtype=dtype, + ), + ReLU(), + ) + + layer_norm_factory = create_standard_layer_norm + + self.ln1 = layer_norm_factory(var_pred_hidden_dim, device=device, dtype=dtype) + + self.dropout_module = Dropout(p=var_pred_dropout) + + self.conv2 = Sequential( + Conv1d( + var_pred_hidden_dim, + var_pred_hidden_dim, + var_pred_kernel_size, + stride=1, + padding="same", + bias=bias, + device=device, + dtype=dtype, + ), + ReLU(), + ) + + self.ln2 = layer_norm_factory(var_pred_hidden_dim, device=device, dtype=dtype) + + self.proj = Linear( + var_pred_hidden_dim, 1, bias=True, device=device, dtype=dtype + ) + + if use_film: + self.film = FiLM( + film_cond_dim, var_pred_hidden_dim, device=device, dtype=dtype + ) + else: + self.register_module("film", None) + + def forward( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask] = None, + film_cond_emb: Optional[Tensor] = None, + ) -> Tensor: + # Ensure that we do not leak padded positions in the convolution layer. 
+ seqs = apply_padding_mask(seqs, padding_mask) + + # (N, S, M) -> (N, M, S) + seqs = seqs.transpose(1, 2) + + # (N, M, S) -> (N, H, S) + seqs = self.conv1(seqs) + + # (N, H, S) -> (N, S, H) + seqs = seqs.transpose(1, 2) + + seqs = self.ln1(seqs) + + seqs = self.dropout_module(seqs) + + seqs = apply_padding_mask(seqs, padding_mask) + + # (N, S, H) -> (N, H, S) + seqs = seqs.transpose(1, 2) + + # (N, H, S) -> (N, H, S) + seqs = self.conv2(seqs) + + # (N, H, S) -> (N, S, H) + seqs = seqs.transpose(1, 2) + + seqs = self.ln2(seqs) + + seqs = self.dropout_module(seqs) + + seqs = apply_padding_mask(seqs, padding_mask) + + if self.film is not None and film_cond_emb is not None: + seqs = self.film(seqs, film_cond_emb) + seqs = apply_padding_mask(seqs, padding_mask) + + # (N, S, H) -> (N, S, 1) -> (N, S) + seqs = self.proj(seqs).squeeze(dim=2) + + return seqs + + +class VarianceAdaptor(Module): + """Represent the Variance adaptor as described in + :cite:t:`https://arxiv.org/pdf/2006.04558.pdf`""" + + duration_predictor: Optional[VariancePredictor] + pitch_predictor: Optional[VariancePredictor] + vuv_predictor: Optional[VariancePredictor] + energy_predictor: Optional[VariancePredictor] + length_regulator: Union[HardUpsampling, GaussianUpsampling] + + def __init__( + self, + duration_predictor: Optional[VariancePredictor] = None, + pitch_predictor: Optional[VariancePredictor] = None, + embed_pitch: Optional[Conv1d] = None, + vuv_predictor: Optional[VariancePredictor] = None, + energy_predictor: Optional[VariancePredictor] = None, + embed_energy: Optional[Conv1d] = None, + add_variance_parallel: bool = True, + upsampling_type: Literal["gaussian", "hard"] = "hard", + ): + super().__init__() + + if duration_predictor: + self.duration_predictor = duration_predictor + else: + self.register_module("duration_predictor", None) + + if pitch_predictor: + self.pitch_predictor = pitch_predictor + self.embed_pitch = embed_pitch + else: + self.register_module("pitch_predictor", None) + self.register_module("embed_pitch", None) + + if vuv_predictor: + self.vuv_predictor = vuv_predictor + else: + self.register_module("vuv_predictor", None) + + if energy_predictor: + self.energy_predictor = energy_predictor + self.embed_energy = embed_energy + else: + self.register_module("energy_predictor", None) + self.register_module("embed_energy", None) + + self.add_variance_parallel = add_variance_parallel + + if upsampling_type == "gaussian": + self.length_regulator = GaussianUpsampling() + else: + self.length_regulator = HardUpsampling() + + def forward( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + durations: Optional[Tensor] = None, + duration_factor: float = 1.0, + min_duration: int = 0, + film_cond_emb: Optional[Tensor] = None, + ) -> Tuple[Tensor, PaddingMask, Tensor]: + if self.duration_predictor is not None: + log_durations = self.duration_predictor(seqs, padding_mask, film_cond_emb) + durations = torch.clamp( + torch.round((torch.exp(log_durations) - 1) * duration_factor).long(), + min=min_duration, + ) + # We need to apply the padding_mask again since we clamp by min_duration. 
+ durations = apply_padding_mask(durations, padding_mask, pad_value=0) + + assert durations is not None + + if self.pitch_predictor is not None: + pitch_out = self.pitch_predictor(seqs, padding_mask, film_cond_emb) + if self.vuv_predictor is not None: + vuv_out = self.vuv_predictor(seqs, padding_mask, film_cond_emb) + pitch_out = pitch_out * (torch.sigmoid(vuv_out) >= 0.5) + + assert self.embed_pitch is not None + pitch_embed = self.embed_pitch(pitch_out.unsqueeze(1)).transpose(1, 2) + if not self.add_variance_parallel: + seqs = seqs + pitch_embed + + if self.energy_predictor is not None: + energy_out = self.energy_predictor(seqs, padding_mask, film_cond_emb) + + assert self.embed_energy is not None + energy_embed = self.embed_energy(energy_out.unsqueeze(1)).transpose(1, 2) + if self.add_variance_parallel: + seqs = seqs + pitch_embed + energy_embed + else: + seqs = seqs + energy_embed + + if isinstance(self.length_regulator, GaussianUpsampling): + seqs, seq_lens = self.length_regulator(seqs, durations, padding_mask) + else: + seqs, seq_lens = self.length_regulator(seqs, durations) + + return seqs, PaddingMask(seq_lens, batch_seq_len=seqs.size(1)), durations diff --git a/seamless_communication/src/seamless_communication/models/unity/loader.py b/seamless_communication/src/seamless_communication/models/unity/loader.py new file mode 100644 index 0000000..625d20b --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/loader.py @@ -0,0 +1,471 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Any, Dict, List, Mapping, Tuple, Union + +import torch +from fairseq2.assets import AssetStore, asset_store, download_manager +from fairseq2.assets.card import AssetCard, AssetCardFieldNotFoundError +from fairseq2.models.nllb import NllbConfig +from fairseq2.models.nllb.loader import NllbTokenizerLoader +from fairseq2.models.utils import ConfigLoader, ModelLoader +from fairseq2.models.utils.checkpoint import convert_fairseq_checkpoint + +from seamless_communication.models.unity.builder import ( + UnitYConfig, + create_unity_model, + unity_archs, +) +from seamless_communication.models.unity.char_tokenizer import load_unity_char_tokenizer +from seamless_communication.models.unity.model import UnitYModel +from seamless_communication.models.unity.unit_tokenizer import UnitTokenizer + + +def convert_unity_checkpoint( + checkpoint: Mapping[str, Any], config: UnitYConfig +) -> Mapping[str, Any]: + state_dict = checkpoint["model"] + + # Check if we have a fairseq2 checkpoint. + if "speech_encoder.inner.layers.0.self_attn_layer_norm.weight" in state_dict: + return checkpoint + + key_map = _fairseq_key_map(config) + + checkpoint = convert_fairseq_checkpoint(checkpoint, key_map) + + state_dict = checkpoint["model"] + + keys_to_delete = [] + + # ExpressiveUnitY model (from multi_arch codebase) + if config.prosody_encoder_config is not None: + encoder_key = "s2t_model.encoder" + decoder_key = "s2t_model.decoder" + t2u_decoder_key = "t2s_model.decoder" + # X2T/S2T + T2U model. + elif config.t2u_config is not None: + encoder_key = "encoder" + decoder_key = "target_letter_decoder" + t2u_decoder_key = "decoder" + # X2T model. + elif config.use_text_encoder: + encoder_key = "speech_encoder" + decoder_key = "shared_decoder" + # S2T model. 
+ else: + encoder_key = "encoder" + decoder_key = "decoder" + + keys_to_delete.append(f"{decoder_key}.version") + keys_to_delete.append(f"{decoder_key}.embed_positions._float_tensor") + + if config.use_text_encoder: + keys_to_delete.append("text_encoder.version") + keys_to_delete.append("text_encoder.embed_positions._float_tensor") + + if not config.use_text_decoder: + text_decoder_keys = [key for key in state_dict if key.startswith(decoder_key)] + keys_to_delete.extend(text_decoder_keys) + + # Remnant of wav2vec2 pretraining, not needed for eval or fine-tuning. + keys_to_delete.append(f"{encoder_key}.w2v_encoder.w2v_model.mask_emb") + + if config.prosody_encoder_config is not None or config.t2u_config is not None: + keys_to_delete.append( + f"{t2u_decoder_key}.char_upsampler.embed_positions._float_tensor" + ) + keys_to_delete.append( + f"{t2u_decoder_key}.char_upsampler.embed_tokens_char.weight" + ) + + # Delete AlignmentEncoder keys for inference. + alignment_encoder_keys = [ + key + for key in state_dict + if key.startswith(f"{t2u_decoder_key}.alignment_encoder.") + ] + keys_to_delete.extend(alignment_encoder_keys) + + # Delete character-level projection for inference. + keys_to_delete.extend( + [ + "decoder_target_letter_decoder.proj.weight", + "decoder_target_letter_decoder.proj.bias", + ] + ) + + if config.prosody_encoder_config is not None: + keys_to_delete.extend( + [ + f"{t2u_decoder_key}.embed_positions._float_tensor", + "t2s_model.global_proj_dec.weight", + "t2s_model.global_proj_dec.bias", + "t2s_model.decoder_target_letter_nllb_spm_decoder.encoder.proj.weight", + "t2s_model.decoder_target_letter_nllb_spm_decoder.encoder.proj.bias", + ] + ) + + for key in keys_to_delete: + if key in state_dict: + del state_dict[key] + + if config.use_text_decoder: + embeds = state_dict["final_proj.weight"] + + # fairseq had a bug that accidentally introduced a dummy token in the + # embedding table of NLLB-100. We just discard it. + if ( + isinstance(config.mt_model_config, NllbConfig) and embeds.size(0) == 256103 + ): # means NLLB-100 + embeds = embeds[:-1] + + state_dict["final_proj.weight"] = embeds + + # fairseq checkpoints have duplicate embedding weights. Ensure that we + # use a single embedding table in fairseq2. + state_dict["text_decoder_frontend.embed.weight"] = embeds + + if config.use_text_encoder: + state_dict["text_encoder_frontend.embed.weight"] = embeds + + # The embedding positions of the control symbols in fairseq's dict do + # not match the SentencePiece model of the tokenizer. + with torch.inference_mode(): + # (BOS, PAD, EOS, UNK) -> (PAD, UNK, BOS, EOS) + embeds[[0, 1, 2, 3]] = embeds[[1, 3, 0, 2]] + + char_embeds = state_dict.get("t2u_model.decoder_frontend.embed_char.weight", None) + if char_embeds is not None: + index_mapping = _get_char_index_mapping(config) + vocab_size = len(index_mapping) + char_embeds[torch.arange(vocab_size)] = char_embeds[index_mapping] + + if config.t2u_config is not None: + # fairseq checkpoints have duplicate embedding weights. Ensure that we + # use a single embedding table in fairseq2. 
+ embeds = state_dict["t2u_model.final_proj.weight"] + + if "t2u_model.decoder_frontend.embed.weight" in state_dict: + state_dict["t2u_model.decoder_frontend.embed.weight"] = embeds + + return checkpoint + + +def _get_char_index_mapping(config: UnitYConfig) -> List[int]: + assert config.t2u_config is not None + assert config.t2u_config.nar_decoder_config is not None + char_tokenizer = load_unity_char_tokenizer( + config.t2u_config.nar_decoder_config.model_name_or_card + ) + spm_order = [ + char_tokenizer.model.index_to_token(i) + for i in range(char_tokenizer.model.vocabulary_size) + ][4:] + spm_to_dict_mapping = { + ch: idx + for (idx, ch) in zip( + range(4, char_tokenizer.model.vocabulary_size), + sorted(spm_order), + ) + } + model_to_dict_mapping = [0, 1, 2, 3] + [spm_to_dict_mapping[ch] for ch in spm_order] + return model_to_dict_mapping + + +def _fairseq_key_map(config: UnitYConfig) -> Dict[str, str]: + # ExpressiveUnitY model (from multi_arch codebase) + if config.prosody_encoder_config is not None: + encoder_key = "s2t_model.encoder" + decoder_key = "s2t_model.decoder" + t2u_encoder_key = "t2s_model.encoder" + t2u_decoder_key = "t2s_model.decoder" + ecapa_tdnn_key = "global_prosody" + # X2T/S2T + T2U model. + elif config.t2u_config is not None: + encoder_key = "encoder" + decoder_key = "target_letter_decoder" + t2u_encoder_key = "synthesizer_encoder" + t2u_decoder_key = "decoder" + # X2T model. + elif config.use_text_encoder: + encoder_key = "speech_encoder" + decoder_key = "shared_decoder" + # S2T model. + else: + encoder_key = "encoder" + decoder_key = "decoder" + + key_map = { + # fmt: off + + # Speech Encoder + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.pos_conv\.0\.": r"speech_encoder_frontend.pos_encoder.conv.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.layer_norm\.": r"speech_encoder_frontend.post_extract_layer_norm.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.post_extract_proj\.": r"speech_encoder_frontend.model_dim_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.feature_extractor\.conv_layers\.([0-9]+)\.0\.": r"speech_encoder_frontend.feature_extractor.layers.\1.conv.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.feature_extractor\.conv_layers\.([0-9]+)\.2\.1\.": r"speech_encoder_frontend.feature_extractor.layers.\1.layer_norm.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.feature_extractor\.conv_layers\.0\.2\.": r"speech_encoder_frontend.feature_extractor.layers.0.group_norm.", + + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.batch_norm\.": r"speech_encoder.inner.layers.\1.conv.batch_norm.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.layer_norm2\.": r"speech_encoder.inner.layers.\1.conv.layer_norm.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.depthwise_conv\.": r"speech_encoder.inner.layers.\1.conv.depthwise_conv.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.layer_norm\.": r"speech_encoder.inner.layers.\1.conv_layer_norm.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.pointwise_conv1\.": r"speech_encoder.inner.layers.\1.conv.pointwise_conv1.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.conv_module\.pointwise_conv2\.": r"speech_encoder.inner.layers.\1.conv.pointwise_conv2.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.ffn(1|2)\.layer_norm\.": r"speech_encoder.inner.layers.\1.ffn\2_layer_norm.", + 
fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.ffn(1|2)\.w_1\.": r"speech_encoder.inner.layers.\1.ffn\2.inner_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.ffn(1|2)\.w_2\.": r"speech_encoder.inner.layers.\1.ffn\2.output_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn_layer_norm\.": r"speech_encoder.inner.layers.\1.self_attn_layer_norm.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_q\.": r"speech_encoder.inner.layers.\1.self_attn.q_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_k\.": r"speech_encoder.inner.layers.\1.self_attn.k_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_v\.": r"speech_encoder.inner.layers.\1.self_attn.v_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_out\.": r"speech_encoder.inner.layers.\1.self_attn.output_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.q_proj\.": r"speech_encoder.inner.layers.\1.self_attn.q_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.k_proj\.": r"speech_encoder.inner.layers.\1.self_attn.k_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.v_proj\.": r"speech_encoder.inner.layers.\1.self_attn.v_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.rel_k_embedding\.": r"speech_encoder.inner.layers.\1.self_attn.sdpa.rel_k_embed.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.out_proj\.": r"speech_encoder.inner.layers.\1.self_attn.output_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.linear_pos\.": r"speech_encoder.inner.layers.\1.self_attn.sdpa.r_proj.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.pos_bias_u": r"speech_encoder.inner.layers.\1.self_attn.sdpa.u_bias", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.self_attn\.pos_bias_v": r"speech_encoder.inner.layers.\1.self_attn.sdpa.v_bias", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layers\.([0-9]+)\.final_layer_norm\.": r"speech_encoder.inner.layers.\1.layer_norm.", + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layer_norm\.": r"speech_encoder.inner.layer_norm.", + + # Speech Encoder Adaptor + fr"^{encoder_key}\.adaptor\.proj\.0\.": r"speech_encoder.proj1.", + fr"^{encoder_key}\.adaptor\.proj\.2\.": r"speech_encoder.proj2.", + fr"^{encoder_key}\.adaptor\.out_ln\.": r"speech_encoder.layer_norm.", + + # Text Encoder + r"^text_encoder\.embed_tokens\.": r"text_encoder_frontend.embed.", + r"^text_encoder\.layers\.([0-9]+)\.self_attn\.out_proj\.": r"text_encoder.layers.\1.self_attn.output_proj.", + r"^text_encoder\.layers\.([0-9]+)\.self_attn\.": r"text_encoder.layers.\1.self_attn.", + r"^text_encoder\.layers\.([0-9]+)\.self_attn_layer_norm\.": r"text_encoder.layers.\1.self_attn_layer_norm.", + r"^text_encoder\.layers\.([0-9]+)\.encoder_attn\.out_proj\.": r"text_encoder.layers.\1.encoder_decoder_attn.output_proj.", + r"^text_encoder\.layers\.([0-9]+)\.encoder_attn\.": r"text_encoder.layers.\1.encoder_decoder_attn.", + r"^text_encoder\.layers\.([0-9]+)\.encoder_attn_layer_norm\.": r"text_encoder.layers.\1.encoder_decoder_attn_layer_norm.", + r"^text_encoder\.layers\.([0-9]+)\.fc1\.": 
r"text_encoder.layers.\1.ffn.inner_proj.", + r"^text_encoder\.layers\.([0-9]+)\.fc2\.": r"text_encoder.layers.\1.ffn.output_proj.", + r"^text_encoder\.layers\.([0-9]+)\.final_layer_norm\.": r"text_encoder.layers.\1.ffn_layer_norm.", + r"^text_encoder\.layer_norm\.": r"text_encoder.layer_norm.", + # fmt: on + } + + # In normal circumstances, we should never encounter a `LayerNorm` when + # `use_conformer` is `True`. Unfortunately, the w2v-BERT pretraining in + # fairseq was accidentally run with a pre-LN encoder, and ended up with + # a redundant `LayerNorm` right after the Conformer blocks. We mitigate + # that issue here by moving that `LayerNorm` to the adaptor block. + # fmt: off + if config.w2v2_encoder_config.use_conformer: + key_map.update( + { + fr"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layer_norm\.": r"speech_encoder.inner_layer_norm." + } + ) + else: + key_map.update( + { + rf"^{encoder_key}\.w2v_encoder\.w2v_model\.encoder\.layer_norm\.": r"speech_encoder.inner.layer_norm." + } + ) + # fmt: on + + if config.use_conformer_adaptor: + key_map.update( + { + # fmt: off + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.out_proj\.": r"speech_encoder.adaptor_layers.\1.block.self_attn.output_proj.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.": r"speech_encoder.adaptor_layers.\1.block.self_attn.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn_layer_norm\.": r"speech_encoder.adaptor_layers.\1.block.self_attn_layer_norm.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.ffn(1|2)\.layer_norm\.": r"speech_encoder.adaptor_layers.\1.block.ffn\2_layer_norm.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.ffn(1|2)\.w_1\.": r"speech_encoder.adaptor_layers.\1.block.ffn\2.inner_proj.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.ffn(1|2)\.w_2\.": r"speech_encoder.adaptor_layers.\1.block.ffn\2.output_proj.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.batch_norm\.": r"speech_encoder.adaptor_layers.\1.block.conv.batch_norm.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.depthwise_conv\.": r"speech_encoder.adaptor_layers.\1.block.conv.depthwise_conv.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.layer_norm\.": r"speech_encoder.adaptor_layers.\1.block.conv_layer_norm.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.pointwise_conv1\.": r"speech_encoder.adaptor_layers.\1.block.conv.pointwise_conv1.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_module\.pointwise_conv2\.": r"speech_encoder.adaptor_layers.\1.block.conv.pointwise_conv2.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.final_layer_norm\.": r"speech_encoder.adaptor_layers.\1.block.layer_norm.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_ln\.": r"speech_encoder.adaptor_layers.\1.layer_norm.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.conv_pool\.1\.": r"speech_encoder.adaptor_layers.\1.conv.", + # fmt: on + } + ) + else: + key_map.update( + { + # fmt: off + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.residual_layer_norm\.": r"speech_encoder.adaptor_layers.\1.residual_layer_norm.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.residual_pool\.1\.": r"speech_encoder.adaptor_layers.\1.residual_conv.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.attn_pool\.1\.": r"speech_encoder.adaptor_layers.\1.self_attn_conv.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.out_proj\.": r"speech_encoder.adaptor_layers.\1.self_attn.output_proj.", + 
fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn\.": r"speech_encoder.adaptor_layers.\1.self_attn.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.self_attn_layer_norm\.": r"speech_encoder.adaptor_layers.\1.self_attn_layer_norm.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.fc1\.": r"speech_encoder.adaptor_layers.\1.ffn.inner_proj.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.fc2\.": r"speech_encoder.adaptor_layers.\1.ffn.output_proj.", + fr"^{encoder_key}\.adaptor\.layers\.([0-9]+)\.final_layer_norm\.": r"speech_encoder.adaptor_layers.\1.ffn_layer_norm.", + # fmt: on + } + ) + + key_map.update( + { + # fmt: off + # Text Decoder + fr"^{decoder_key}\.embed_tokens\.": r"text_decoder_frontend.embed.", + fr"^{decoder_key}\.layers\.([0-9]+)\.self_attn\.out_proj\.": r"text_decoder.layers.\1.self_attn.output_proj.", + fr"^{decoder_key}\.layers\.([0-9]+)\.self_attn\.": r"text_decoder.layers.\1.self_attn.", + fr"^{decoder_key}\.layers\.([0-9]+)\.self_attn_layer_norm\.": r"text_decoder.layers.\1.self_attn_layer_norm.", + fr"^{decoder_key}\.layers\.([0-9]+)\.encoder_attn\.out_proj\.": r"text_decoder.layers.\1.encoder_decoder_attn.output_proj.", + fr"^{decoder_key}\.layers\.([0-9]+)\.encoder_attn\.": r"text_decoder.layers.\1.encoder_decoder_attn.", + fr"^{decoder_key}\.layers\.([0-9]+)\.encoder_attn_layer_norm\.": r"text_decoder.layers.\1.encoder_decoder_attn_layer_norm.", + fr"^{decoder_key}\.layers\.([0-9]+)\.fc1\.": r"text_decoder.layers.\1.ffn.inner_proj.", + fr"^{decoder_key}\.layers\.([0-9]+)\.fc2\.": r"text_decoder.layers.\1.ffn.output_proj.", + fr"^{decoder_key}\.layers\.([0-9]+)\.final_layer_norm\.": r"text_decoder.layers.\1.ffn_layer_norm.", + fr"^{decoder_key}\.layer_norm\.": r"text_decoder.layer_norm.", + fr"^{decoder_key}\.output_projection\.": r"final_proj.", + # fmt: on + } + ) + # ExpressiveUnitY model (from multi_arch codebase) + if config.prosody_encoder_config is not None: + key_map.update( + { + # fmt: off + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.film\.": r"t2u_model.decoder.layers.\1.film.", + fr"^{ecapa_tdnn_key}\.": r"prosody_encoder_model.", + r"^t2s_model\.global_proj_enc\.": r"t2u_model.prosody_proj.", + # fmt: on + } + ) + + # X2T/S2T + T2U model. 
+ if config.t2u_config is not None: + key_map.update( + { + # fmt: off + # T2U Encoder + fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.self_attn\.out_proj\.": r"t2u_model.encoder.layers.\1.self_attn.output_proj.", + fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.self_attn\.": r"t2u_model.encoder.layers.\1.self_attn.", + fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.self_attn_layer_norm\.": r"t2u_model.encoder.layers.\1.self_attn_layer_norm.", + fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.fc1\.": r"t2u_model.encoder.layers.\1.ffn.inner_proj.", + fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.fc2\.": r"t2u_model.encoder.layers.\1.ffn.output_proj.", + fr"^{t2u_encoder_key}\.layers\.([0-9]+)\.final_layer_norm\.": r"t2u_model.encoder.layers.\1.ffn_layer_norm.", + fr"^{t2u_encoder_key}\.layer_norm\.": r"t2u_model.encoder.layer_norm.", + + # T2U Decoder frontend + fr"^{t2u_decoder_key}\.embed_tokens_text\.": r"t2u_model.decoder_frontend.embed_char.", + fr"^{t2u_decoder_key}\.embed_tokens_unit\.": r"t2u_model.decoder_frontend.embed.", + fr"^{t2u_decoder_key}\.embed_tokens\.": r"t2u_model.decoder_frontend.embed.", + fr"^{t2u_decoder_key}\.var_adaptor\.duration_predictor\.": r"t2u_model.decoder_frontend.variance_adaptor.duration_predictor.", + fr"^{t2u_decoder_key}\.dec_pos_emb_alpha": r"t2u_model.decoder_frontend.pos_emb_alpha", + fr"^{t2u_decoder_key}\.char_upsampler\.pos_emb_alpha": r"t2u_model.decoder_frontend.pos_emb_alpha_char", + + # T2U Decoder + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.self_attn\.out_proj\.": r"t2u_model.decoder.layers.\1.self_attn.output_proj.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.self_attn\.": r"t2u_model.decoder.layers.\1.self_attn.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.self_attn_layer_norm\.": r"t2u_model.decoder.layers.\1.self_attn_layer_norm.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.layer_norm\.": r"t2u_model.decoder.layers.\1.self_attn_layer_norm.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.encoder_attn\.out_proj\.": r"t2u_model.decoder.layers.\1.encoder_decoder_attn.output_proj.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.encoder_attn\.": r"t2u_model.decoder.layers.\1.encoder_decoder_attn.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.encoder_attn_layer_norm\.": r"t2u_model.decoder.layers.\1.encoder_decoder_attn_layer_norm.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.fc1\.": r"t2u_model.decoder.layers.\1.ffn.inner_proj.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.fc2\.": r"t2u_model.decoder.layers.\1.ffn.output_proj.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.final_layer_norm\.": r"t2u_model.decoder.layers.\1.ffn_layer_norm.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.ffn\.ffn\.0\.": r"t2u_model.decoder.layers.\1.conv1d.conv1.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.ffn\.ffn\.2\.": r"t2u_model.decoder.layers.\1.conv1d.conv2.", + fr"^{t2u_decoder_key}\.layers\.([0-9]+)\.ffn\.layer_norm\.": r"t2u_model.decoder.layers.\1.conv1d_layer_norm.", + fr"^{t2u_decoder_key}\.layer_norm\.": r"t2u_model.decoder.layer_norm.", + fr"^{t2u_decoder_key}\.output_projection\.": r"t2u_model.final_proj.", + # fmt: on + } + ) + + return key_map + + +load_unity_config = ConfigLoader[UnitYConfig](asset_store, unity_archs) + + +load_unity_model = ModelLoader[UnitYModel, UnitYConfig]( + asset_store, + download_manager, + load_unity_config, + create_unity_model, + convert_unity_checkpoint, + restrict_checkpoints=False, +) + + +load_unity_text_tokenizer = NllbTokenizerLoader(asset_store, download_manager) + + +class UnitYUnitTokenizerLoader: + """Loads speech unit tokenizers of UnitY 
models.""" + + def __init__(self, asset_store: AssetStore) -> None: + """ + :param asset_store: + The asset store to retrieve the model information. + """ + self.asset_store = asset_store + + def __call__(self, model_name_or_card: Union[str, AssetCard]) -> UnitTokenizer: + """ + :param model_name_or_card: + The name of the model or an already loaded AssetCard + """ + + if isinstance(model_name_or_card, AssetCard): + card = model_name_or_card + else: + card = self.asset_store.retrieve_card(model_name_or_card) + + return UnitTokenizer( + card.field("num_units").as_(int), + card.field("unit_langs").as_list(str), + card.field("model_arch").as_(str), + ) + + +load_unity_unit_tokenizer = UnitYUnitTokenizerLoader(asset_store) + + +class GcmvnStatsLoader: + """Loads GCMVN stats (mean & std) for ProsodyUnitY.""" + + def __init__(self, asset_store: AssetStore) -> None: + """ + :param asset_store: + The asset store to retrieve the model information. + """ + self.asset_store = asset_store + + def __call__( + self, model_name_or_card: Union[str, AssetCard] + ) -> Tuple[List[float], List[float]]: + """ + :param model_name_or_card: + The name of the model or an already loaded AssetCard + """ + + if isinstance(model_name_or_card, AssetCard): + card = model_name_or_card + else: + card = self.asset_store.retrieve_card(model_name_or_card) + + try: + gcmvn_stats: Dict[str, List[float]] = card.field("gcmvn_stats").as_(dict) + except AssetCardFieldNotFoundError: + model_override = card.field("model_config").as_(dict) + gcmvn_stats = model_override["gcmvn_stats"] + + return gcmvn_stats["mean"], gcmvn_stats["std"] + + +load_gcmvn_stats = GcmvnStatsLoader(asset_store) diff --git a/seamless_communication/src/seamless_communication/models/unity/model.py b/seamless_communication/src/seamless_communication/models/unity/model.py new file mode 100644 index 0000000..7ffb127 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/model.py @@ -0,0 +1,461 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass +from typing import Optional, Tuple, Union, final + +from fairseq2.data import VocabularyInfo +from fairseq2.models.encoder_decoder import EncoderDecoderModel +from fairseq2.models.sequence import SequenceModelOutput +from fairseq2.models.transformer.frontend import TransformerFrontend +from fairseq2.nn.incremental_state import IncrementalStateBag +from fairseq2.nn.padding import PaddingMask +from fairseq2.nn.projection import Projection +from fairseq2.nn.transformer import TransformerDecoder, TransformerEncoder +from overrides import final as finaloverride +from torch import Tensor +from torch.nn import Module + +from seamless_communication.models.generator.ecapa_tdnn import ECAPA_TDNN +from seamless_communication.models.unity.fft_decoder import FeedForwardTransformer +from seamless_communication.models.unity.nar_decoder_frontend import NARDecoderFrontend + + +@final +class UnitYModel(EncoderDecoderModel): + """Represents a UnitY model as described in + :cite:t`https://doi.org/10.48550/arxiv.2212.08055`. + + Note that this implementation is augmented with a text encoder to enable + translating from text. 
+ """ + + model_dim: int + input_modality: str + speech_encoder_frontend: TransformerFrontend + speech_encoder: TransformerEncoder + text_encoder_frontend: Optional[TransformerFrontend] + text_encoder: Optional[TransformerEncoder] + text_decoder_frontend: Optional[TransformerFrontend] + text_decoder: Optional[TransformerDecoder] + final_proj: Optional[Projection] + t2u_model: Union["UnitYT2UModel", "UnitYNART2UModel", None] + prosody_encoder_model: Optional[ECAPA_TDNN] + + def __init__( + self, + speech_encoder_frontend: TransformerFrontend, + speech_encoder: TransformerEncoder, + text_encoder_frontend: Optional[TransformerFrontend], + text_encoder: Optional[TransformerEncoder], + text_decoder_frontend: Optional[TransformerFrontend], + text_decoder: Optional[TransformerDecoder], + final_proj: Optional[Projection], + t2u_model: Union["UnitYT2UModel", "UnitYNART2UModel", None], + target_vocab_info: VocabularyInfo, + prosody_encoder_model: Optional[ECAPA_TDNN] = None, + input_modality: str = "speech", + ) -> None: + model_dim = speech_encoder.model_dim + + super().__init__(model_dim, target_vocab_info) + + self.input_modality = input_modality + + self.speech_encoder_frontend = speech_encoder_frontend + self.speech_encoder = speech_encoder + + if text_encoder is not None: + if text_encoder_frontend is None: + raise ValueError( + "Both `text_encoder` and `text_encoder_frontend` must be specified, but `text_encoder_frontend` is `None`." + ) + + self.text_encoder_frontend = text_encoder_frontend + self.text_encoder = text_encoder + else: + if text_encoder_frontend is not None: + raise ValueError( + "Both `text_encoder` and `text_encoder_frontend` must be specified, but `text_encoder` is `None`." + ) + + self.register_module("text_encoder_frontend", None) + self.register_module("text_encoder", None) + + if text_decoder is not None: + if text_decoder_frontend is None: + raise ValueError( + "Both `text_decoder` and `text_decoder_frontend` must be specified, but `text_decoder_frontend` is `None`." + ) + + self.text_decoder_frontend = text_decoder_frontend + self.text_decoder = text_decoder + self.final_proj = final_proj + else: + if text_decoder_frontend is not None: + raise ValueError( + "Both `text_encoder` and `text_encoder_frontend` must be specified, but `text_decoder` is `None`." + ) + + self.register_module("text_decoder_frontend", None) + self.register_module("text_decoder", None) + self.register_module("final_proj", None) + + if t2u_model is not None: + self.t2u_model = t2u_model + else: + self.register_module("t2u_model", None) + + self.target_vocab_info = target_vocab_info + if prosody_encoder_model is not None: + self.prosody_encoder_model = prosody_encoder_model + else: + self.register_module("prosody_encoder_model", None) + + @finaloverride + def encode( + self, seqs: Tensor, padding_mask: Optional[PaddingMask] + ) -> Tuple[Tensor, Optional[PaddingMask]]: + if self.input_modality == "speech": + return self.encode_speech(seqs, padding_mask) + + if self.input_modality == "text": + return self.encode_text(seqs, padding_mask) + + raise RuntimeError( + f"`input_modality` must be 'speech' or 'text', but is '{self.input_modality}' instead." 
+ ) + + def encode_speech( + self, seqs: Tensor, padding_mask: Optional[PaddingMask] + ) -> Tuple[Tensor, Optional[PaddingMask]]: + seqs, padding_mask = self.speech_encoder_frontend(seqs, padding_mask) + + return self.speech_encoder(seqs, padding_mask) # type: ignore[no-any-return] + + def encode_text( + self, seqs: Tensor, padding_mask: Optional[PaddingMask] + ) -> Tuple[Tensor, Optional[PaddingMask]]: + if self.text_encoder is None: + raise ValueError( + "`encode_text()` requires a text encoder, but the current UnitY model does not have one." + ) + + assert self.text_encoder_frontend is not None + + seqs, padding_mask = self.text_encoder_frontend(seqs, padding_mask) + + return self.text_encoder(seqs, padding_mask) # type: ignore[no-any-return] + + @finaloverride + def decode( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + encoder_output: Tensor, + encoder_padding_mask: Optional[PaddingMask], + *, + state_bag: Optional[IncrementalStateBag] = None, + ) -> Tuple[Tensor, Optional[PaddingMask]]: + if self.text_decoder is None: + raise ValueError( + "`decode()` requires a text decoder, but the current UnitY model does not have one." + ) + + assert self.text_decoder_frontend is not None + + seqs, padding_mask = self.text_decoder_frontend( + seqs, padding_mask, state_bag=state_bag + ) + + return self.text_decoder( # type: ignore[no-any-return] + seqs, + padding_mask, + encoder_output, + encoder_padding_mask, + state_bag=state_bag, + ) + + @finaloverride + def project( + self, decoder_output: Tensor, decoder_padding_mask: Optional[PaddingMask] + ) -> SequenceModelOutput: + if self.final_proj is None: + raise ValueError( + "`project()` requires a final_proj layer, but the current UnitY model does not have one." + ) + + logits = self.final_proj(decoder_output) + + return SequenceModelOutput(logits, self.target_vocab_info) + + +@final +class UnitYX2TModel(EncoderDecoderModel): + model_dim: int + encoder_frontend: TransformerFrontend + encoder: TransformerEncoder + decoder_frontend: TransformerFrontend + decoder: TransformerDecoder + final_proj: Projection + + def __init__( + self, + encoder_frontend: TransformerFrontend, + encoder: TransformerEncoder, + decoder_frontend: TransformerFrontend, + decoder: TransformerDecoder, + final_proj: Projection, + target_vocab_info: VocabularyInfo, + ) -> None: + model_dim = encoder.model_dim + + super().__init__(model_dim, target_vocab_info) + + self.encoder_frontend = encoder_frontend + self.encoder = encoder + self.decoder_frontend = decoder_frontend + self.decoder = decoder + self.final_proj = final_proj + self.target_vocab_info = target_vocab_info + + @finaloverride + def encode( + self, seqs: Tensor, padding_mask: Optional[PaddingMask] + ) -> Tuple[Tensor, Optional[PaddingMask]]: + seqs, padding_mask = self.encoder_frontend(seqs, padding_mask) + return self.encoder(seqs, padding_mask) # type: ignore[no-any-return] + + @finaloverride + def decode( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + encoder_output: Tensor, + encoder_padding_mask: Optional[PaddingMask], + *, + state_bag: Optional[IncrementalStateBag] = None, + ) -> Tuple[Tensor, Optional[PaddingMask]]: + seqs, padding_mask = self.decoder_frontend( + seqs, padding_mask, state_bag=state_bag + ) + + return self.decoder( # type: ignore[no-any-return] + seqs, + padding_mask, + encoder_output, + encoder_padding_mask, + state_bag=state_bag, + ) + + @finaloverride + def project( + self, decoder_output: Tensor, decoder_padding_mask: Optional[PaddingMask] + ) -> 
SequenceModelOutput: + logits = self.final_proj(decoder_output) + + return SequenceModelOutput(logits, self.target_vocab_info) + + +@final +class UnitYT2UModel(EncoderDecoderModel): + """Represents a UnitY T2U model as described in + :cite:t`https://doi.org/10.48550/arxiv.2212.08055`.""" + + encoder: Optional[TransformerEncoder] + decoder_frontend: TransformerFrontend + decoder: TransformerDecoder + final_proj: Projection + + def __init__( + self, + encoder: Optional[TransformerEncoder], + decoder_frontend: TransformerFrontend, + decoder: TransformerDecoder, + final_proj: Projection, + target_vocab_info: VocabularyInfo, + ) -> None: + super().__init__(decoder.model_dim, target_vocab_info) + + if encoder is not None: + self.encoder = encoder + else: + self.register_module("encoder", None) + + self.decoder_frontend = decoder_frontend + self.decoder = decoder + + self.final_proj = final_proj + + def encode( + self, seqs: Tensor, padding_mask: Optional[PaddingMask] + ) -> Tuple[Tensor, Optional[PaddingMask]]: + if self.encoder is None: + return seqs, padding_mask + + return self.encoder(seqs, padding_mask) # type: ignore[no-any-return] + + def decode( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + encoder_output: Tensor, + encoder_padding_mask: Optional[PaddingMask], + *, + state_bag: Optional[IncrementalStateBag] = None, + ) -> Tuple[Tensor, Optional[PaddingMask]]: + seqs, padding_mask = self.decoder_frontend( + seqs, padding_mask, state_bag=state_bag + ) + + return self.decoder( # type: ignore[no-any-return] + seqs, + padding_mask, + encoder_output, + encoder_padding_mask, + state_bag=state_bag, + ) + + def project( + self, decoder_output: Tensor, decoder_padding_mask: Optional[PaddingMask] + ) -> SequenceModelOutput: + logits = self.final_proj(decoder_output) + + return SequenceModelOutput(logits, self.target_vocab_info) + + +@final +class UnitYNART2UModel(Module): + """Represents a non-autoregressive UnitY T2U model.""" + + model_dim: int + encoder: Optional[TransformerEncoder] + decoder_frontend: NARDecoderFrontend + decoder: FeedForwardTransformer + final_proj: Projection + target_vocab_info: VocabularyInfo + prosody_proj: Optional[Projection] + + def __init__( + self, + encoder: Optional[TransformerEncoder], + decoder_frontend: NARDecoderFrontend, + decoder: FeedForwardTransformer, + final_proj: Projection, + target_vocab_info: VocabularyInfo, + prosody_proj: Optional[Projection] = None, + ) -> None: + super().__init__() + + self.model_dim = decoder.model_dim + + if encoder is not None: + if encoder.model_dim != self.model_dim: + raise ValueError( + f"`model_dim` of `encoder` and `model_dim` of `decoder` must be equal, but are {encoder.model_dim} and {self.model_dim} instead." + ) + + self.encoder = encoder + else: + self.register_module("encoder", None) + + if decoder_frontend.model_dim != self.model_dim: + raise ValueError( + f"`model_dim` of `decoder_frontend` and `model_dim` of `decoder` must be equal, but are {decoder_frontend.model_dim} and {self.model_dim} instead." 
+ ) + + self.decoder_frontend = decoder_frontend + self.decoder = decoder + + self.final_proj = final_proj + + self.target_vocab_info = target_vocab_info + + self.prosody_proj = prosody_proj + + def forward( + self, + text_decoder_output: Tensor, + text_decoder_padding_mask: Optional[PaddingMask], + text_seqs: Optional[Tensor], + duration_factor: float = 1.0, + film_cond_emb: Optional[Tensor] = None, + ) -> Tuple[SequenceModelOutput, Optional[PaddingMask], Tensor]: + encoder_output, encoder_padding_mask = self.encode( + text_decoder_output, text_decoder_padding_mask + ) + + if self.prosody_proj is not None and film_cond_emb is not None: + encoder_output = encoder_output + self.prosody_proj(film_cond_emb) + + decoder_output, decoder_padding_mask, durations = self.decode( + encoder_output, + encoder_padding_mask, + text_seqs, + duration_factor, + film_cond_emb, + ) + + return self.project(decoder_output), decoder_padding_mask, durations + + def encode( + self, + text_decoder_output: Tensor, + text_decoder_padding_mask: Optional[PaddingMask], + ) -> Tuple[Tensor, Optional[PaddingMask]]: + if self.encoder is None: + return text_decoder_output, text_decoder_padding_mask + + return self.encoder(text_decoder_output, text_decoder_padding_mask) # type: ignore[no-any-return] + + def decode( + self, + encoder_output: Tensor, + encoder_padding_mask: Optional[PaddingMask], + text_seqs: Optional[Tensor], + duration_factor: float = 1.0, + film_cond_emb: Optional[Tensor] = None, + ) -> Tuple[Tensor, Optional[PaddingMask], Tensor]: + # encoder_output: (N, S, M) + # text_seqs: (N, S) + seqs, padding_mask, durations = self.decoder_frontend( + encoder_output, + encoder_padding_mask, + text_seqs, + duration_factor, + film_cond_emb, + ) + + seqs, padding_mask = self.decoder( + seqs, padding_mask, film_cond_emb=film_cond_emb + ) + + return seqs, padding_mask, durations # type: ignore[no-any-return] + + def project(self, decoder_output: Tensor) -> SequenceModelOutput: + logits = self.final_proj(decoder_output) + + return SequenceModelOutput(logits, self.target_vocab_info) + + +@dataclass +class UnitYOutput: + """Holds the output of a UnitY model.""" + + s2t_output: SequenceModelOutput + """The S2T output of the multitask model.""" + + mt_output: SequenceModelOutput + """The MT output of the multitask model.""" + + t2u_output: SequenceModelOutput + """The output of the T2U model.""" + + def compute_loss( + self, targets: Tensor, ignore_prefix_size: int = 0, label_smoothing: float = 0.0 + ) -> None: + # TODO: Implement R-Drop based loss + pass diff --git a/seamless_communication/src/seamless_communication/models/unity/nar_decoder_frontend.py b/seamless_communication/src/seamless_communication/models/unity/nar_decoder_frontend.py new file mode 100644 index 0000000..303f451 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/nar_decoder_frontend.py @@ -0,0 +1,334 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+ +import math +from typing import List, Optional, Tuple, final + +import torch +from fairseq2.data import VocabularyInfo +from fairseq2.models.nllb.tokenizer import NllbTokenizer +from fairseq2.nn.embedding import Embedding +from fairseq2.nn.normalization import LayerNorm +from fairseq2.nn.padding import PaddingMask +from fairseq2.nn.position_encoder import PositionEncoder +from fairseq2.nn.transformer import create_standard_layer_norm +from fairseq2.typing import DataType, Device, finaloverride +from torch import Tensor +from torch.nn import Dropout, Module, Parameter + +from seamless_communication.models.unity.char_tokenizer import CharTokenizer +from seamless_communication.models.unity.length_regulator import ( + HardUpsampling, + VarianceAdaptor, +) + +SPACE = "▁" + + +class TagManager: + def __init__(self, vocab_info: VocabularyInfo): + self.vocab_info = vocab_info + + def preprocess_text_seqs(self, text_seqs: Tensor) -> Tensor: + # Remove EOS, lang tokens as per NLLB "target" tokenizer mode. + text_seqs = text_seqs[:, 2:] + assert self.vocab_info.pad_idx is not None + text_seqs.masked_fill_( + text_seqs == self.vocab_info.eos_idx, self.vocab_info.pad_idx + ) + return text_seqs + + def postprocess_dur_or_len(self, dur_or_len: Tensor) -> Tensor: + N = dur_or_len.shape[0] + pad_zero = dur_or_len.new_zeros((N, 1)) + # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode. + dur_or_len = torch.cat([pad_zero, dur_or_len, pad_zero], dim=1) + return dur_or_len + + +@final +class NARDecoderFrontend(Module): + """Represents a Non-autoregressive decoder front-end.""" + + char_pos_encoder: PositionEncoder + pos_emb_alpha_char: Parameter + unit_pos_encoder: PositionEncoder + pos_emb_alpha: Parameter + scale: float + char_length_regulator: HardUpsampling + variance_adaptor: VarianceAdaptor + layer_norm: Optional[LayerNorm] + dropout: Optional[Dropout] + + def __init__( + self, + embed: Embedding, + embed_char: Embedding, + text_tokenizer: NllbTokenizer, + char_tokenizer: CharTokenizer, + unit_pos_encoder: PositionEncoder, + char_pos_encoder: PositionEncoder, + variance_adaptor: VarianceAdaptor, + no_scale: bool = False, + layer_norm: bool = False, + dropout_p: float = 0.1, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ): + self.model_dim = embed.embedding_dim + + super().__init__() + + self.embed = embed + self.embed_char = embed_char + self.text_tokenizer = text_tokenizer + self.char_tokenizer = char_tokenizer + self.tag_manager = TagManager(text_tokenizer.vocab_info) + + self.unk_idx = self.text_tokenizer.vocab_info.unk_idx + self.pad_idx = self.text_tokenizer.vocab_info.pad_idx + + # TODO: Implement AlignmentEncoder for training. + + if unit_pos_encoder.encoding_dim != self.model_dim: + raise ValueError( + f"`encoding_dim` of `unit_pos_encoder` and `embedding_dim` of `embed` must be equal, but are {unit_pos_encoder.encoding_dim} and {self.model_dim} instead." + ) + + if char_pos_encoder.encoding_dim != self.model_dim: + raise ValueError( + f"`encoding_dim` of `char_pos_encoder` and `embedding_dim` of `embed` must be equal, but are {char_pos_encoder.encoding_dim} and {self.model_dim} instead." 
+ ) + + self.unit_pos_encoder = unit_pos_encoder + + self.pos_emb_alpha = Parameter(torch.ones(1, device=device, dtype=dtype)) + self.char_pos_encoder = char_pos_encoder + + self.pos_emb_alpha_char = Parameter(torch.ones(1, device=device, dtype=dtype)) + self.scale = 1.0 if no_scale else math.sqrt(self.model_dim) + + self.char_length_regulator = HardUpsampling() + + self.variance_adaptor = variance_adaptor + + if layer_norm: + self.layer_norm = create_standard_layer_norm( + self.model_dim, device=device, dtype=dtype + ) + else: + self.register_module("layer_norm", None) + + if dropout_p > 0.0: + self.dropout = Dropout(dropout_p) + else: + self.register_module("dropout", None) + + def indices_to_subwords(self, text_seqs: Tensor) -> List[List[str]]: + # TODO: To be replaced with fairseq2's indices_to_tokens SPM model method + # once implemented. + N, seq_len = text_seqs.shape + subwords_batch = [] + for b in range(N): + subwords = [] + for i in range(seq_len): + subword = self.text_tokenizer.model.index_to_token(int(text_seqs[b, i])) + subwords.append(str(subword)) + subwords_batch.append(subwords) + return subwords_batch + + def text_to_char_seqs(self, text_seqs: Tensor) -> Tuple[Tensor, Tensor, Tensor]: + text_seqs = self.tag_manager.preprocess_text_seqs(text_seqs) + + subwords_batch = self.indices_to_subwords(text_seqs) + + char_lens = self.count_character_length_in_subword(text_seqs, subwords_batch) + + char_lens = self.tag_manager.postprocess_dur_or_len(char_lens) + + char_seqs, char_seq_lens = self.get_char_seqs( + text_seqs, subwords_batch, char_lens + ) + + return char_seqs, char_seq_lens, char_lens + + def count_character_length_in_subword( + self, + text_seqs: Tensor, + subwords_batch: List[List[str]], + merge_space_with_prev_subword: bool = False, + ) -> Tensor: + N, _ = text_seqs.shape + + char_lens = text_seqs.new_zeros(text_seqs.size()) + + assert self.pad_idx is not None + subword_lens = text_seqs.ne(self.pad_idx).sum(1) + + for b in range(N): + # We slice out the tensor till the padding index. + subword_indices = text_seqs[b, : subword_lens[b]] + subwords = subwords_batch[b][: subword_lens[b]] + + assert subword_indices.shape[0] == len(subwords) + + is_next_start_with_space = [ + len(subwords[i + 1]) > 1 and subwords[i + 1][0] == SPACE + if i < len(subwords) - 1 + else False + for i in range(len(subwords)) + ] + is_punc = [ + len(subwords[i]) == 1 + and not subwords[i].isalpha() + and not subwords[i].isnumeric() + and subwords[i] != SPACE + for i in range(len(subwords)) + ] + for i, (subword_idx, subword) in enumerate(zip(subword_indices, subwords)): + if subword_idx == self.pad_idx: + break + + if subword_idx == self.unk_idx: + # We set char_len to 1 for an unk token. + char_len = 1 + + if merge_space_with_prev_subword and is_next_start_with_space[i]: + char_len += 1 + else: + # By default, spaces are merged with the next subword. + # char_len includes the space. + char_len = len(subword) + + if merge_space_with_prev_subword: + # Add the space for the next subword. + if is_next_start_with_space[i]: + char_len += 1 + # Subtract the space for the current subword. + if i > 0 and is_next_start_with_space[i - 1]: + char_len -= 1 + else: + # Merge space with punctuation mark by default. + if is_punc[i] and is_next_start_with_space[i]: + char_len += 1 + # Subtract the space for the subword succeeding the punctuation mark. 
+ elif ( + i > 0 and is_punc[i - 1] and is_next_start_with_space[i - 1] + ): + char_len -= 1 + + char_lens[b, i] = char_len + + return char_lens + + def get_char_seqs( + self, text_seqs: Tensor, subwords_batch: List[List[str]], char_lens: Tensor + ) -> Tuple[Tensor, Tensor]: + N = text_seqs.shape[0] + max_len = int(char_lens.sum(1).max().item()) + + assert self.pad_idx is not None + char_seqs = text_seqs.new_zeros((N, max_len)).fill_(self.pad_idx) + char_seq_lens = char_seqs.new_zeros(N) + + assert self.pad_idx is not None + subword_lens = text_seqs.ne(self.pad_idx).sum(1) + + for b in range(N): + total = 0 + subword_indices = text_seqs[b, : subword_lens[b]] + subwords = subwords_batch[b][: subword_lens[b]] + for subword_idx, subword in zip(subword_indices, subwords): + if subword_idx == self.unk_idx: + char_ids = [self.unk_idx] + else: + # Get char token indices corresponding to the subwords. + char_ids = [ + self.char_tokenizer.model.token_to_index(ch) + for ch in list(subword) + ] + char_seq_len = len(char_ids) + char_seqs[b, total : total + char_seq_len] = torch.tensor(char_ids).to( + char_seqs + ) + total += char_seq_len + char_seq_lens[b] = total + return char_seqs, char_seq_lens + + def character_level_upsampling( + self, + seqs: Tensor, + padding_mask: Optional[PaddingMask], + char_seqs: Tensor, + char_lens: Tensor, + ) -> Tensor: + seqs, _ = self.char_length_regulator(seqs, char_lens) + + pos_embeds = self.pos_emb_alpha_char * ( + self.char_pos_encoder(seqs, padding_mask) - seqs + ) + + char_embeds = self.embed_char(char_seqs) + + if self.scale != 1.0: + char_embeds *= self.scale + + pos_embeds += char_embeds + + seqs += pos_embeds + + return seqs + + def forward_unit_pos_embedding( + self, seqs: Tensor, padding_mask: Optional[PaddingMask] + ) -> Tensor: + pos_embeds = self.pos_emb_alpha * ( + self.unit_pos_encoder(seqs, padding_mask) - seqs + ) + + seqs += pos_embeds + + if self.dropout is not None: + seqs = self.dropout(seqs) + + return seqs + + @finaloverride + def forward( + self, + encoder_output: Tensor, + encoder_padding_mask: Optional[PaddingMask], + text_seqs: Optional[Tensor], + duration_factor: float = 1.0, + film_cond_emb: Optional[Tensor] = None, + ) -> Tuple[Tensor, Optional[PaddingMask], Tensor]: + assert text_seqs is not None + + # text_seqs: (N, S_text) + char_seqs, char_seq_lens, char_lens = self.text_to_char_seqs(text_seqs) + + # char_seqs: (N, S_char) + encoder_padding_mask = PaddingMask( + char_seq_lens, batch_seq_len=char_seqs.size(1) + ) + + # (N, S_text, M) -> (N, S_char, M) + seqs = self.character_level_upsampling( + encoder_output, encoder_padding_mask, char_seqs, char_lens + ) + + # (N, S_char, M) -> (N, S_unit, M) + seqs, padding_mask, durations = self.variance_adaptor( + seqs, + encoder_padding_mask, + duration_factor=duration_factor, + min_duration=1, + film_cond_emb=film_cond_emb, + ) + + seqs = self.forward_unit_pos_embedding(seqs, padding_mask) + + return seqs, padding_mask, durations diff --git a/seamless_communication/src/seamless_communication/models/unity/t2u_builder.py b/seamless_communication/src/seamless_communication/models/unity/t2u_builder.py new file mode 100644 index 0000000..f8e73fd --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/t2u_builder.py @@ -0,0 +1,735 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+from dataclasses import dataclass +from typing import Literal, Optional, Union + +from fairseq2.assets import asset_store, download_manager +from fairseq2.assets.card import AssetCard +from fairseq2.data import VocabularyInfo +from fairseq2.models.nllb.loader import NllbTokenizerLoader +from fairseq2.models.transformer import ( + TransformerEmbeddingFrontend, + TransformerFrontend, +) +from fairseq2.models.utils.arch_registry import ArchitectureRegistry +from fairseq2.nn.embedding import Embedding, StandardEmbedding, init_scaled_embedding +from fairseq2.nn.position_encoder import SinusoidalPositionEncoder +from fairseq2.nn.projection import Linear, Projection, TiedProjection +from fairseq2.nn.transformer import ( + FeedForwardNetwork, + MultiheadAttention, + StandardFeedForwardNetwork, + StandardMultiheadAttention, + StandardTransformerDecoder, + StandardTransformerDecoderLayer, + StandardTransformerEncoder, + StandardTransformerEncoderLayer, + TransformerDecoder, + TransformerDecoderLayer, + TransformerEncoder, + TransformerEncoderLayer, + TransformerNormOrder, + create_default_sdpa, +) +from fairseq2.typing import DataType, Device +from torch.nn import GELU, ReLU + +from seamless_communication.models.unity.char_tokenizer import load_unity_char_tokenizer +from seamless_communication.models.unity.fft_decoder import FeedForwardTransformer +from seamless_communication.models.unity.fft_decoder_layer import ( + Conv1dBlock, + FeedForwardTransformerLayer, +) +from seamless_communication.models.unity.length_regulator import ( + VarianceAdaptor, + VariancePredictor, +) +from seamless_communication.models.unity.model import UnitYNART2UModel, UnitYT2UModel +from seamless_communication.models.unity.nar_decoder_frontend import NARDecoderFrontend + + +@dataclass +class VariancePredictorConfig: + var_pred_hidden_dim: int + var_pred_kernel_size: int + var_pred_dropout: float + use_film: bool + film_cond_dim: int + + +@dataclass +class NARDecoderFrontendConfig: + subword_to_unit_upsampling_type: Literal["gaussian", "hard"] + duration_predictor_config: VariancePredictorConfig + pitch_predictor_config: Optional[VariancePredictorConfig] + energy_predictor_config: Optional[VariancePredictorConfig] + + +@dataclass +class NARDecoderConfig: + model_name_or_card: Union[str, AssetCard] + char_vocabulary_size: int + char_max_seq_len: int + conv1d_kernel_size: int + conv1d_inner_dim: int + conv1d_dropout_p: float + use_film: bool + film_cond_dim: int + + +@dataclass +class UnitYT2UConfig: + """Holds the configuration of a UnitY T2U model as described in + :cite:t`https://doi.org/10.48550/arxiv.2212.08055`""" + + model_dim: int + """The dimensionality of the model.""" + + unit_max_seq_len: int + """The expected maximum unit sequence length.""" + + target_vocab_info: VocabularyInfo + """The target vocabulary information.""" + + num_encoder_layers: int + """The number of Transformer encoder layers.""" + + num_decoder_layers: int + """The number of Transformer decoder layers.""" + + nar_decoder_frontend_config: Optional[NARDecoderFrontendConfig] + """Non-autoregressive decoder front-end config.""" + + nar_decoder_config: Optional[NARDecoderConfig] + """Non-autoregressive decoder config.""" + + num_encoder_attn_heads: int + """The number of attention heads in Transformer encoder layers.""" + + num_decoder_attn_heads: int + """The number of attention heads in Transformer decoder layers.""" + + ffn_inner_dim: int + """The inner dimensionality of Transformer feed-forward networks.""" + + dropout_p: float + """The dropout 
probability in Transformer layers.""" + + use_gelu: bool + """If ``True``, uses GELU activation function in feed-forward networks.""" + + char_pad_idx: int + """The index of the pad symbol in the char vocabulary.""" + + use_prosody_proj: bool + """If ``True``, uses a prosody projection layer.""" + + prosody_encoder_dim: int + """The dimensionality of prosody encoder (e.g. ECAPA_TDNN) output""" + + +unity_t2u_archs = ArchitectureRegistry[UnitYT2UConfig]("unity_t2u") + + +unity_t2u_arch = unity_t2u_archs.decorator + + +@unity_t2u_arch("base") +def _base_t2u() -> UnitYT2UConfig: + return UnitYT2UConfig( + model_dim=1024, + unit_max_seq_len=2048, + target_vocab_info=VocabularyInfo( + size=10082, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1 + ), + num_encoder_layers=6, + num_decoder_layers=6, + nar_decoder_frontend_config=None, + nar_decoder_config=None, + num_encoder_attn_heads=16, + num_decoder_attn_heads=16, + ffn_inner_dim=1024 * 8, + dropout_p=0.1, + use_gelu=False, + char_pad_idx=1, + use_prosody_proj=False, + prosody_encoder_dim=0, + ) + + +@unity_t2u_arch("medium") +def _medium_t2u() -> UnitYT2UConfig: + return UnitYT2UConfig( + model_dim=1024, + unit_max_seq_len=2048, + target_vocab_info=VocabularyInfo( + size=10082, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1 + ), + num_encoder_layers=4, + num_decoder_layers=4, + nar_decoder_frontend_config=None, + nar_decoder_config=None, + num_encoder_attn_heads=16, + num_decoder_attn_heads=16, + ffn_inner_dim=1024 * 8, + dropout_p=0.1, + use_gelu=False, + char_pad_idx=1, + use_prosody_proj=False, + prosody_encoder_dim=0, + ) + + +@unity_t2u_arch("base_nar") +def _base_nar() -> UnitYT2UConfig: + duration_predictor_config = VariancePredictorConfig( + var_pred_hidden_dim=256, + var_pred_kernel_size=3, + var_pred_dropout=0.5, + use_film=False, + film_cond_dim=0, + ) + + nar_decoder_frontend_config = NARDecoderFrontendConfig( + subword_to_unit_upsampling_type="hard", + duration_predictor_config=duration_predictor_config, + pitch_predictor_config=None, + energy_predictor_config=None, + ) + + nar_decoder_config = NARDecoderConfig( + model_name_or_card="seamlessM4T_v2_large", + char_vocabulary_size=10943, + char_max_seq_len=4096, + conv1d_kernel_size=7, + conv1d_inner_dim=1024, + conv1d_dropout_p=0.1, + use_film=False, + film_cond_dim=0, + ) + + return UnitYT2UConfig( + model_dim=1024, + unit_max_seq_len=4096, + target_vocab_info=VocabularyInfo( + size=10082, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1 + ), + num_encoder_layers=6, + num_decoder_layers=6, + nar_decoder_frontend_config=nar_decoder_frontend_config, + nar_decoder_config=nar_decoder_config, + num_encoder_attn_heads=16, + num_decoder_attn_heads=16, + ffn_inner_dim=1024 * 8, + dropout_p=0.0, + use_gelu=False, + char_pad_idx=1, + use_prosody_proj=False, + prosody_encoder_dim=0, + ) + + +@unity_t2u_arch("expressivity_nar") +def _expressivity_nar() -> UnitYT2UConfig: + duration_predictor_config = VariancePredictorConfig( + var_pred_hidden_dim=256, + var_pred_kernel_size=3, + var_pred_dropout=0.5, + use_film=True, + film_cond_dim=512, + ) + + nar_decoder_frontend_config = NARDecoderFrontendConfig( + subword_to_unit_upsampling_type="hard", + duration_predictor_config=duration_predictor_config, + pitch_predictor_config=None, + energy_predictor_config=None, + ) + + nar_decoder_config = NARDecoderConfig( + model_name_or_card="seamless_expressivity", + char_vocabulary_size=10904, + char_max_seq_len=10000, + conv1d_kernel_size=7, + conv1d_inner_dim=1024, + conv1d_dropout_p=0.1, + use_film=True, + 
film_cond_dim=512, + ) + + return UnitYT2UConfig( + model_dim=1024, + unit_max_seq_len=10000, + target_vocab_info=VocabularyInfo( + size=10005, unk_idx=3, bos_idx=0, eos_idx=2, pad_idx=1 + ), + num_encoder_layers=4, + num_decoder_layers=4, + nar_decoder_frontend_config=nar_decoder_frontend_config, + nar_decoder_config=nar_decoder_config, + num_encoder_attn_heads=16, + num_decoder_attn_heads=16, + ffn_inner_dim=1024 * 8, + dropout_p=0.0, + use_gelu=True, + char_pad_idx=1, + use_prosody_proj=True, + prosody_encoder_dim=512, + ) + + +class UnitYT2UBuilder: + """Builds modules of an autoregressive UnitY T2U model. + + To tweak the architecture, you can derive from this class and override the + corresponding methods. + """ + + config: UnitYT2UConfig + device: Optional[Device] + dtype: Optional[DataType] + + def __init__( + self, + config: UnitYT2UConfig, + *, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + self.config = config + + self.device, self.dtype = device, dtype + + def build_model(self) -> UnitYT2UModel: + """Build an autoregressive UnitYT2U model.""" + + embed_unit = self.build_unit_embedding() + + encoder = self.build_encoder() + + decoder = self.build_decoder() + + final_proj = TiedProjection(embed_unit.weight, bias=None) + + decoder_frontend = self.build_decoder_frontend(embed_unit) + + return UnitYT2UModel( + encoder, + decoder_frontend, + decoder, + final_proj, + self.config.target_vocab_info, + ) + + def build_unit_embedding(self) -> StandardEmbedding: + """Build a unit embedding table.""" + + return StandardEmbedding( + num_embeddings=self.config.target_vocab_info.size, + embedding_dim=self.config.model_dim, + pad_idx=self.config.target_vocab_info.pad_idx, + init_fn=init_scaled_embedding, + device=self.device, + dtype=self.dtype, + ) + + def build_encoder(self) -> Optional[TransformerEncoder]: + """Build a Transformer encoder.""" + + num_layers = self.config.num_encoder_layers + if num_layers == 0: + return None + + layers = [self.build_encoder_layer() for _ in range(num_layers)] + + return StandardTransformerEncoder( + layers, + norm_order=TransformerNormOrder.PRE, + device=self.device, + dtype=self.dtype, + ) + + def build_encoder_layer(self) -> TransformerEncoderLayer: + """Build a Transformer encoder layer.""" + + self_attn = self.build_attention(self.config.num_encoder_attn_heads) + + ffn = self.build_ffn() + + return StandardTransformerEncoderLayer( + self_attn, + ffn, + dropout_p=self.config.dropout_p, + norm_order=TransformerNormOrder.PRE, + device=self.device, + dtype=self.dtype, + ) + + def build_decoder_frontend(self, embed_unit: Embedding) -> TransformerFrontend: + """Build a Transformer decoder front-end.""" + + pos_encoder = SinusoidalPositionEncoder( + self.config.model_dim, + self.config.unit_max_seq_len, + _legacy_pad_idx=self.config.target_vocab_info.pad_idx, + device=self.device, + ) + return TransformerEmbeddingFrontend( + embed_unit, + pos_encoder, + dropout_p=self.config.dropout_p, + device=self.device, + dtype=self.dtype, + ) + + def build_decoder(self) -> TransformerDecoder: + """Build a Transformer decoder.""" + + num_layers = self.config.num_decoder_layers + + layers = [self.build_decoder_layer() for _ in range(num_layers)] + + return StandardTransformerDecoder( + layers, + norm_order=TransformerNormOrder.PRE, + device=self.device, + 
dtype=self.dtype, + ) + + def build_decoder_layer(self) -> TransformerDecoderLayer: + """Build a Transformer decoder layer.""" + + self_attn = self.build_attention(self.config.num_decoder_attn_heads) + + encoder_decoder_attn = self.build_attention(self.config.num_decoder_attn_heads) + + ffn = self.build_ffn() + + return StandardTransformerDecoderLayer( + self_attn, + encoder_decoder_attn, + ffn, + dropout_p=self.config.dropout_p, + norm_order=TransformerNormOrder.PRE, + device=self.device, + dtype=self.dtype, + ) + + def build_attention(self, num_heads: int) -> MultiheadAttention: + """Build a Transformer multi-head attention layer.""" + + sdpa = create_default_sdpa(attn_dropout_p=self.config.dropout_p) + + return StandardMultiheadAttention( + self.config.model_dim, + num_heads, + sdpa=sdpa, + device=self.device, + dtype=self.dtype, + ) + + def build_ffn(self) -> FeedForwardNetwork: + """Build a Transformer feed-forward network.""" + + return StandardFeedForwardNetwork( + self.config.model_dim, + self.config.ffn_inner_dim, + bias=True, + norm_order=TransformerNormOrder.PRE, + device=self.device, + dtype=self.dtype, + ) + + +class UnitYNART2UBuilder: + """Builds modules of an NAR UnitY T2U model. + + To tweak the architecture, you can derive from this class and override the + corresponding methods. + """ + + config: UnitYT2UConfig + device: Optional[Device] + dtype: Optional[DataType] + + def __init__( + self, + config: UnitYT2UConfig, + *, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + self.config = config + + self.device, self.dtype = device, dtype + + def build_model(self) -> UnitYNART2UModel: + """Build a non-autoregressive UnitY T2U model.""" + + embed_unit = self.build_unit_embedding() + + encoder = self.build_encoder() + + decoder = self.build_decoder() + + final_proj = TiedProjection(embed_unit.weight, bias=None) + + decoder_frontend = self.build_decoder_frontend(embed_unit) + + prosody_proj = self.build_prosody_proj() + + return UnitYNART2UModel( + encoder, + decoder_frontend, + decoder, + final_proj, + self.config.target_vocab_info, + prosody_proj=prosody_proj, + ) + + def build_unit_embedding(self) -> StandardEmbedding: + """Build a unit embedding table.""" + + return StandardEmbedding( + num_embeddings=self.config.target_vocab_info.size, + embedding_dim=self.config.model_dim, + pad_idx=self.config.target_vocab_info.pad_idx, + init_fn=init_scaled_embedding, + device=self.device, + dtype=self.dtype, + ) + + def build_encoder(self) -> Optional[TransformerEncoder]: + """Build a Transformer encoder.""" + + num_layers = self.config.num_encoder_layers + if num_layers == 0: + return None + + layers = [self.build_encoder_layer() for _ in range(num_layers)] + + return StandardTransformerEncoder( + layers, + norm_order=TransformerNormOrder.PRE, + device=self.device, + dtype=self.dtype, + ) + + def build_encoder_layer(self) -> TransformerEncoderLayer: + """Build a Transformer encoder layer.""" + + self_attn = self.build_attention(self.config.num_encoder_attn_heads) + + ffn = self.build_ffn() + + return StandardTransformerEncoderLayer( + self_attn, + ffn, + dropout_p=self.config.dropout_p, + norm_order=TransformerNormOrder.PRE, + device=self.device, + dtype=self.dtype, + ) + + def build_variance_adaptor( + self, nar_decoder_frontend_config: NARDecoderFrontendConfig + ) -> 
VarianceAdaptor: + """Build a variance adaptor module.""" + + duration_predictor_config = ( + nar_decoder_frontend_config.duration_predictor_config + ) + duration_predictor = VariancePredictor( + self.config.model_dim, + duration_predictor_config.var_pred_hidden_dim, + duration_predictor_config.var_pred_kernel_size, + duration_predictor_config.var_pred_dropout, + use_film=duration_predictor_config.use_film, + film_cond_dim=duration_predictor_config.film_cond_dim, + device=self.device, + dtype=self.dtype, + ) + + variance_adaptor = VarianceAdaptor( + duration_predictor, + pitch_predictor=None, + energy_predictor=None, + ) + + return variance_adaptor + + def build_decoder_frontend(self, embed_unit: Embedding) -> NARDecoderFrontend: + """Build a non-autoregressive decoder front-end.""" + + assert self.config.nar_decoder_config is not None + assert self.config.nar_decoder_frontend_config is not None + + unit_pos_encoder = SinusoidalPositionEncoder( + self.config.model_dim, + self.config.unit_max_seq_len, + _legacy_pad_idx=self.config.target_vocab_info.pad_idx, + device=self.device, + ) + + char_tokenizer = load_unity_char_tokenizer( + self.config.nar_decoder_config.model_name_or_card + ) + + variance_adaptor = self.build_variance_adaptor( + self.config.nar_decoder_frontend_config + ) + + nllb_tokenizer = NllbTokenizerLoader(asset_store, download_manager)( + self.config.nar_decoder_config.model_name_or_card + ) + + # The legacy pad idx should be the same as that of the unit_pos_encoder, + # since in fairseq1 the pos encoder is shared between both char, units. + char_pos_encoder = SinusoidalPositionEncoder( + self.config.model_dim, + self.config.nar_decoder_config.char_max_seq_len, + _legacy_pad_idx=self.config.target_vocab_info.pad_idx, + device=self.device, + ) + + embed_char = StandardEmbedding( + num_embeddings=self.config.nar_decoder_config.char_vocabulary_size, + embedding_dim=self.config.model_dim, + pad_idx=self.config.char_pad_idx, + init_fn=init_scaled_embedding, + device=self.device, + dtype=self.dtype, + ) + + return NARDecoderFrontend( + embed_unit, + embed_char, + nllb_tokenizer, + char_tokenizer, + unit_pos_encoder, + char_pos_encoder, + variance_adaptor, + dropout_p=self.config.dropout_p, + device=self.device, + dtype=self.dtype, + ) + + def build_decoder(self) -> FeedForwardTransformer: + """Build a Transformer decoder.""" + + num_layers = self.config.num_decoder_layers + + layers = [self.build_decoder_layer() for _ in range(num_layers)] + + return FeedForwardTransformer( + layers, + norm_order=TransformerNormOrder.PRE, + device=self.device, + dtype=self.dtype, + ) + + def build_decoder_layer(self) -> FeedForwardTransformerLayer: + """Build a Transformer decoder layer.""" + + assert self.config.nar_decoder_config is not None + + self_attn = self.build_attention(self.config.num_decoder_attn_heads) + + conv1d = Conv1dBlock( + self.config.model_dim, + self.config.nar_decoder_config.conv1d_inner_dim, + self.config.nar_decoder_config.conv1d_kernel_size, + bias=True, + device=self.device, + dtype=self.dtype, + ) + + return FeedForwardTransformerLayer( + self_attn, + conv1d, + dropout_p=self.config.dropout_p, + conv1d_dropout_p=self.config.nar_decoder_config.conv1d_dropout_p, + use_film=self.config.nar_decoder_config.use_film, + film_cond_dim=self.config.nar_decoder_config.film_cond_dim, + device=self.device, + dtype=self.dtype, + ) + + def build_attention(self, num_heads: int) -> MultiheadAttention: + """Build a Transformer multi-head attention layer.""" + + sdpa = 
create_default_sdpa(attn_dropout_p=self.config.dropout_p) + + return StandardMultiheadAttention( + self.config.model_dim, + num_heads, + sdpa=sdpa, + device=self.device, + dtype=self.dtype, + ) + + def build_ffn(self) -> FeedForwardNetwork: + """Build a Transformer feed-forward network.""" + + return StandardFeedForwardNetwork( + self.config.model_dim, + self.config.ffn_inner_dim, + bias=True, + inner_activation=GELU() if self.config.use_gelu else ReLU(), + norm_order=TransformerNormOrder.PRE, + device=self.device, + dtype=self.dtype, + ) + + def build_prosody_proj(self) -> Optional[Projection]: + """Build a prosody projection layer if needed""" + + if self.config.use_prosody_proj: + return Linear( + self.config.prosody_encoder_dim, + self.config.model_dim, + bias=True, + dtype=self.dtype, + device=self.device, + ) + else: + return None + + +def create_unity_t2u_model( + config: UnitYT2UConfig, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, +) -> Union[UnitYT2UModel, UnitYNART2UModel]: + """Create a UnitY T2U model. + + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + if config.nar_decoder_config is None: + return UnitYT2UBuilder(config, device=device, dtype=dtype).build_model() + else: + return UnitYNART2UBuilder(config, device=device, dtype=dtype).build_model() diff --git a/seamless_communication/src/seamless_communication/models/unity/unit_tokenizer.py b/seamless_communication/src/seamless_communication/models/unity/unit_tokenizer.py new file mode 100644 index 0000000..1bb2e60 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/unity/unit_tokenizer.py @@ -0,0 +1,243 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Dict, Optional, Sequence + +import torch +from fairseq2.data import VocabularyInfo +from fairseq2.typing import Device +from torch import Tensor + + +class UnitTokenizer: + """Represents a tokenizer to encode and decode UnitY speech units.""" + + num_units: int + langs: Sequence[str] + lang_map: Dict[str, int] + + def __init__(self, num_units: int, langs: Sequence[str], model_arch: str) -> None: + """ + :param num_units: + The number of speech units. + :param langs: + The list of supported languages. + :param model_arch: + The type of UnitY model architecture. + """ + self.num_units = num_units + + self.langs = langs + + self.lang_map = {lang: idx for idx, lang in enumerate(langs)} + + # The "_v2" unity architectures have a non-autoregressive decoder. + if model_arch.split("_")[-1] == "v2": + self.is_nar_decoder = True + self.lang_symbol_repititions = 1 + else: + self.is_nar_decoder = False + # For legacy reasons, we have to repeat the language symbols twice, + # along with a placeholder `` token for UnitY autoregressive models. + self.lang_symbol_repititions = 2 + + vocab_size = num_units + self.lang_symbol_repititions * (len(langs) + 1) + 4 + + # We use fairseq's control symbol order. + self.vocab_info = VocabularyInfo( + size=vocab_size, bos_idx=0, pad_idx=1, eos_idx=2, unk_idx=3 + ) + + def lang_to_index(self, lang: str) -> int: + """Return the symbol index of the specified language.""" + # +4 for PAD/EOS/BOS/UNK, and +1 for the `` token. 
+ try: + return ( + self.num_units + + (self.lang_symbol_repititions - 1) * (len(self.langs) + 1) + + self.lang_map[lang] + + 4 + ) + except KeyError: + langs = ", ".join(self.langs) + + raise ValueError( + f"`lang` must be one of the supported languages, but is '{lang}' instead. Supported languages: {langs}" + ) + + def index_to_lang(self, idx: int) -> str: + """Return the language of the specified language symbol index.""" + relative_idx = ( + idx + - self.num_units + - (self.lang_symbol_repititions - 1) * (len(self.langs) + 1) + - 4 + ) + + if relative_idx < 0 or relative_idx >= len(self.langs): + raise ValueError( + f"`idx` must correspond to one of the supported language symbol indices (0 to {len(self.langs) - 1}), but is {idx} instead." + ) + + return self.langs[relative_idx] + + def create_encoder( + self, lang: str, device: Optional[Device] = None + ) -> "UnitTokenEncoder": + """Create a token encoder. + + :param lang: + The language of generated token indices. + """ + return UnitTokenEncoder(self, lang, self.is_nar_decoder, device=device) + + def create_decoder(self) -> "UnitTokenDecoder": + """Create a token decoder.""" + return UnitTokenDecoder(self, self.is_nar_decoder) + + +class UnitTokenEncoder: + """Encodes speech units into token indices.""" + + tokenizer: UnitTokenizer + eos_idx: int + unk_idx: int + lang_idx: int + prefix_indices: Optional[Tensor] + + def __init__( + self, + tokenizer: UnitTokenizer, + lang: str, + is_nar_decoder: bool, + device: Optional[Device] = None, + ) -> None: + """ + :param tokenizer: + The unit tokenizer to use. + :param lang: + The language of generated token indices. + """ + if not lang in tokenizer.lang_map: + langs = ", ".join(tokenizer.langs) + + raise ValueError( + f"`lang` must be one of the supported languages, but is '{lang}' instead. Supported languages: {langs}" + ) + + self.tokenizer = tokenizer + self.is_nar_decoder = is_nar_decoder + + assert tokenizer.vocab_info.eos_idx is not None + assert tokenizer.vocab_info.unk_idx is not None + + self.eos_idx = tokenizer.vocab_info.eos_idx + self.unk_idx = tokenizer.vocab_info.unk_idx + + self.lang_idx = tokenizer.lang_to_index(lang) + + if device is None: + device = Device("cpu") + + if not self.is_nar_decoder: + # We always start sequences with EOS, followed by the language token. + self.prefix_indices = torch.tensor( + [self.eos_idx, self.lang_idx], device=device, dtype=torch.int64 + ) + else: + self.prefix_indices = None + + def __call__(self, units: Tensor) -> Tensor: + """Encode ``units`` to token indices. + + :param units: + The speech units to encode. *Shape:* :math:`(N,S)`, where :math:`N` + is the batch size and :math:`S` is the sequence length. + + :returns: + The token indices corresponding to ``units``. *Shape:* + :math:`(N,S_{tok})` ,where :math:`N` is the batch size and + :math`S_{tok}` is the sequence length of the token indices. + """ + batch_size = units.size(0) + + if self.prefix_indices is not None: + token_indices = torch.cat( + [self.prefix_indices.clone().expand(batch_size, -1), units.detach()], + dim=1, + ) + + # Ensure that non-symbol indices larger than `num_units` are replaced + # with UNK. + seqs = token_indices[:, 2:] + else: + token_indices = units.clone().detach() + seqs = token_indices + + # Add offset for control symbols. 
+ seqs += 4 + + seqs[seqs >= self.tokenizer.num_units + 4] = self.unk_idx + + return token_indices + + +class UnitTokenDecoder: + """Decodes speech units from token indices.""" + + eos_idx: int + pad_idx: int + + def __init__(self, tokenizer: UnitTokenizer, is_nar_decoder: bool) -> None: + """ + :param tokenizer: + The unit tokenizer to use. + :param is_nar_decoder: + If True, the unit decoder is non-autoregressive. + """ + assert tokenizer.vocab_info.eos_idx is not None + assert tokenizer.vocab_info.pad_idx is not None + + self.eos_idx = tokenizer.vocab_info.eos_idx + self.pad_idx = tokenizer.vocab_info.pad_idx + + self.is_nar_decoder = is_nar_decoder + + def __call__(self, token_indices: Tensor) -> Tensor: + """Decode ``token_indices`` to speech units. + + :param token_indices: + The token indices to decode. *Shape:* :math:`(N,S)`, where :math:`N` + is the batch size and :math:`S` is the sequence length. + + :returns: + The speech units corresponding to ``token_indices``. *Shape:* + :math:`(N,S_{unt})`, where :math:`N` is the batch size and + :math`S_{unt}` is the sequence length of the speech units. + """ + if token_indices.size(1) == 0: + return token_indices + + units = token_indices.clone().detach() + + # Remove the prefix EOS symbol from the decoded output for + # autoregressive UnitY. + if not self.is_nar_decoder: + units = units[:, 1:] + + # Also, replace EOS with PAD at sequence ends. + units[units == self.eos_idx] = self.pad_idx + + units[units == self.pad_idx] = self.pad_idx + 4 + + # Remove offset of control symbols. + if self.is_nar_decoder: + units -= 4 + else: + # Exclude language symbol for autoregressive UnitY. + units[:, 1:] -= 4 + + return units diff --git a/seamless_communication/src/seamless_communication/models/vocoder/__init__.py b/seamless_communication/src/seamless_communication/models/vocoder/__init__.py new file mode 100644 index 0000000..7ac5306 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/vocoder/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from seamless_communication.models.vocoder.builder import ( + VocoderBuilder as VocoderBuilder, +) +from seamless_communication.models.vocoder.builder import VocoderConfig as VocoderConfig +from seamless_communication.models.vocoder.codehifigan import ( + CodeGenerator as CodeGenerator, +) +from seamless_communication.models.vocoder.hifigan import Generator as Generator +from seamless_communication.models.vocoder.loader import ( + load_vocoder_model as load_vocoder_model, +) +from seamless_communication.models.vocoder.vocoder import Vocoder as Vocoder diff --git a/seamless_communication/src/seamless_communication/models/vocoder/builder.py b/seamless_communication/src/seamless_communication/models/vocoder/builder.py new file mode 100644 index 0000000..bd9eaf3 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/vocoder/builder.py @@ -0,0 +1,135 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
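# A minimal usage sketch of the UnitTokenizer defined in unit_tokenizer.py above
# (illustration only, not part of builder.py). The arch name is a placeholder:
# per the constructor, any name whose last "_"-separated piece is "v2" selects
# the non-autoregressive bookkeeping, everything else the autoregressive one.

import torch

from seamless_communication.models.unity.unit_tokenizer import UnitTokenizer

tokenizer = UnitTokenizer(
    num_units=10000, langs=["eng", "fra", "spa"], model_arch="example_arch_v2"
)
encoder = tokenizer.create_encoder(lang="eng")  # NAR variant: no EOS/lang prefix
decoder = tokenizer.create_decoder()

units = torch.randint(0, 10000, (2, 30))  # batch of 2 dummy unit sequences
token_indices = encoder(units)            # units shifted past the 4 control symbols
recovered = decoder(token_indices)        # offset removed again; equals `units`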
+ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from fairseq2.models.utils.arch_registry import ArchitectureRegistry +from fairseq2.typing import DataType, Device + +from seamless_communication.models.vocoder.codehifigan import CodeGenerator +from seamless_communication.models.vocoder.vocoder import Vocoder + + +@dataclass +class VocoderConfig: + """Holds the configuration of a Vocoder model.""" + + upsample_rates: List[int] + upsample_kernel_sizes: List[int] + upsample_initial_channel: int + resblock_kernel_sizes: List[int] + resblock_dilation_sizes: List[List[int]] + model_in_dim: int + num_embeddings: int + embedding_dim: int + dur_predictor_params: Dict[str, float] + lang_embedding_dim: int + num_langs: int + spkr_embedding_dim: int + num_spkrs: int + lang_spkr_idx_map: Dict[str, Any] + + +vocoder_archs = ArchitectureRegistry[VocoderConfig]("vocoder_code_hifigan") + +vocoder_arch = vocoder_archs.decorator + + +@vocoder_arch("base") +def _base_vocoder() -> VocoderConfig: + return VocoderConfig( + upsample_rates=[5, 4, 4, 2, 2], + upsample_kernel_sizes=[11, 8, 8, 4, 4], + upsample_initial_channel=512, + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + model_in_dim=1792, + num_embeddings=10000, + embedding_dim=1280, + dur_predictor_params={ + "encoder_embed_dim": 1280, + "var_pred_hidden_dim": 1280, + "var_pred_kernel_size": 3, + "var_pred_dropout": 0.5, + }, + lang_embedding_dim=256, + num_langs=36, + spkr_embedding_dim=256, + num_spkrs=200, + lang_spkr_idx_map={}, + ) + + +class VocoderBuilder: + """Builds modules of a vocoder model (Code Hifigan) as described in + :cite:t`https://github.com/facebookresearch/speech-resynthesis`. + + To tweak the architecture, you can derive from this class and override the + corresponding methods. + """ + + config: VocoderConfig + device: Optional[Device] + dtype: Optional[DataType] + + def __init__( + self, + config: VocoderConfig, + *, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, + ) -> None: + """ + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. + """ + self.config = config + self.device, self.dtype = device, dtype + + def build_model(self) -> Vocoder: + """Build a model.""" + + code_generator = CodeGenerator( + self.config.upsample_rates, + self.config.upsample_kernel_sizes, + self.config.upsample_initial_channel, + self.config.resblock_kernel_sizes, + self.config.resblock_dilation_sizes, + self.config.model_in_dim, + self.config.num_embeddings, + self.config.embedding_dim, + self.config.dur_predictor_params, + self.config.lang_embedding_dim, + self.config.num_langs, + self.config.spkr_embedding_dim, + self.config.num_spkrs, + ) + code_generator.to(device=self.device, dtype=self.dtype) + vocoder = Vocoder(code_generator, self.config.lang_spkr_idx_map) + return vocoder + + +def create_vocoder_model( + config: VocoderConfig, + device: Optional[Device] = None, + dtype: Optional[DataType] = None, +) -> Vocoder: + """Create a Vocoder model. + + :param config: + The configuration to use. + :param device: + The device on which to initialize modules. + :param dtype: + The data type of module parameters and buffers. 
+ """ + + return VocoderBuilder(config, device=device, dtype=dtype).build_model() diff --git a/seamless_communication/src/seamless_communication/models/vocoder/codehifigan.py b/seamless_communication/src/seamless_communication/models/vocoder/codehifigan.py new file mode 100644 index 0000000..42bd75a --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/vocoder/codehifigan.py @@ -0,0 +1,101 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. +from typing import Any, Dict, List, Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from seamless_communication.models.unity import VariancePredictor +from seamless_communication.models.vocoder.hifigan import Generator + + +class CodeGenerator(Generator): + def __init__( + self, + upsample_rates: List[int], + upsample_kernel_sizes: List[int], + upsample_initial_channel: int, + resblock_kernel_sizes: List[int], + resblock_dilation_sizes: List[List[int]], + model_in_dim: Optional[int], + num_embeddings: int, + embedding_dim: int, + dur_predictor_params: Dict[str, Any], + lang_embedding_dim: int, + num_langs: int, + spkr_embedding_dim: int, + num_spkrs: int, + ): + super().__init__( + upsample_rates, + upsample_kernel_sizes, + upsample_initial_channel, + resblock_kernel_sizes, + resblock_dilation_sizes, + model_in_dim, + ) + self.dict = nn.Embedding(num_embeddings, embedding_dim) + self.spkr = nn.Embedding(num_spkrs, spkr_embedding_dim) + self.lang = nn.Embedding(num_langs, lang_embedding_dim) + + self.dur_predictor = None + if dur_predictor_params: + self.dur_predictor = VariancePredictor(**dur_predictor_params) + + self.num_spkrs = num_spkrs + self.num_langs = num_langs + + @staticmethod + def _upsample(signal: Tensor, max_frames: int) -> Tensor: + if signal.dim() == 3: + bsz, channels, cond_length = signal.size() + elif signal.dim() == 2: + signal = signal.unsqueeze(2) + bsz, channels, cond_length = signal.size() + else: + signal = signal.view(-1, 1, 1) + bsz, channels, cond_length = signal.size() + + signal = signal.unsqueeze(3).repeat(1, 1, 1, max_frames // cond_length) + + # pad zeros as needed (if signal's shape does not divide completely with max_frames) + reminder = (max_frames - signal.shape[2] * signal.shape[3]) // signal.shape[3] + if reminder > 0: + raise NotImplementedError( + "Padding condition signal - misalignment between condition features." 
+ ) + + signal = signal.view(bsz, channels, max_frames) + return signal + + def forward(self, sample: Dict[str, Any], dur_prediction: bool) -> Tensor: # type: ignore + x = sample["code"] + x = self.dict(x).transpose(1, 2) + + if self.dur_predictor and dur_prediction: + log_dur_pred = self.dur_predictor(x.transpose(1, 2), None) + dur_out = torch.clamp( + torch.round((torch.exp(log_dur_pred) - 1)).long(), min=1 + ) + # B x C x T + repeat_interleaved_x = [] + for i in range(x.size(0)): + repeat_interleaved_x.append(torch.repeat_interleave(x[i].unsqueeze(0), dur_out[i].view(-1), dim=2)) + x = torch.cat(repeat_interleaved_x) + upsampled_spkr = [] + upsampled_lang = [] + spkr = self.spkr(sample["spkr"]).transpose(1, 2) + lang = self.lang(sample["lang"]).transpose(1, 2) + for i in range(x.size(0)): + upsampled_spkr.append(self._upsample(spkr[i], x.shape[-1])) + upsampled_lang.append(self._upsample(lang[i], x.shape[-1])) + spkr = torch.cat(upsampled_spkr, dim=1).transpose(0, 1) + lang = torch.cat(upsampled_lang, dim=1).transpose(0, 1) + x = torch.cat([x, spkr], dim=1) + x = torch.cat([lang, x], dim=1) + + return super().forward(x) diff --git a/seamless_communication/src/seamless_communication/models/vocoder/hifigan.py b/seamless_communication/src/seamless_communication/models/vocoder/hifigan.py new file mode 100644 index 0000000..58702b3 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/vocoder/hifigan.py @@ -0,0 +1,205 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import List, Optional + +import logging +import torch +import torch.nn.functional as F + +from torch import Tensor +from torch.nn import Conv1d, ConvTranspose1d, Module, ModuleList +from torch.nn.utils.weight_norm import remove_weight_norm, weight_norm + +LRELU_SLOPE = 0.1 + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +def init_weights(m, mean: float = 0.0, std: float = 0.01) -> None: # type: ignore + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size: int, dilation: int = 1) -> int: + return (kernel_size * dilation - dilation) // 2 + + +class ResBlock(Module): + def __init__( + self, channels: int, kernel_size: int = 3, dilation: List[int] = [1, 3, 5] + ): + super(ResBlock, self).__init__() + self.convs1 = ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + 
self.convs2.apply(init_weights) + + def forward(self, x: Tensor) -> Tensor: + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self) -> None: + for layer in self.convs1: + remove_weight_norm(layer) + for layer in self.convs2: + remove_weight_norm(layer) + + +class Generator(Module): + def __init__( + self, + upsample_rates: List[int], + upsample_kernel_sizes: List[int], + upsample_initial_channel: int, + resblock_kernel_sizes: List[int], + resblock_dilation_sizes: List[List[int]], + model_in_dim: Optional[int], + add_ups_out_pad: bool = False, + ): + super().__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = weight_norm( + Conv1d( + model_in_dim if model_in_dim is not None else 80, + upsample_initial_channel, + 7, + 1, + padding=3, + ) + ) + + self.ups = ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + out_pad = u % 2 if add_ups_out_pad else 0 + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2 + out_pad, + output_padding=out_pad, + ) + ) + ) + + self.resblocks = ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes): + self.resblocks.append(ResBlock(ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x: Tensor) -> Tensor: + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels # type: ignore + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self) -> None: + logger.info("Removing weight norm in Generator.") + for layer in self.ups: + remove_weight_norm(layer) + for layer in self.resblocks: + layer.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) diff --git a/seamless_communication/src/seamless_communication/models/vocoder/loader.py b/seamless_communication/src/seamless_communication/models/vocoder/loader.py new file mode 100644 index 0000000..e153d5a --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/vocoder/loader.py @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
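# A minimal sketch of constructing the Code HiFi-GAN vocoder defined in builder.py
# above from its registered "base" architecture (illustration only, not part of
# loader.py). This builds randomly initialized modules; pretrained weights are
# normally applied through the loader defined in this file.

import torch

from seamless_communication.models.vocoder.builder import VocoderBuilder, _base_vocoder

config = _base_vocoder()  # module-internal helper shown above, used here for brevity
builder = VocoderBuilder(config, device=torch.device("cpu"), dtype=torch.float32)
vocoder = builder.build_model()  # Vocoder wrapping a CodeGenerator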
+ +from typing import Any, Mapping + +from fairseq2.assets import asset_store, download_manager +from fairseq2.models.utils import ConfigLoader, ModelLoader + +from seamless_communication.models.vocoder.builder import ( + VocoderConfig, + create_vocoder_model, + vocoder_archs, +) +from seamless_communication.models.vocoder.vocoder import Vocoder + + +def convert_vocoder_checkpoint( + checkpoint: Mapping[str, Any], config: VocoderConfig +) -> Mapping[str, Any]: + if ( + "model" in checkpoint + and "code_generator.resblocks.0.convs1.0.weight_g" in checkpoint["model"] + ): + return checkpoint + + old_state_dict = checkpoint["generator"] + new_state_dict = {} + for key in old_state_dict: + new_key = f"code_generator.{key}" + new_state_dict[new_key] = old_state_dict[key] + checkpoint["model"] = new_state_dict # type: ignore + del checkpoint["generator"] # type: ignore + return checkpoint + + +load_vocoder_config = ConfigLoader[VocoderConfig](asset_store, vocoder_archs) + + +load_vocoder_model = ModelLoader[Vocoder, VocoderConfig]( + asset_store, + download_manager, + load_vocoder_config, + create_vocoder_model, + convert_vocoder_checkpoint, +) diff --git a/seamless_communication/src/seamless_communication/models/vocoder/vocoder.py b/seamless_communication/src/seamless_communication/models/vocoder/vocoder.py new file mode 100644 index 0000000..c875643 --- /dev/null +++ b/seamless_communication/src/seamless_communication/models/vocoder/vocoder.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from typing import Any, Dict, Optional, List, Union +import torch +from torch import Tensor +from torch.nn import Module + +from seamless_communication.models.vocoder.codehifigan import CodeGenerator + + +class Vocoder(Module): + def __init__( + self, + code_generator: CodeGenerator, + lang_spkr_idx_map: Dict[str, Any], + ): + super().__init__() + self.code_generator = code_generator + self.lang_spkr_idx_map = lang_spkr_idx_map + + def forward( + self, + units: Tensor, + lang_list: Union[List[str], str], + spkr_list: Union[Optional[List[int]], int] = None, + dur_prediction: bool = True, + ) -> Tensor: + # TODO: Do we need this backward compatibility, or just update all calling sites? 
+ if len(units.shape) == 1: + units = units.unsqueeze(0) # add batch dim + if isinstance(lang_list, str): + lang_list = [lang_list] * units.size(0) + if isinstance(spkr_list, int): + spkr_list = [spkr_list] * units.size(0) + lang_idx_list = [self.lang_spkr_idx_map["multilingual"][l] for l in lang_list] + if not spkr_list: + spkr_list = [-1 for _ in range(len(lang_list))] + spkr_list = [self.lang_spkr_idx_map["multispkr"][lang_list[i]][0] if spkr_list[i] == -1 else spkr_list[i] for i in range(len(spkr_list))] + x = { + "code": units.view(units.size(0), -1), + "spkr": torch.tensor([spkr_list], device=units.device).t(), + "lang": torch.tensor([lang_idx_list], device=units.device).t(), + + } + return self.code_generator(x, dur_prediction) # type: ignore[no-any-return] diff --git a/seamless_communication/src/seamless_communication/py.typed b/seamless_communication/src/seamless_communication/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/seamless_communication/src/seamless_communication/store.py b/seamless_communication/src/seamless_communication/store.py new file mode 100644 index 0000000..bf1c563 --- /dev/null +++ b/seamless_communication/src/seamless_communication/store.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from pathlib import Path + +from fairseq2.assets import InProcAssetMetadataProvider, asset_store + + +def add_gated_assets(model_dir: Path) -> None: + asset_store.env_resolvers.append(lambda: "gated") + + model_dir = model_dir.resolve() + + gated_metadata = [ + { + "name": "seamless_expressivity@gated", + "checkpoint": model_dir.joinpath("m2m_expressive_unity.pt"), + }, + { + "name": "vocoder_pretssel@gated", + "checkpoint": model_dir.joinpath("pretssel_melhifigan_wm.pt"), + }, + { + "name": "vocoder_pretssel_16khz@gated", + "checkpoint": model_dir.joinpath("pretssel_melhifigan_wm-16khz.pt"), + }, + ] + + asset_store.metadata_providers.append(InProcAssetMetadataProvider(gated_metadata)) diff --git a/seamless_communication/src/seamless_communication/streaming/__init__.py b/seamless_communication/src/seamless_communication/streaming/__init__.py new file mode 100644 index 0000000..15d8859 --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/src/seamless_communication/streaming/agents/__init__.py b/seamless_communication/src/seamless_communication/streaming/agents/__init__.py new file mode 100644 index 0000000..ab49aa7 --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
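# A sketch of the typical end-to-end use of the vocoder loader and Vocoder wrapper
# above: load a pretrained Code HiFi-GAN by asset card name and turn a batch of
# speech units into a waveform. The card name, language code, and dummy units are
# assumptions; they must match your asset store and the checkpoint's lang_spkr_idx_map.

import torch

from seamless_communication.models.vocoder import load_vocoder_model

vocoder = load_vocoder_model(
    "vocoder_36langs", device=torch.device("cpu"), dtype=torch.float32
)
vocoder.eval()

units = torch.randint(0, 10000, (1, 50))  # dummy unit sequence, batch size 1
with torch.inference_mode():
    wav = vocoder(units, "eng", spkr_list=-1, dur_prediction=True)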
+ diff --git a/seamless_communication/src/seamless_communication/streaming/agents/common.py b/seamless_communication/src/seamless_communication/streaming/agents/common.py new file mode 100644 index 0000000..6032dba --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/common.py @@ -0,0 +1,34 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +""" +Mixins + common for fairseq2 simuleval agents +""" + +from simuleval.data.segments import Segment +from simuleval.agents.states import AgentStates as AgentStatesOrig + + +class EarlyStoppingMixin: + def reset_early(self) -> None: + """ + Implement to override for different behavior on a reset that + happens before EOS + """ + raise NotImplementedError() + + +class AgentStates(AgentStatesOrig): # type: ignore + def update_target(self, segment: Segment) -> None: + """An AgentStates impl which doesn't update states.target""" + self.target_finished = segment.finished + + +class NoUpdateTargetMixin: + """A shortcut to make agents default to the AgentStates impl above""" + + def build_states(self) -> AgentStates: + return AgentStates() diff --git a/seamless_communication/src/seamless_communication/streaming/agents/detokenizer.py b/seamless_communication/src/seamless_communication/streaming/agents/detokenizer.py new file mode 100644 index 0000000..1da1576 --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/detokenizer.py @@ -0,0 +1,79 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
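# A minimal sketch of how the mixins in common.py above are meant to be combined
# with a simuleval agent: NoUpdateTargetMixin swaps in the AgentStates variant that
# does not accumulate states.target. The agent below is hypothetical and only
# illustrates the pattern (illustration only, not part of detokenizer.py).

from simuleval.agents import TextToTextAgent
from simuleval.agents.actions import Action, ReadAction, WriteAction

from seamless_communication.streaming.agents.common import AgentStates, NoUpdateTargetMixin


class EchoAgent(NoUpdateTargetMixin, TextToTextAgent):  # type: ignore
    """Hypothetical agent: waits for the full source, then writes it back."""

    def policy(self, states: AgentStates) -> Action:
        if not states.source_finished:
            return ReadAction()
        return WriteAction(" ".join(states.source), finished=True)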
+from __future__ import annotations + +from argparse import ArgumentParser, Namespace +from typing import Any, Dict + +from simuleval.agents import TextToTextAgent +from simuleval.agents.actions import Action, ReadAction, WriteAction +from seamless_communication.streaming.agents.common import ( + AgentStates, + NoUpdateTargetMixin, +) +from seamless_communication.streaming.agents.online_text_decoder import ( + UnitYTextDecoderOutput, +) +from simuleval.data.segments import Segment, EmptySegment + + +class DetokenizerAgent(NoUpdateTargetMixin, TextToTextAgent): # type: ignore + def __init__(self, args: Namespace): + super().__init__(args) + self.detokenize_only = args.detokenize_only + + @classmethod + def from_args(cls, args: Namespace, **kwargs: Dict[str, Any]) -> DetokenizerAgent: + return cls(args) + + def add_args(parser: ArgumentParser) -> None: + parser.add_argument( + "--detokenize-only", + action="store_true", + default=True, + help="Run detokenization without waiting for a new token.", + ) + + def policy(self, states: AgentStates) -> Action: + possible_full_words = self.decode(" ".join([x for x in states.source])) + + if self.detokenize_only and len(states.source) > 0: + states.source = [] + if len(possible_full_words) == 0 and not states.source_finished: + return ReadAction() + else: + return WriteAction(possible_full_words, states.source_finished) + + if states.source_finished: + return WriteAction(possible_full_words, True) + elif len(possible_full_words.split()) > 1: + full_word = possible_full_words.split()[0] + states.source = states.source[-1:] + return WriteAction(full_word, finished=False) + else: + return ReadAction() + + def decode(self, x: str) -> str: + return x.replace(" ", "").replace("\u2581", " ").strip() + + +class UnitYDetokenizerAgentStates(AgentStates): + def update_source(self, segment: Segment) -> None: + """ + Extract tokens from UnitYTextDecoderOutput + """ + self.source_finished = segment.finished + if isinstance(segment, EmptySegment): + return + # TextSegment + segment_content: UnitYTextDecoderOutput = segment.content + token = segment_content.tokens + self.source += token + + +class UnitYDetokenizerAgent(DetokenizerAgent): + def build_states(self) -> UnitYDetokenizerAgentStates: + return UnitYDetokenizerAgentStates() diff --git a/seamless_communication/src/seamless_communication/streaming/agents/dual_vocoder_agent.py b/seamless_communication/src/seamless_communication/streaming/agents/dual_vocoder_agent.py new file mode 100644 index 0000000..2e72594 --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/dual_vocoder_agent.py @@ -0,0 +1,116 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
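# The DetokenizerAgent above undoes SentencePiece-style segmentation: spaces
# between subword tokens are dropped and "\u2581" word markers become real spaces.
# A standalone check of that rule (pure string manipulation; the sample tokens are
# made up, and this snippet is illustration only, not part of dual_vocoder_agent.py):

tokens = ["\u2581how", "\u2581are", "\u2581you", "?"]
joined = " ".join(tokens)
detokenized = joined.replace(" ", "").replace("\u2581", " ").strip()
assert detokenized == "how are you?"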
+from __future__ import annotations +import copy + +import logging +from argparse import ArgumentParser, Namespace +from typing import Dict, Any + +from simuleval.agents import TextToSpeechAgent +from seamless_communication.streaming.agents.common import AgentStates +from simuleval.data.segments import Segment +from simuleval.agents.actions import Action + +from seamless_communication.streaming.agents.pretssel_vocoder import ( + PretsselVocoderAgent, +) +from seamless_communication.streaming.agents.online_vocoder import VocoderAgent + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +class DualVocoderStates(AgentStates): + def __init__( + self, vocoder_states: AgentStates, expr_vocoder_states: AgentStates + ) -> None: + self.vocoder_states = vocoder_states + self.expr_vocoder_states = expr_vocoder_states + self.config: Dict[str, Any] = {} + + @property + def target_finished(self): # type: ignore + return ( + self.vocoder_states.target_finished + or self.expr_vocoder_states.target_finished + ) + + def reset(self) -> None: + self.vocoder_states.reset() + self.expr_vocoder_states.reset() + self.config = {} + + def update_source(self, segment: Segment) -> None: + self.vocoder_states.update_config(segment.config) + self.vocoder_states.update_source(segment) + self.expr_vocoder_states.update_config(segment.config) + self.expr_vocoder_states.update_source(segment) + + def update_target(self, segment: Segment) -> None: + self.vocoder_states.update_target(segment) + self.expr_vocoder_states.update_target(segment) + + +class DualVocoderAgent(TextToSpeechAgent): # type: ignore + def __init__( + self, + args: Namespace, + vocoder: VocoderAgent, + expr_vocoder: PretsselVocoderAgent, + ) -> None: + self.vocoder = vocoder + self.expr_vocoder = expr_vocoder + super().__init__(args) + self.expressive = args.expressive + + def build_states(self) -> DualVocoderStates: + return DualVocoderStates( + self.vocoder.build_states(), self.expr_vocoder.build_states() + ) + + @classmethod + def add_args(cls, parser: ArgumentParser) -> None: + PretsselVocoderAgent.add_args(parser) + VocoderAgent.add_args(parser) + parser.add_argument( + "--expr-vocoder-name", + type=str, + required=True, + help="expressive vocoder name - vocoder_pretssel or vocoder_pretssel_16khz", + ) + parser.add_argument( + "--expressive", + action="store_true", + help="Whether to use expressive vocoder (overridable in segment.config)", + ) + + @classmethod + def from_args(cls, args: Namespace, **kwargs: Dict[str, Any]) -> DualVocoderAgent: + vocoder = VocoderAgent.from_args(args) + expr_args = copy.deepcopy(args) + expr_args.vocoder_name = args.expr_vocoder_name + expr_vocoder = PretsselVocoderAgent.from_args(expr_args) + return cls(args, vocoder, expr_vocoder) + + def policy(self, states: AgentStates) -> Action: + expressive = self.expressive + if states.config is not None and "expressive" in states.config: + expressive = states.config["expressive"] + if expressive: + states.expr_vocoder_states.upstream_states = states.upstream_states + action = self.expr_vocoder.policy(states.expr_vocoder_states) + if len(states.expr_vocoder_states.source) == 0: + states.vocoder_states.source = [] + else: + action = self.vocoder.policy(states.vocoder_states) + if len(states.vocoder_states.source) == 0: + states.expr_vocoder_states.source = [] + return action diff --git 
a/seamless_communication/src/seamless_communication/streaming/agents/offline_w2v_bert_encoder.py b/seamless_communication/src/seamless_communication/streaming/agents/offline_w2v_bert_encoder.py new file mode 100644 index 0000000..b36421c --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/offline_w2v_bert_encoder.py @@ -0,0 +1,110 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. +from __future__ import annotations + +from argparse import ArgumentParser, Namespace +from typing import Any, Dict + +import torch +from fairseq2.data import SequenceData +from fairseq2.data.data_pipeline import Collater +from fairseq2.data.text import TextTokenizer +from fairseq2.models.wav2vec2 import Wav2Vec2EncoderConfig +from fairseq2.nn.padding import get_seqs_and_padding_mask +from seamless_communication.models.unity.model import UnitYModel +from simuleval.agents import SpeechToSpeechAgent +from simuleval.agents.actions import Action, ReadAction, WriteAction +from simuleval.data.segments import SpeechSegment +from seamless_communication.streaming.agents.common import ( + AgentStates, + NoUpdateTargetMixin, +) + + +class OfflineWav2VecBertEncoderAgent(NoUpdateTargetMixin, SpeechToSpeechAgent): # type: ignore + """ + Incremental encoding of an wav2vec encoder output + It update the whole encoder states every time when there is a new incoming segment. + """ + + def __init__( + self, + unity_model: UnitYModel, + w2v2_encoder_config: Wav2Vec2EncoderConfig, + text_tokenizer: TextTokenizer, + args: Namespace, + ) -> None: + super().__init__(args) + self.model = unity_model + self.w2v2_encoder_config = w2v2_encoder_config + self.collate = Collater( + pad_value=text_tokenizer.vocab_info.pad_idx, pad_to_multiple=2 + ) + self.device = args.device + self.dtype = args.dtype + self.min_starting_wait = args.min_starting_wait_w2vbert + + @property + def min_input_length(self) -> int: + return self.w2v2_encoder_config.fbank_stride + + @staticmethod + def add_args(parser: ArgumentParser) -> None: + parser.add_argument( + "--min-starting-wait-w2vbert", + default=None, + type=int, + help="Min starting wait in w2vbert", + ) + + @torch.inference_mode() + def policy(self, states: AgentStates) -> Action: + """ + The policy for encoder is always write + only if the input is too short + """ + if ( + self.min_starting_wait is not None + and len(states.source) < self.min_starting_wait + and not states.source_finished + ): + return ReadAction() + + if len(states.source) < self.min_input_length: + if states.source_finished: + return WriteAction({}, finished=states.source_finished) + else: + return ReadAction() + + inputs = torch.stack(states.source).to(device=self.device, dtype=self.dtype) + src: SequenceData = self.collate(inputs) + + seqs, padding_mask = get_seqs_and_padding_mask(src) + encoder_output, _ = self.model.encode_speech( + seqs, + padding_mask, + ) + + return WriteAction( + SpeechSegment( + content=encoder_output, + tgt_lang=states.tgt_lang, + finished=states.source_finished, + ), + finished=states.source_finished, + ) + + @classmethod + def from_args( + cls, args: Namespace, **kwargs: Dict[str, Any] + ) -> OfflineWav2VecBertEncoderAgent: + unity_model = kwargs.get("unity_model", None) + assert isinstance(unity_model, UnitYModel) + unity_config = kwargs.get("unity_config", None) + assert unity_config is not None + text_tokenizer = 
kwargs.get("text_tokenizer", None) + assert isinstance(text_tokenizer, TextTokenizer) + return cls(unity_model, unity_config.w2v2_encoder_config, text_tokenizer, args) diff --git a/seamless_communication/src/seamless_communication/streaming/agents/online_feature_extractor.py b/seamless_communication/src/seamless_communication/streaming/agents/online_feature_extractor.py new file mode 100644 index 0000000..b69f00b --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/online_feature_extractor.py @@ -0,0 +1,152 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from __future__ import annotations + +import math +import torch + +from argparse import ArgumentParser, Namespace +from typing import Any, List + +from fairseq2.data.audio import WaveformToFbankConverter, WaveformToFbankInput + +from simuleval.agents import SpeechToSpeechAgent +from simuleval.agents.actions import Action, ReadAction, WriteAction +from simuleval.data.segments import Segment, SpeechSegment +from seamless_communication.streaming.agents.common import AgentStates + + +SHIFT_SIZE = 10 +WINDOW_SIZE = 25 +SAMPLE_RATE = 16000 +FEATURE_DIM = 80 + + +class FeatureStates(AgentStates): # type: ignore + def reset(self) -> None: + super().reset() + self.previous_residual_samples: List[float] = [] + self.tgt_lang = None + + def update_source(self, segment: Segment) -> None: + """ + Update states from input segment + Args: + segment (~simuleval.agents.segments.Segment): input segment + """ + self.source_finished = segment.finished + if self.tgt_lang is None and segment.tgt_lang is not None: + self.tgt_lang = segment.tgt_lang + if not segment.is_empty: + self.source.append(segment.content) + + +class OnlineFeatureExtractorAgent(SpeechToSpeechAgent): # type: ignore + """ + Extract speech features on the fly. 
+ """ + + def __init__(self, args: Namespace): + super().__init__(args) + self.shift_size = args.shift_size + self.window_size = args.window_size + assert self.window_size >= self.shift_size + + self.sample_rate = args.sample_rate + self.feature_dim = args.feature_dim + self.num_samples_per_shift = int(self.shift_size * self.sample_rate / 1000) + self.num_samples_per_window = int(self.window_size * self.sample_rate / 1000) + self.len_ms_to_samples = lambda x: x * self.sample_rate / 1000 + + self.convert_to_fbank = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15 if args.denormalize else 1.0, + standardize=False, + device=args.device, + dtype=args.dtype, + ) + + def build_states(self) -> FeatureStates: + return FeatureStates() + + @staticmethod + def add_args(parser: ArgumentParser) -> None: + parser.add_argument( + "--shift-size", + type=int, + default=SHIFT_SIZE, + help="Shift size of feature extraction window.", + ) + parser.add_argument( + "--window-size", + type=int, + default=WINDOW_SIZE, + help="Window size of feature extraction window.", + ) + parser.add_argument( + "--feature-dim", + type=int, + default=FEATURE_DIM, + help="Acoustic feature dimension.", + ) + parser.add_argument( + "--denormalize", + action="store_true", + help="denormalized to 16-bit signed integers", + ) + + def policy(self, states: FeatureStates) -> Action: + if len(states.source) == 0: + if states.source_finished: + return WriteAction({}, finished=states.source_finished) + else: + return ReadAction() + + samples = states.source[-1] + + samples = states.previous_residual_samples + samples + if len(samples) < self.num_samples_per_window: + states.previous_residual_samples = samples + return ReadAction() + + # num_frames is the number of frames from the new segment + num_frames = math.floor( + (len(samples) - self.len_ms_to_samples(self.window_size - self.shift_size)) + / self.num_samples_per_shift + ) + + # the number of frames used for feature extraction + # including some part of the previous segment + effective_num_samples = int( + num_frames * self.len_ms_to_samples(self.shift_size) + + self.len_ms_to_samples(self.window_size - self.shift_size) + ) + + input_samples = samples[:effective_num_samples] + states.previous_residual_samples = samples[ + num_frames * self.num_samples_per_shift : + ] + + data: WaveformToFbankInput = { + "waveform": torch.tensor(input_samples).unsqueeze(0), + "sample_rate": self.sample_rate, + } + + output = self.convert_to_fbank(data)["fbank"] + + return WriteAction( + SpeechSegment( + content=output, + tgt_lang=states.tgt_lang, + finished=states.source_finished, + ), + finished=states.source_finished, + ) + + @classmethod + def from_args(cls, args: Any, **kwargs: Any) -> OnlineFeatureExtractorAgent: + return cls(args) diff --git a/seamless_communication/src/seamless_communication/streaming/agents/online_text_decoder.py b/seamless_communication/src/seamless_communication/streaming/agents/online_text_decoder.py new file mode 100644 index 0000000..3b5fedd --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/online_text_decoder.py @@ -0,0 +1,444 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+from __future__ import annotations + +from argparse import ArgumentParser, Namespace +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Set, Tuple + +import torch +from fairseq2.models.nllb.tokenizer import NllbTokenizer +from fairseq2.nn.incremental_state import IncrementalStateBag +from seamless_communication.models.monotonic_decoder import ( + MonotonicDecoderConfig, + MonotonicDecoderModel, +) +from seamless_communication.streaming.agents.common import AgentStates +from simuleval.agents import GenericAgent +from simuleval.agents.actions import Action, ReadAction, WriteAction +from simuleval.data.segments import Segment, TextSegment +from torch import Tensor + + +class DecoderAgentStates(AgentStates): # type: ignore + def reset(self) -> None: + self.source_len = 0 + self.target_indices: List[int] = [] + self.tgt_lang = None + self.ngram_block_count = 0 + super().reset() + + def update_source(self, segment: Segment) -> None: + """ + Update states from input segment + Additionlly update incremental states + + Args: + segment (~simuleval.agents.segments.Segment): input segment + """ + self.source_finished = segment.finished + if self.tgt_lang is None and segment.tgt_lang is not None: + self.tgt_lang = segment.tgt_lang + if not segment.is_empty: + self.source = segment.content + if len(self.source) == 0 and segment.finished: + self.target_finished = True + return + self.source_len = self.source.size(1) + + +class OnlineTextDecoderAgent(GenericAgent): # type: ignore + """ + Online text decoder + """ + + target_type = "text" + + def __init__( + self, + model: MonotonicDecoderModel, + config: MonotonicDecoderConfig, + text_tokenizer: NllbTokenizer, + args: Namespace, + ) -> None: + super().__init__(args) + self.model = model + self.config = config + self.text_tokenizer = text_tokenizer + + self.max_len_a: int = args.max_len_a + self.max_len_b: int = args.max_len_b + self.max_consecutive_writes = self.args.max_consecutive_write + self.min_starting_wait = args.min_starting_wait + self.no_early_stop = args.no_early_stop + + self.device = args.device + self.dtype = args.dtype + self.eos_idx = text_tokenizer.vocab_info.eos_idx + + tgt_lang = getattr(args, "tgt_lang", None) + assert tgt_lang is not None + self.token_encoder = text_tokenizer.create_encoder(lang=tgt_lang, mode="target") + prefix_indices = self.token_encoder.prefix_indices + assert prefix_indices is not None + self.prefix_indices: List[int] = prefix_indices.tolist() + + def build_states(self) -> DecoderAgentStates: + return DecoderAgentStates() + + def max_len(self, states: DecoderAgentStates) -> int: + return self.max_len_a * int(states.source.size(1)) + self.max_len_b + + @staticmethod + def add_args(parser: ArgumentParser) -> None: + parser.add_argument( + "--max-len-a", + type=int, + default=1, + help="Max length of predictions, a in ax + b", + ) + parser.add_argument( + "--max-len-b", + type=int, + default=200, + help="Max length of predictions, b in ax + b", + ) + parser.add_argument( + "--max-consecutive-write", + type=int, + default=50, + help="Max consecutive writes.", + ) + parser.add_argument( + "--min-starting-wait", + type=int, + default=1, + help="Minimal starting waiting source steps", + ) + parser.add_argument( + "--no-early-stop", + action="store_true", + default=False, + ) + parser.add_argument( + "--tgt-lang", + default="eng", + type=str, + ) + + def policy(self, states: DecoderAgentStates) -> Action: + raise NotImplementedError + + def enforce_tgt_lang_in_prefix(self, states: 
DecoderAgentStates) -> None: + if states.tgt_lang: + tgt_lang_tag = f"__{states.tgt_lang}__" + tgt_lang_tag_idx = self.text_tokenizer.model.token_to_index(tgt_lang_tag) + self.prefix_indices[-1] = tgt_lang_tag_idx + + +class MMATextDecoderAgent(OnlineTextDecoderAgent): # type: ignore + def __init__( + self, + model: MonotonicDecoderModel, + config: MonotonicDecoderConfig, + text_tokenizer: NllbTokenizer, + args: Namespace, + ) -> None: + super().__init__(model, config, text_tokenizer, args=args) + + self.num_decoder_layers = self.config.num_decoder_layers + + self.decision_threshold = args.decision_threshold + self.decision_method = args.decision_method + self.block_ngrams = args.block_ngrams + self.p_choose_start_layer = args.p_choose_start_layer + + @staticmethod + def add_args(parser: ArgumentParser) -> None: + OnlineTextDecoderAgent.add_args(parser) + parser.add_argument( + "--decision-threshold", + type=float, + default=0.5, + help="The threshold to write an output, from 0 to 1. Small values give low latency.", + ) + parser.add_argument( + "--decision-method", + type=str, + default="min", + choices=["mean", "min", "median"], + help="The method to determine the decision. Either average all attention heads, or just pick the smallest one", + ) + parser.add_argument( + "--p-choose-start-layer", + type=int, + default=0, + help="Encoder layer from which p_choose should be considered for selection.", + ) + parser.add_argument( + "--block-ngrams", + action="store_true", + ) + + @classmethod + def from_args( + cls, args: Namespace, **kwargs: Dict[str, Any] + ) -> MMATextDecoderAgent: + model = kwargs.get("monotonic_decoder_model", None) + config = kwargs.get("monotonic_decoder_config", None) + text_tokenizer = kwargs.get("text_tokenizer", None) + + assert isinstance(model, MonotonicDecoderModel) + assert isinstance(config, MonotonicDecoderConfig) + assert isinstance(text_tokenizer, NllbTokenizer) + + return cls( + model=model, + config=config, + text_tokenizer=text_tokenizer, + args=args, + ) + + def run_decoder( + self, states: DecoderAgentStates, pred_indices: List[int] + ) -> Tuple[int, float, Tensor]: + if len(pred_indices) == 0: + self.enforce_tgt_lang_in_prefix(states) + target_input = torch.tensor( + self.prefix_indices + states.target_indices, + device=self.device, + dtype=torch.int64, + ).unsqueeze(0) + else: + target_input = torch.tensor( + pred_indices[-1:], device=self.device, dtype=torch.int64 + ).unsqueeze(0) + + encoder_output = states.source + decoder_output, _, p_choose = self.model.decode( + target_input, None, encoder_output, None, state_bag=self.state_bag + ) + + logits = self.model.project(decoder_output) + if self.block_ngrams and states.source_finished: + all_indices = states.target_indices + pred_indices + blocked_indices = all_indices[-4:] + logits[:, :, blocked_indices] = float("-inf") + + index = int(logits[0, -1].argmax().item()) + _, tgt_len, src_len = p_choose.size() + + p_choose = p_choose.view(self.num_decoder_layers, -1, tgt_len, src_len) + + if self.decision_method == "min": + prob = p_choose[self.p_choose_start_layer :, :, -1, -1].min().item() + elif self.decision_method == "mean": + prob = p_choose[self.p_choose_start_layer :, :, -1, -1].mean().item() + else: + prob = p_choose[self.p_choose_start_layer :, :, -1, -1].median().item() + + return index, prob, decoder_output + + def postprocess( + self, + states: DecoderAgentStates, + pred_indices: List[int], + finished: bool, + decoder_features_out: Optional[Tensor] = None, + ) -> TextSegment: + return 
TextSegment( + content=" ".join( + [self.text_tokenizer.model.index_to_token(idx) for idx in pred_indices] + ), + finished=finished, + tgt_lang=states.tgt_lang, + ) + + def get_blocked_ngrams(self, target_indices: List[int]) -> Optional[Set[str]]: + # TODO: make it configurable and use itertools + if not self.block_ngrams: + return None + blocked_ngrams = set() + if len(target_indices) >= 4: + blocked_ngrams.add(str(target_indices[-4:])) + blocked_ngrams.add(str(target_indices[-4:-2])) + blocked_ngrams.add(str(target_indices[-4:-1])) + if len(target_indices) >= 3: + blocked_ngrams.add(str(target_indices[-3:])) + blocked_ngrams.add(str(target_indices[-3:-1])) + if len(target_indices) >= 2: + blocked_ngrams.add(str(target_indices[-2:])) + return blocked_ngrams + + def maybe_block_ngrams( + self, + states: DecoderAgentStates, + pred_indices: List[int], + decoder_features_out: Tensor, + blocked_ngrams: Optional[Set[str]], + index: int, + ) -> Tuple[bool, Tensor]: + """ + This check is used to force a READ decision when n-gram repeat + happens before source_finished + """ + if not self.block_ngrams or states.source_finished: + return False, decoder_features_out + + assert blocked_ngrams is not None + all_indices = states.target_indices + pred_indices + [index] + for n in [3, 2]: # TODO: make it configurable + if len(all_indices) >= n and states.ngram_block_count <= 4: + if str(all_indices[-n:]) in blocked_ngrams: + states.ngram_block_count += 1 + pred_indices[:] = pred_indices[: -(n - 1)] + decoder_features_out = decoder_features_out[:, : -(n - 1)] + return True, decoder_features_out + blocked_ngrams.add(str(all_indices[-n:])) + return False, decoder_features_out + + @torch.inference_mode() + def policy(self, states: DecoderAgentStates) -> Action: + if len(states.source) == 0: + return ReadAction() + + if states.source_len < self.min_starting_wait and not states.source_finished: + return ReadAction() + + if states.target_finished: + return WriteAction("", finished=True) + + if len(states.source) == 0: + return ReadAction() + + self.state_bag = IncrementalStateBag(4096) + + states.source_len = states.source.size(1) + + pred_indices: List[int] = [] + index = None + prob = None + finished = False + blocked_ngrams = self.get_blocked_ngrams(states.target_indices) + decoder_features_out = None + + while True: + index, prob, decoder_features = self.run_decoder(states, pred_indices) + + if decoder_features_out is None: + decoder_features_out = decoder_features.new(0) + decoder_features_out = torch.cat( + [decoder_features_out, decoder_features], dim=1 + ) + + if ( + self.no_early_stop + and not states.source_finished + and (prob < self.decision_threshold or index == self.eos_idx) + ): + if prob == 1.0: + pred_indices = [] + break + block_ngram, decoder_features_out = self.maybe_block_ngrams( + states, pred_indices, decoder_features_out, blocked_ngrams, index + ) + if block_ngram: + break + if ( + finished + or index == self.eos_idx + or len(states.target_indices + pred_indices) > self.max_len(states) + ): + finished = True + break + + if prob < self.decision_threshold and not states.source_finished: + break + + if ( + len(states.target_indices + pred_indices) >= self.max_len(states) + or len(pred_indices) >= self.max_consecutive_writes + ): + break + + pred_indices.append(index) + if self.state_bag.step_nr == 0: + self.state_bag.increment_step_nr( + len(self.prefix_indices + states.target_indices) + ) + else: + self.state_bag.increment_step_nr() + + states.target_indices += pred_indices + + if 
len(pred_indices) > 0 or finished: + finished = finished or len( + states.target_indices + pred_indices + ) > self.max_len(states) + states.ngram_block_count = 0 + return WriteAction( + self.postprocess(states, pred_indices, finished, decoder_features_out), + finished=finished, + ) + else: + return ReadAction() + + +class MMASpeechToTextDecoderAgent(MMATextDecoderAgent): + source_type = "speech" + + +@dataclass +class UnitYTextDecoderOutput: + decoder_features: Tensor + tokens: List[str] + target_indices: Optional[Tensor] = None + + +class UnitYMMATextDecoderAgent(MMASpeechToTextDecoderAgent): + """ + MMA UnitY text decoder agent which just prepares the decoder + output for the downstream agent. + """ + + def postprocess( + self, + states: DecoderAgentStates, + pred_indices: List[int], + finished: bool, + decoder_features_out: Optional[Tensor] = None, + ) -> TextSegment: + tokens: List[str] = [ + self.text_tokenizer.model.index_to_token(idx) for idx in pred_indices + ] + assert decoder_features_out is not None + token_list = self.prefix_indices + states.target_indices + if ( + len(pred_indices) > 0 + and pred_indices[-1] != self.text_tokenizer.vocab_info.eos_idx + ): + # Append "," to make speech smooth + # TODO: a temporary solution. + ending_token_index = self.text_tokenizer.model.token_to_index(",") + token_list.append(ending_token_index) + self.state_bag.increment_step_nr() + + _, _, decoder_features = self.run_decoder(states, [ending_token_index]) + decoder_features_out = torch.cat( + [decoder_features_out, decoder_features], dim=1 + ) + + target_input = torch.tensor( + token_list, + device=self.device, + dtype=torch.int64, + ).unsqueeze(0) + + return TextSegment( + content=UnitYTextDecoderOutput(decoder_features_out, tokens, target_input), + finished=finished, + tgt_lang=states.tgt_lang, + ) diff --git a/seamless_communication/src/seamless_communication/streaming/agents/online_unit_decoder.py b/seamless_communication/src/seamless_communication/streaming/agents/online_unit_decoder.py new file mode 100644 index 0000000..ac96cf8 --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/online_unit_decoder.py @@ -0,0 +1,156 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+from __future__ import annotations
+
+from argparse import ArgumentParser, Namespace
+from typing import Any, List, Optional
+
+import torch
+from seamless_communication.models.unity.model import UnitYModel, UnitYNART2UModel
+from seamless_communication.models.unity.unit_tokenizer import UnitTokenizer
+from seamless_communication.streaming.agents.online_text_decoder import (
+    UnitYTextDecoderOutput,
+)
+from seamless_communication.streaming.agents.common import AgentStates
+from simuleval.agents import GenericAgent
+from simuleval.agents.actions import Action, ReadAction, WriteAction
+from simuleval.data.segments import Segment, TextSegment
+
+
+class NARUnitDecoderAgentStates(AgentStates):  # type: ignore
+    def reset(self) -> None:
+        self.source_token_list: List[str] = []
+        self.source_indices: Optional[torch.Tensor] = None
+        self.duration_start_index: int = 0
+        self.tgt_lang = None
+        super().reset()
+
+    def update_source(self, segment: Segment) -> None:
+        """
+        Update states from the input segment.
+        Additionally, update the incremental states.
+
+        Args:
+            segment (~simuleval.agents.segments.Segment): input segment
+        """
+        self.source_finished = segment.finished
+        if self.tgt_lang is None and segment.tgt_lang is not None:
+            self.tgt_lang = segment.tgt_lang
+        if segment.is_empty:
+            if segment.finished:
+                self.target_finished = True
+            return
+        segment_content: UnitYTextDecoderOutput = segment.content
+        content = segment_content.decoder_features
+        token = segment_content.tokens
+        self.source_indices = segment_content.target_indices
+        self.source_token_list += token
+        self.source = content
+
+
+class NARUnitYUnitDecoderAgent(GenericAgent):  # type: ignore
+    """Non-autoregressive unit decoder"""
+
+    source_type = "text"
+    target_type = "text"
+
+    def __init__(
+        self, model: UnitYNART2UModel, tokenizer: UnitTokenizer, args: Namespace
+    ) -> None:
+        self.model = model
+        self.tokenizer = tokenizer
+        self.min_unit_chunk_size = args.min_unit_chunk_size
+        self.d_factor = args.d_factor
+        self.device = args.device
+        self.dtype = args.dtype
+        self.token_decoder = self.tokenizer.create_decoder()
+        super().__init__(args)
+
+    def build_states(self) -> NARUnitDecoderAgentStates:
+        return NARUnitDecoderAgentStates()
+
+    @property
+    def max_len(self) -> int:
+        return 200
+
+    @staticmethod
+    def add_args(parser: ArgumentParser) -> None:
+        parser.add_argument(
+            "--min-unit-chunk-size",
+            type=int,
+            required=True,
+            help="Minimum number of units to produce per chunk",
+        )
+        parser.add_argument(
+            "--d-factor",
+            type=float,
+            default=1.0,
+            help="Scaling factor for duration prediction",
+        )
+
+    @torch.inference_mode()
+    def policy(self, states: NARUnitDecoderAgentStates) -> Action:
+        if states.target_finished:
+            return WriteAction("", finished=True)
+
+        if len(states.source_token_list) < 2:
+            if not states.source_finished:
+                return ReadAction()
+            else:
+                return WriteAction("", finished=True)
+
+        model_output, _, durations = self.model(
+            text_decoder_output=states.source,
+            text_decoder_padding_mask=None,
+            text_seqs=states.source_indices,
+            duration_factor=self.d_factor,
+        )
+        durations = durations[0]
+
+        if states.source_finished and states.duration_start_index > 0:
+            # We have to consider one more word for EOS, because an EOS is appended at the end.
+            if sum(durations[states.duration_start_index :]) == 0:
+                # If we reach here, the last source token is a silence token (e.g. punctuation),
+                # so there is no need to consider one more token.
+                return WriteAction("", finished=True)
+            else:
+                states.duration_start_index = max(states.duration_start_index - 1, 0)
+
+        current_duration = sum(durations[states.duration_start_index :])
+
+        if current_duration < self.min_unit_chunk_size:
+            if not states.source_finished:
+                # the untranslated source currently accounts for fewer than self.min_unit_chunk_size units
+                return ReadAction()
+            else:
+                if current_duration == 0:
+                    return WriteAction("", finished=True)
+
+        unit_seqs = model_output.logits[0].argmax(dim=-1)
+        index_start_offset = sum(durations[: states.duration_start_index])
+        unit_seqs = unit_seqs[index_start_offset:].unsqueeze(0)
+        units = self.token_decoder(unit_seqs)
+
+        # minus one because we add an ending token to each s2t output phrase
+        states.duration_start_index = len(durations) - 1
+
+        return WriteAction(
+            TextSegment(
+                content=units,
+                finished=states.source_finished,
+                tgt_lang=states.tgt_lang,
+            ),
+            finished=states.source_finished,
+        )
+
+    @classmethod
+    def from_args(cls, args: Namespace, **kwargs: Any) -> NARUnitYUnitDecoderAgent:
+        unity_model: UnitYModel = kwargs.get("unity_model", None)
+        unit_tokenizer: UnitTokenizer = kwargs.get("unit_tokenizer", None)
+        assert unity_model.t2u_model is not None and isinstance(
+            unity_model.t2u_model, UnitYNART2UModel
+        )
+        return cls(model=unity_model.t2u_model, tokenizer=unit_tokenizer, args=args)
diff --git a/seamless_communication/src/seamless_communication/streaming/agents/online_vocoder.py b/seamless_communication/src/seamless_communication/streaming/agents/online_vocoder.py
new file mode 100644
index 0000000..2ca1ea9
--- /dev/null
+++ b/seamless_communication/src/seamless_communication/streaming/agents/online_vocoder.py
@@ -0,0 +1,90 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# MIT_LICENSE file in the root directory of this source tree.
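
The chunking rule in NARUnitYUnitDecoderAgent.policy above reduces to comparing the summed durations of the not-yet-emitted source tokens against --min-unit-chunk-size. A small illustrative sketch of that bookkeeping (plain Python; should_write_units is a hypothetical helper, and edge cases at end-of-source are simplified):

from typing import List, Tuple

def should_write_units(
    durations: List[int],  # predicted unit duration per source token
    start_index: int,      # states.duration_start_index
    min_unit_chunk_size: int,
    source_finished: bool,
) -> Tuple[bool, int]:
    """Return (write?, new_start_index) following the agent's duration bookkeeping."""
    pending = sum(durations[start_index:])
    if pending < min_unit_chunk_size and not source_finished:
        return False, start_index  # READ: not enough units buffered yet
    # WRITE: emit everything pending; keep the last position because the upstream
    # text decoder appends an ending token ("," ) to each partial phrase.
    return True, len(durations) - 1

# e.g. durations predicted so far: [6, 4, 7], nothing emitted yet, chunk size 50
write, new_start = should_write_units([6, 4, 7], 0, 50, source_finished=False)
assert not write  # only 17 units pending -> keep reading
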
+from __future__ import annotations + +import logging +from argparse import ArgumentParser, Namespace +from typing import Any, Dict + +import torch +from seamless_communication.models.vocoder.loader import load_vocoder_model +from seamless_communication.streaming.agents.common import AgentStates +from simuleval.agents import TextToSpeechAgent +from simuleval.agents.actions import ReadAction, WriteAction +from simuleval.data.segments import SpeechSegment + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +class VocoderAgent(TextToSpeechAgent): # type: ignore + def __init__(self, args: Namespace) -> None: + super().__init__(args) + + logger.info( + f"Loading the Vocoder model: {args.vocoder_name} on device={args.device}, dtype={args.dtype}" + ) + self.vocoder = load_vocoder_model( + args.vocoder_name, device=args.device, dtype=args.dtype + ) + self.vocoder.eval() + + self.sample_rate = args.sample_rate + self.tgt_lang = args.tgt_lang + self.speaker_id = args.vocoder_speaker_id + + @torch.inference_mode() + def policy(self, states: AgentStates) -> WriteAction: + """ + The policy is always write if there are units + """ + units = states.source + + if len(units) == 0 or len(units[0]) == 0: + if states.source_finished: + return WriteAction([], finished=True) + else: + return ReadAction() + + tgt_lang = states.tgt_lang if states.tgt_lang else self.tgt_lang + u = units[0][0] + + wav = self.vocoder(u, tgt_lang, self.speaker_id, dur_prediction=False) + states.source = [] + + return WriteAction( + SpeechSegment( + content=wav[0][0].tolist(), + finished=states.source_finished, + sample_rate=self.sample_rate, + tgt_lang=tgt_lang, + ), + finished=states.source_finished, + ) + + @classmethod + def add_args(cls, parser: ArgumentParser) -> None: + parser.add_argument( + "--vocoder-name", + type=str, + help="Vocoder name.", + default="vocoder_v2", + ) + parser.add_argument( + "--vocoder-speaker-id", + type=int, + required=False, + default=-1, + help="Vocoder speaker id", + ) + + @classmethod + def from_args(cls, args: Namespace, **kwargs: Dict[str, Any]) -> VocoderAgent: + return cls(args) diff --git a/seamless_communication/src/seamless_communication/streaming/agents/pretssel_vocoder.py b/seamless_communication/src/seamless_communication/streaming/agents/pretssel_vocoder.py new file mode 100644 index 0000000..7141e17 --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/pretssel_vocoder.py @@ -0,0 +1,170 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
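
VocoderAgent reads only a handful of fields from the parsed Namespace (vocoder_name and vocoder_speaker_id from its own add_args; sample_rate, tgt_lang, device and dtype from the wider pipeline/dataloader CLI). A hedged construction sketch, assuming the seamless_communication package and the vocoder_v2 checkpoint are available:

from argparse import Namespace

import torch
from seamless_communication.streaming.agents.online_vocoder import VocoderAgent

# Illustrative field values; in practice these come from the SimulEval argument parser.
args = Namespace(
    vocoder_name="vocoder_v2",
    vocoder_speaker_id=-1,
    sample_rate=16000,
    tgt_lang="eng",
    device=torch.device("cpu"),
    dtype=torch.float32,
)
agent = VocoderAgent.from_args(args)
# Inside policy(), a unit tensor is turned into audio with:
#   wav = agent.vocoder(units, tgt_lang, speaker_id, dur_prediction=False)
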
+from __future__ import annotations + +import logging +from argparse import ArgumentParser, Namespace +from pathlib import Path +from typing import Any, Dict, List + +import torch +from fairseq2.assets import asset_store +from fairseq2.data.audio import WaveformToFbankConverter, WaveformToFbankInput +from seamless_communication.models.generator.loader import load_pretssel_vocoder_model +from seamless_communication.models.unity import load_gcmvn_stats +from seamless_communication.store import add_gated_assets +from seamless_communication.streaming.agents.common import ( + AgentStates, + NoUpdateTargetMixin, +) +from simuleval.agents import TextToSpeechAgent +from simuleval.agents.actions import ReadAction, WriteAction +from simuleval.data.segments import SpeechSegment + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +class PretsselVocoderAgent(NoUpdateTargetMixin, TextToSpeechAgent): # type: ignore + def __init__(self, args: Namespace) -> None: + super().__init__(args) + + if args.gated_model_dir: + add_gated_assets(args.gated_model_dir) + + logger.info( + f"Loading the Vocoder model: {args.vocoder_name} on device={args.device}, dtype={args.dtype}" + ) + assert "pretssel" in args.vocoder_name + self.vocoder = load_pretssel_vocoder_model( + args.vocoder_name, device=args.device, dtype=args.dtype + ) + self.vocoder.eval() + + vocoder_model_card = asset_store.retrieve_card(args.vocoder_name) + self.vocoder_sample_rate = vocoder_model_card.field("sample_rate").as_(int) + self.vocoder_langs = vocoder_model_card.field("model_config").field("langs").as_list(str) + + self.upstream_idx = args.upstream_idx + self.sample_rate = args.sample_rate # input sample rate + self.tgt_lang = args.tgt_lang + self.convert_to_fbank = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=False, + device=args.device, + dtype=args.dtype, + ) + + _gcmvn_mean, _gcmvn_std = load_gcmvn_stats(args.vocoder_name) + self.gcmvn_mean = torch.tensor( + _gcmvn_mean, device=args.device, dtype=args.dtype + ) + self.gcmvn_std = torch.tensor(_gcmvn_std, device=args.device, dtype=args.dtype) + + def gcmvn_normalize(self, seqs: torch.Tensor) -> torch.Tensor: + result: torch.Tensor = seqs.subtract(self.gcmvn_mean).divide(self.gcmvn_std) + return result + + @torch.inference_mode() + def policy(self, states: AgentStates) -> WriteAction: + """ + The policy is always write if there is a waveform + """ + units = states.source + + if len(units) == 0 or len(units[0]) == 0: + if states.source_finished: + return WriteAction(content=[], finished=True) + else: + return ReadAction() + + unit = units[0][0] + + # adjust the control symbols for the embedding + unit += 4 + + unit, duration = torch.unique_consecutive(unit, return_counts=True) + + duration *= 2 + + if isinstance(states.upstream_states[self.upstream_idx].source, list): + source: List[float] = sum( + states.upstream_states[self.upstream_idx].source, [] + ) + else: + source = states.upstream_states[self.upstream_idx].source + + audio_dict: WaveformToFbankInput = { + "waveform": torch.tensor( + source, dtype=torch.float32, device=self.device + ).unsqueeze(1), + "sample_rate": self.sample_rate, + } + + feats = self.convert_to_fbank(audio_dict)["fbank"] + + feats = self.gcmvn_normalize(feats) + + tgt_lang = states.tgt_lang if states.tgt_lang else self.tgt_lang + + + if tgt_lang not in self.vocoder_langs: + logger.warning(f"{tgt_lang} not 
supported!") + content = [] + else: + wav = self.vocoder( + unit, + tgt_lang=tgt_lang, + prosody_input_seqs=feats, + durations=duration.unsqueeze(0), + normalize_before=True, + ) + content = wav[0][0][0].tolist() + + states.source = [] + + return WriteAction( + SpeechSegment( + content=content, + finished=states.source_finished, + sample_rate=self.vocoder_sample_rate, + tgt_lang=tgt_lang, + ), + finished=states.source_finished, + ) + + @classmethod + def add_args(cls, parser: ArgumentParser) -> None: + parser.add_argument( + "--gated-model-dir", + type=Path, + required=False, + help="SeamlessExpressive model directory.", + ) + parser.add_argument( + "--vocoder-name", + type=str, + help="Vocoder name - vocoder_pretssel or vocoder_pretssel_16khz", + default="vocoder_pretssel", + ) + parser.add_argument( + "--upstream-idx", + type=int, + default=0, + help="index of encoder states where states.source contains input audio", + ) + + @classmethod + def from_args( + cls, args: Namespace, **kwargs: Dict[str, Any] + ) -> PretsselVocoderAgent: + return cls(args) diff --git a/seamless_communication/src/seamless_communication/streaming/agents/seamless_s2st.py b/seamless_communication/src/seamless_communication/streaming/agents/seamless_s2st.py new file mode 100644 index 0000000..07932ea --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/seamless_s2st.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + + +from seamless_communication.streaming.agents.detokenizer import UnitYDetokenizerAgent +from seamless_communication.streaming.agents.offline_w2v_bert_encoder import ( + OfflineWav2VecBertEncoderAgent, +) +from seamless_communication.streaming.agents.online_feature_extractor import ( + OnlineFeatureExtractorAgent, +) +from seamless_communication.streaming.agents.online_text_decoder import ( + UnitYMMATextDecoderAgent, +) +from seamless_communication.streaming.agents.online_unit_decoder import ( + NARUnitYUnitDecoderAgent, +) +from seamless_communication.streaming.agents.pretssel_vocoder import ( + PretsselVocoderAgent, +) +from seamless_communication.streaming.agents.dual_vocoder_agent import ( + DualVocoderAgent, +) +from seamless_communication.streaming.agents.silero_vad import SileroVADAgent +from seamless_communication.streaming.agents.unity_pipeline import ( + UnitYAgentPipeline, + UnitYAgentTreePipeline, +) + + +class SeamlessS2STAgent(UnitYAgentPipeline): + pipeline = [ + OnlineFeatureExtractorAgent, + OfflineWav2VecBertEncoderAgent, + UnitYMMATextDecoderAgent, + NARUnitYUnitDecoderAgent, + PretsselVocoderAgent, + ] + + +class SeamlessS2STJointVADAgent(UnitYAgentTreePipeline): + pipeline = { + SileroVADAgent: [OnlineFeatureExtractorAgent], + OnlineFeatureExtractorAgent: [OfflineWav2VecBertEncoderAgent], + OfflineWav2VecBertEncoderAgent: [UnitYMMATextDecoderAgent], + UnitYMMATextDecoderAgent: [UnitYDetokenizerAgent, NARUnitYUnitDecoderAgent], + UnitYDetokenizerAgent: [], + NARUnitYUnitDecoderAgent: [PretsselVocoderAgent], + PretsselVocoderAgent: [], + } + + +class SeamlessS2STDualVocoderVADAgent(UnitYAgentTreePipeline): + pipeline = { + SileroVADAgent: [OnlineFeatureExtractorAgent], + OnlineFeatureExtractorAgent: [OfflineWav2VecBertEncoderAgent], + OfflineWav2VecBertEncoderAgent: [UnitYMMATextDecoderAgent], + UnitYMMATextDecoderAgent: [UnitYDetokenizerAgent, NARUnitYUnitDecoderAgent], + 
UnitYDetokenizerAgent: [], + NARUnitYUnitDecoderAgent: [DualVocoderAgent], + DualVocoderAgent: [], + } diff --git a/seamless_communication/src/seamless_communication/streaming/agents/seamless_streaming_s2st.py b/seamless_communication/src/seamless_communication/streaming/agents/seamless_streaming_s2st.py new file mode 100644 index 0000000..6c607b1 --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/seamless_streaming_s2st.py @@ -0,0 +1,58 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from seamless_communication.streaming.agents.detokenizer import UnitYDetokenizerAgent +from seamless_communication.streaming.agents.offline_w2v_bert_encoder import ( + OfflineWav2VecBertEncoderAgent, +) +from seamless_communication.streaming.agents.online_feature_extractor import ( + OnlineFeatureExtractorAgent, +) +from seamless_communication.streaming.agents.online_text_decoder import ( + UnitYMMATextDecoderAgent, +) +from seamless_communication.streaming.agents.online_unit_decoder import ( + NARUnitYUnitDecoderAgent, +) +from seamless_communication.streaming.agents.online_vocoder import VocoderAgent +from seamless_communication.streaming.agents.silero_vad import SileroVADAgent +from seamless_communication.streaming.agents.unity_pipeline import ( + UnitYAgentPipeline, + UnitYAgentTreePipeline, +) + + +class SeamlessStreamingS2STAgent(UnitYAgentPipeline): + pipeline = [ + OnlineFeatureExtractorAgent, + OfflineWav2VecBertEncoderAgent, + UnitYMMATextDecoderAgent, + NARUnitYUnitDecoderAgent, + VocoderAgent, + ] + + +class SeamlessStreamingS2STVADAgent(UnitYAgentPipeline): + pipeline = [ + SileroVADAgent, + OnlineFeatureExtractorAgent, + OfflineWav2VecBertEncoderAgent, + UnitYMMATextDecoderAgent, + NARUnitYUnitDecoderAgent, + VocoderAgent, + ] + + +class SeamlessStreamingS2STJointVADAgent(UnitYAgentTreePipeline): + pipeline = { + SileroVADAgent: [OnlineFeatureExtractorAgent], + OnlineFeatureExtractorAgent: [OfflineWav2VecBertEncoderAgent], + OfflineWav2VecBertEncoderAgent: [UnitYMMATextDecoderAgent], + UnitYMMATextDecoderAgent: [UnitYDetokenizerAgent, NARUnitYUnitDecoderAgent], + UnitYDetokenizerAgent: [], + NARUnitYUnitDecoderAgent: [VocoderAgent], + VocoderAgent: [], + } diff --git a/seamless_communication/src/seamless_communication/streaming/agents/seamless_streaming_s2t.py b/seamless_communication/src/seamless_communication/streaming/agents/seamless_streaming_s2t.py new file mode 100644 index 0000000..eea0097 --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/seamless_streaming_s2t.py @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
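
These pipelines are declared as plain class attributes: a list for linear pipelines and a dict mapping each agent class to its children for tree pipelines, so the topology can be inspected without loading any model weights. A small sketch, assuming seamless_communication and its dependencies are importable:

from seamless_communication.streaming.agents.seamless_streaming_s2st import (
    SeamlessStreamingS2STJointVADAgent,
)

# Print the agent graph of the joint VAD speech-to-speech/text pipeline.
for parent, children in SeamlessStreamingS2STJointVADAgent.pipeline.items():
    if children:
        for child in children:
            print(f"{parent.__name__} -> {child.__name__}")
    else:
        print(f"{parent.__name__} (leaf)")
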
+ +from seamless_communication.streaming.agents.detokenizer import DetokenizerAgent +from seamless_communication.streaming.agents.offline_w2v_bert_encoder import ( + OfflineWav2VecBertEncoderAgent, +) +from seamless_communication.streaming.agents.online_feature_extractor import ( + OnlineFeatureExtractorAgent, +) +from seamless_communication.streaming.agents.online_text_decoder import ( + MMASpeechToTextDecoderAgent, +) +from seamless_communication.streaming.agents.silero_vad import SileroVADAgent +from seamless_communication.streaming.agents.unity_pipeline import UnitYAgentPipeline + + +class SeamlessStreamingS2TDetokAgent(UnitYAgentPipeline): + pipeline = [ + OnlineFeatureExtractorAgent, + OfflineWav2VecBertEncoderAgent, + MMASpeechToTextDecoderAgent, + DetokenizerAgent, + ] + + +class SeamlessStreamingS2TAgent(UnitYAgentPipeline): + pipeline = [ + OnlineFeatureExtractorAgent, + OfflineWav2VecBertEncoderAgent, + MMASpeechToTextDecoderAgent, + ] + + +class SeamlessStreamingS2TVADAgent(UnitYAgentPipeline): + pipeline = [ + SileroVADAgent, + OnlineFeatureExtractorAgent, + OfflineWav2VecBertEncoderAgent, + MMASpeechToTextDecoderAgent, + DetokenizerAgent, + ] diff --git a/seamless_communication/src/seamless_communication/streaming/agents/silero_vad.py b/seamless_communication/src/seamless_communication/streaming/agents/silero_vad.py new file mode 100644 index 0000000..9b6f7aa --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/agents/silero_vad.py @@ -0,0 +1,342 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. +from __future__ import annotations + +import logging +from pathlib import Path +import queue +import random +import time +from argparse import ArgumentParser, Namespace +from os import SEEK_END +from typing import Any, List, Optional, Union + +import numpy as np +import torch +import soundfile +from seamless_communication.streaming.agents.common import ( + AgentStates, + EarlyStoppingMixin, +) +from simuleval.agents import SpeechToSpeechAgent +from simuleval.agents.actions import Action, ReadAction, WriteAction +from simuleval.data.segments import EmptySegment, Segment, SpeechSegment + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) +logger = logging.getLogger(__name__) + +SPEECH_PROB_THRESHOLD = 0.6 + + +class SileroVADStates(EarlyStoppingMixin, AgentStates): # type: ignore + def __init__(self, args: Namespace) -> None: + self.model, utils = torch.hub.load( + repo_or_dir="snakers4/silero-vad", + model="silero_vad", + force_reload=False, + onnx=False, + ) + + ( + self.get_speech_timestamps, + self.save_audio, + self.read_audio, + self.VADIterator, + self.collect_chunks, + ) = utils + self.silence_limit_ms = args.silence_limit_ms + self.speech_soft_limit_ms = args.speech_soft_limit_ms + self.window_size_samples = args.window_size_samples + self.chunk_size_samples = args.chunk_size_samples + self.sample_rate = args.sample_rate + self.init_speech_prob = args.init_speech_prob + self.debug = args.debug + self.test_input_segments_wav = None + self.debug_log(args) + self.input_queue: queue.Queue[Segment] = queue.Queue() + self.next_input_queue: queue.Queue[Segment] = queue.Queue() + super().__init__() + + def clear_queues(self) -> None: + while not self.input_queue.empty(): + self.input_queue.get_nowait() + self.input_queue.task_done() + # move 
everything from next_input_queue to input_queue + while not self.next_input_queue.empty(): + chunk = self.next_input_queue.get_nowait() + self.next_input_queue.task_done() + self.input_queue.put_nowait(chunk) + + def reset(self) -> None: + super().reset() + # TODO: in seamless_server, report latency for each new segment + self.first_input_ts: Optional[float] = None + self.silence_acc_ms = 0 + self.speech_acc_ms = 0 + self.input_chunk: np.ndarray[Any, np.dtype[np.int16]] = np.empty( + 0, dtype=np.int16 + ) + self.is_fresh_state = True + self.clear_queues() + self.model.reset_states() + self.consecutive_silence_decay_count = 0 + + def reset_early(self) -> None: + """ + Don't reset state before EOS + """ + pass + + def get_speech_prob_from_np_float32( + self, segment: np.ndarray[Any, np.dtype[np.float32]] + ) -> List[Any]: + t = torch.from_numpy(segment) + speech_probs = [] + # TODO: run self.model in batch? + for i in range(0, len(t), self.window_size_samples): + chunk = t[i : i + self.window_size_samples] + if len(chunk) < self.window_size_samples: + break + speech_prob = self.model(chunk, self.sample_rate).item() + speech_probs.append(speech_prob) + return speech_probs + + def debug_log(self, m: Any) -> None: + if self.debug: + logger.info(m) + + def process_speech( + self, + segment: Union[np.ndarray[Any, np.dtype[np.float32]], Segment], + tgt_lang: Optional[str] = None, + ) -> None: + """ + Process a full or partial speech chunk + """ + queue = self.input_queue + if self.source_finished: + # current source is finished, but next speech starts to come in already + self.debug_log("use next_input_queue") + queue = self.next_input_queue + + if self.first_input_ts is None: + self.first_input_ts = time.time() * 1000 + + while len(segment) > 0: + # add chunks to states.buffer + i = self.chunk_size_samples - len(self.input_chunk) + self.input_chunk = np.concatenate((self.input_chunk, segment[:i])) + segment = segment[i:] + self.is_fresh_state = False + if len(self.input_chunk) == self.chunk_size_samples: + queue.put_nowait( + SpeechSegment( + content=self.input_chunk, finished=False, tgt_lang=tgt_lang + ) + ) + self.input_chunk = np.empty(0, dtype=np.int16) + + def check_silence_acc(self, tgt_lang: Optional[str] = None) -> None: + silence_limit_ms = self.silence_limit_ms + if self.speech_acc_ms >= self.speech_soft_limit_ms: + self.debug_log("increase speech threshold") + silence_limit_ms = self.silence_limit_ms // 2 + self.debug_log(f"silence_acc_ms: {self.silence_acc_ms}") + if self.silence_acc_ms >= silence_limit_ms: + self.debug_log("=== end of segment") + # source utterance finished + self.silence_acc_ms = 0 + self.speech_acc_ms = 0 + if self.input_chunk.size > 0: + # flush partial input_chunk + self.input_queue.put_nowait( + SpeechSegment( + content=self.input_chunk, tgt_lang=tgt_lang, finished=True + ) + ) + self.input_chunk = np.empty(0, dtype=np.int16) + self.input_queue.put_nowait(EmptySegment(finished=True)) + self.source_finished = True + self.debug_write_wav(np.empty(0, dtype=np.int16), finished=True) + + def decay_silence_acc_ms(self) -> None: + if self.consecutive_silence_decay_count <= 2: + self.silence_acc_ms = self.silence_acc_ms // 2 + self.consecutive_silence_decay_count += 1 + + def update_source( + self, segment: Union[np.ndarray[Any, np.dtype[np.float32]], Segment] + ) -> None: + """ + Default value for the segment in the update_source method is a segment + Class, for some reason this interface didn't align with other interfaces + Adding this change here to support both 
np.ndarray and Segment class + """ + tgt_lang = None + if isinstance(segment, SpeechSegment): + self.sample_rate = segment.sample_rate + if hasattr(segment, "tgt_lang") and segment.tgt_lang is not None: + tgt_lang = segment.tgt_lang + if isinstance(segment.content, np.ndarray): + segment = np.array(segment.content, dtype=np.float32) + else: + segment = segment.content + speech_probs = self.get_speech_prob_from_np_float32(segment) + chunk_size_ms = len(segment) * 1000 / self.sample_rate + window_size_ms = self.window_size_samples * 1000 / self.sample_rate + consecutive_silence_decay = False + if self.is_fresh_state and self.init_speech_prob > 0: + threshold = SPEECH_PROB_THRESHOLD + self.init_speech_prob + else: + threshold = SPEECH_PROB_THRESHOLD + if all(i <= threshold for i in speech_probs): + if self.source_finished: + return + self.debug_log("got silent chunk") + if not self.is_fresh_state: + self.silence_acc_ms += chunk_size_ms + self.check_silence_acc(tgt_lang) + return + elif speech_probs[-1] <= threshold: + self.debug_log("=== start of silence chunk") + # beginning = speech, end = silence + # pass to process_speech and accumulate silence + self.speech_acc_ms += chunk_size_ms + consecutive_silence_decay = True + self.decay_silence_acc_ms() + self.process_speech(segment, tgt_lang) + # accumulate contiguous silence + for i in range(len(speech_probs) - 1, -1, -1): + if speech_probs[i] > threshold: + break + self.silence_acc_ms += window_size_ms + self.check_silence_acc(tgt_lang) + elif speech_probs[0] <= threshold: + self.debug_log("=== start of speech chunk") + # beginning = silence, end = speech + # accumulate silence , pass next to process_speech + for i in range(0, len(speech_probs)): + if speech_probs[i] > threshold: + break + self.silence_acc_ms += window_size_ms + # try not to split right before speech + self.silence_acc_ms = self.silence_acc_ms // 2 + self.check_silence_acc(tgt_lang) + self.speech_acc_ms += chunk_size_ms + self.process_speech(segment, tgt_lang) + else: + self.speech_acc_ms += chunk_size_ms + self.debug_log("======== got speech chunk") + consecutive_silence_decay = True + self.decay_silence_acc_ms() + self.process_speech(segment, tgt_lang) + if not consecutive_silence_decay: + self.consecutive_silence_decay_count = 0 + + def debug_write_wav( + self, chunk: np.ndarray[Any, Any], finished: bool = False + ) -> None: + if self.test_input_segments_wav is not None: + self.test_input_segments_wav.seek(0, SEEK_END) + self.test_input_segments_wav.write(chunk) + if finished: + MODEL_SAMPLE_RATE = 16_000 + debug_ts = f"{time.time()}_{random.randint(1000, 9999)}" + self.test_input_segments_wav = soundfile.SoundFile( + Path(self.test_input_segments_wav.name).parent + / f"{debug_ts}_test_input_segments.wav", + mode="w+", + format="WAV", + samplerate=MODEL_SAMPLE_RATE, + channels=1, + ) + + +class SileroVADAgent(SpeechToSpeechAgent): # type: ignore + def __init__(self, args: Namespace) -> None: + super().__init__(args) + self.chunk_size_samples = args.chunk_size_samples + self.args = args + + @staticmethod + def add_args(parser: ArgumentParser) -> None: + parser.add_argument( + "--window-size-samples", + default=512, # sampling_rate // 1000 * 32 => 32 ms at 16000 sample rate + type=int, + help="Window size for passing samples to VAD", + ) + parser.add_argument( + "--chunk-size-samples", + default=5120, # sampling_rate // 1000 * 320 => 320 ms at 16000 sample rate + type=int, + help="Chunk size for passing samples to model", + ) + parser.add_argument( + "--silence-limit-ms", + 
default=700,
+            type=int,
+            help="Send EOS to the input_queue after this many milliseconds of silence",
+        )
+        parser.add_argument(
+            "--speech-soft-limit-ms",
+            default=12_000,  # after this much speech, halve the silence limit
+            type=int,
+            help="After this amount of speech, halve the silence limit so that segments end more aggressively",
+        )
+        parser.add_argument(
+            "--init-speech-prob",
+            default=0.15,
+            type=float,
+            help="Increase the initial speech probability threshold by this much at the start of speech",
+        )
+        parser.add_argument(
+            "--debug",
+            default=False,
+            type=bool,
+            help="Enable debug logs",
+        )
+
+    def build_states(self) -> SileroVADStates:
+        return SileroVADStates(self.args)
+
+    def policy(self, states: SileroVADStates) -> Action:
+        states.debug_log(
+            f"queue size: {states.input_queue.qsize()}, input_chunk size: {len(states.input_chunk)}"
+        )
+        content: np.ndarray[Any, Any] = np.empty(0, dtype=np.int16)
+        is_finished = states.source_finished
+        tgt_lang = None
+        while not states.input_queue.empty():
+            chunk = states.input_queue.get_nowait()
+            states.input_queue.task_done()
+            if tgt_lang is None:
+                tgt_lang = chunk.tgt_lang
+            content = np.concatenate((content, chunk.content))
+
+        states.debug_write_wav(content)
+
+        if len(content) == 0:  # empty queue
+            if not states.source_finished:
+                return ReadAction()
+            else:
+                # NOTE: this should never happen; this branch is only a safeguard
+                segment = EmptySegment(finished=True)
+        else:
+            segment = SpeechSegment(
+                content=content.tolist(),
+                finished=is_finished,
+                tgt_lang=tgt_lang,
+            )
+
+        return WriteAction(segment, finished=is_finished)
+
+    @classmethod
+    def from_args(cls, args: Namespace, **kwargs: None) -> SileroVADAgent:
+        return cls(args)
diff --git a/seamless_communication/src/seamless_communication/streaming/agents/unity_pipeline.py b/seamless_communication/src/seamless_communication/streaming/agents/unity_pipeline.py
new file mode 100644
index 0000000..a8d38b5
--- /dev/null
+++ b/seamless_communication/src/seamless_communication/streaming/agents/unity_pipeline.py
@@ -0,0 +1,243 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# MIT_LICENSE file in the root directory of this source tree.
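
The default VAD parameters above are all expressed in samples at the 16 kHz Silero model rate; converting them to milliseconds makes the windowing explicit. A short arithmetic check using the defaults shown above:

SAMPLE_RATE = 16_000

def samples_to_ms(n_samples: int, sample_rate: int = SAMPLE_RATE) -> float:
    return n_samples * 1000 / sample_rate

assert samples_to_ms(512) == 32.0     # --window-size-samples: one Silero VAD window
assert samples_to_ms(5120) == 320.0   # --chunk-size-samples: one chunk pushed downstream
# --silence-limit-ms 700 therefore corresponds to roughly 22 consecutive silent VAD
# windows, and that limit is halved once --speech-soft-limit-ms of speech has accumulated.
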
+from __future__ import annotations + +import logging +from argparse import ArgumentParser, Namespace +from typing import Any, Dict, List, Optional, Union + +import torch +from fairseq2.assets import asset_store +from seamless_communication.inference.translator import Modality, Translator +from seamless_communication.models.generator.loader import load_pretssel_vocoder_model +from seamless_communication.models.generator.vocoder import PretsselVocoder +from seamless_communication.models.monotonic_decoder import ( + load_monotonic_decoder_config, + load_monotonic_decoder_model, +) +from seamless_communication.models.unity import ( + load_unity_config, + load_unity_model, + load_unity_text_tokenizer, + load_unity_unit_tokenizer, +) +from seamless_communication.models.vocoder.loader import load_vocoder_model +from seamless_communication.models.vocoder.vocoder import Vocoder +from seamless_communication.streaming.agents.common import ( + AgentStates, + EarlyStoppingMixin, +) +from simuleval.agents import AgentPipeline, TreeAgentPipeline +from simuleval.agents.agent import GenericAgent +from simuleval.data.segments import Segment + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +def maybe_reset_states(states: Optional[List[Optional[AgentStates]]]) -> None: + assert states is not None + for s in states: + if s is not None: + if isinstance(s, EarlyStoppingMixin): + s.reset_early() + else: + s.reset() + + +class UnitYPipelineMixin: + """ + Mixin for UnitY pipeline which works with both AgentPipeline + and TreeAgentPipeline + """ + + @classmethod + def add_args(cls, parser: ArgumentParser) -> None: + super().add_args(parser) # type: ignore + parser.add_argument("--task", type=str, help="Task type") + parser.add_argument( + "--unity-model-name", + type=str, + help="Unity model name.", + default="seamless_streaming_unity", + ) + parser.add_argument( + "--monotonic-decoder-model-name", + type=str, + help="Monotonic decoder model name.", + default="seamless_streaming_monotonic_decoder", + ) + + parser.add_argument( + "--sample-rate", + default=16000, + type=float, + ) + parser.add_argument( + "--dtype", + choices=["fp16", "fp32"], + default="fp16", + type=str, + help=( + "Choose between half-precision (fp16) and single precision (fp32) floating point formats." + + " Prefer this over the fp16 flag." + ), + ) + + @classmethod + def load_model(cls, args: Namespace) -> Dict[str, Any]: + if not torch.cuda.is_available() and "cuda" in args.device: + raise ValueError("CUDA not available, use CPU.") + + args.device = torch.device(args.device) + if (args.fp16 or args.dtype == "fp16") and args.device != torch.device("cpu"): + args.dtype = torch.float16 + else: + args.dtype = torch.float32 + + input_modality, output_modality = Translator.get_modalities_from_task_str( + args.task + ) + + if input_modality != Modality.SPEECH: + raise ValueError("`UnitYAgentPipeline` only supports speech input.") + + unity_config = load_unity_config(args.unity_model_name) + unity_config.use_text_decoder = False + unity_config.use_text_encoder = False + + text_tokenizer = load_unity_text_tokenizer(args.unity_model_name) + + # Skip loading the T2U model. 
+ if output_modality == Modality.TEXT: + unity_config.t2u_config = None + unit_tokenizer = None + else: + unit_tokenizer = load_unity_unit_tokenizer(args.unity_model_name) + + asset_card = asset_store.retrieve_card(args.unity_model_name) + asset_card.field("model_config").set(unity_config) + + logger.info( + f"Loading the UnitY model: {args.unity_model_name} on device={args.device}, dtype={args.dtype}" + ) + unity_model = load_unity_model(asset_card, device=args.device, dtype=args.dtype) + unity_model.eval() + + monotonic_decoder_config = load_monotonic_decoder_config( + args.monotonic_decoder_model_name + ) + logger.info( + f"Loading the Monotonic Decoder model: {args.monotonic_decoder_model_name} on device={args.device}, dtype={args.dtype}" + ) + monotonic_decoder_model = load_monotonic_decoder_model( + args.monotonic_decoder_model_name, device=args.device, dtype=args.dtype + ) + monotonic_decoder_model.eval() + + return { + "unity_model": unity_model, + "unity_config": unity_config, + "monotonic_decoder_model": monotonic_decoder_model, + "monotonic_decoder_config": monotonic_decoder_config, + "text_tokenizer": text_tokenizer, + "unit_tokenizer": unit_tokenizer, + } + + +class UnitYAgentPipeline(UnitYPipelineMixin, AgentPipeline): # type: ignore + pipeline: List[GenericAgent] = [] + + def __init__(self, args: Namespace): + models_and_configs = self.load_model(args) + + module_list = [] + for p in self.pipeline: + module_list.append( + p.from_args( + args, + **models_and_configs, + ) + ) + + super().__init__(module_list) + + def pop(self, states: Optional[List[Optional[AgentStates]]] = None) -> Segment: + output_segment = super().pop(states) + if states is None: + # Not stateless + first_states = self.module_list[0].states + else: + assert len(states) == len(self.module_list) + first_states = states[0] + + if not first_states.source_finished and output_segment.finished: + # An early stop. + # The temporary solution is to start over + if states is not None: + maybe_reset_states(states) + else: + self.reset() + output_segment.finished = False + + return output_segment + + @classmethod + def from_args(cls, args: Any) -> UnitYAgentPipeline: + return cls(args) + + +class UnitYAgentTreePipeline(UnitYPipelineMixin, TreeAgentPipeline): # type: ignore + pipeline: Any = {} + + def __init__(self, args: Namespace): + models_and_configs = self.load_model(args) + + assert len(self.pipeline) > 0 + module_dict = {} + for module_class, children in self.pipeline.items(): + module_dict[module_class.from_args(args, **models_and_configs)] = children + + super().__init__(module_dict, args) + + @classmethod + def from_args(cls, args: Any) -> UnitYAgentPipeline: + return cls(args) + + def pop( + self, states: Optional[List[Optional[AgentStates]]] = None + ) -> List[Segment]: + output_segment = super().pop(states) + if states is None: + # Not stateless + first_states = self.source_module.states + else: + assert len(states) == len(self.module_dict) + first_states = states[self.source_module] + + if isinstance(output_segment, list): + finished = any(segment.finished for segment in output_segment) + else: + # case when output_index is used + finished = output_segment.finished + if not first_states.source_finished and finished: + # An early stop. 
+ # The temporary solution is to start over + if states is not None: + maybe_reset_states(states) + else: + self.reset() + if isinstance(output_segment, list): + for segment in output_segment: + segment.finished = False + else: + output_segment.finished = False + + return output_segment # type: ignore[no-any-return] diff --git a/seamless_communication/src/seamless_communication/streaming/dataloaders/__init__.py b/seamless_communication/src/seamless_communication/streaming/dataloaders/__init__.py new file mode 100644 index 0000000..4ac40e1 --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/dataloaders/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from seamless_communication.streaming.dataloaders.s2tt import ( + SimulEvalSpeechToTextDataloader as SimulEvalSpeechToTextDataloader, +) diff --git a/seamless_communication/src/seamless_communication/streaming/dataloaders/s2tt.py b/seamless_communication/src/seamless_communication/streaming/dataloaders/s2tt.py new file mode 100644 index 0000000..edc0fbe --- /dev/null +++ b/seamless_communication/src/seamless_communication/streaming/dataloaders/s2tt.py @@ -0,0 +1,249 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from __future__ import annotations + +import logging +import subprocess +from argparse import ArgumentParser, Namespace +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional + +import torch +import torch.nn.functional as F +from fairseq2.data.audio import AudioDecoder +from fairseq2.data.data_pipeline import Collater, DataPipeline, FileMapper +from fairseq2.data.text.converters import StrSplitter +from fairseq2.data.text.text_reader import read_text +from simuleval.data.dataloader import register_dataloader +from simuleval.data.dataloader.dataloader import IterableDataloader +from simuleval.data.dataloader.s2t_dataloader import SpeechToTextDataloader + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s -- %(name)s: %(message)s", +) + +logger = logging.getLogger(__name__) + + +@dataclass +class SoundFileInfo: + samplerate: float + path: str + + def __repr__(self) -> str: + return "\n".join([f"samplerate: {str(self.samplerate)}", f"path: {self.path}"]) + + +def count_lines(filename: Path) -> int: + result = subprocess.run(["wc", "-l", filename], stdout=subprocess.PIPE) + return int(result.stdout.decode().split()[0]) - 1 + + +class SileroVADSilenceRemover: + def __init__(self, sample_rate: int = 16000) -> None: + self.sample_rate = sample_rate + self.model, self.utils = torch.hub.load( + repo_or_dir="snakers4/silero-vad", + model="silero_vad", + # force_reload=True, + onnx=False, + ) + + def __call__(self, sample: torch.Tensor, is_standardized: bool) -> List[float]: + if not is_standardized: + # Standardizing here just for getting silence boundaries + standarized_sample_list = F.layer_norm(sample, sample.shape).tolist() + else: + standarized_sample_list = sample.tolist() + + ( + get_speech_timestamps, + save_audio, + read_audio, + VADIterator, + collect_chunks, + ) = self.utils + speech_timestamps = get_speech_timestamps( + standarized_sample_list, self.model, sampling_rate=self.sample_rate + ) + 
+ sample_list: List[float] = sample.tolist() + if len(speech_timestamps) == 0: + return sample_list + speech_start_time = speech_timestamps[0]["start"] + speech_end_time = speech_timestamps[-1]["end"] + return sample_list[int(speech_start_time) : int(speech_end_time)] + + +@register_dataloader("fairseq2_s2tt") +class SimulEvalSpeechToTextDataloader(SpeechToTextDataloader, IterableDataloader): # type: ignore + def __init__( + self, data_pipeline: DataPipeline, is_standardized: bool, args: Namespace + ) -> None: + self.args = args + self.data_file: Path = Path(getattr(self.args, "data_file", "")) + if not self.data_file.exists(): + raise ValueError(f"data_file: {self.data_file} does not exist.") + self.start_index: int = getattr(self.args, "start_index", 0) + self.end_index: int = getattr(self.args, "end_index", -1) + self.data_pipeline = data_pipeline + self.is_standardized = is_standardized + self.data_itr = iter(self.data_pipeline) + self.cur_index = self.start_index - 1 + self.no_strip_silence = self.args.no_strip_silence + self.silence_remover = None + if not self.no_strip_silence: + logger.warn( + "Stripping silence in the beginning and the end of audio with SileroVAD." + ) + self.silence_remover = SileroVADSilenceRemover() + + def __iter__(self) -> SimulEvalSpeechToTextDataloader: + return self + + def __next__(self) -> SimulEvalSpeechToTextDataloader: + if self.cur_index >= self.end_index - 1: + raise StopIteration + self.item = next(self.data_itr) + self.cur_index += 1 + return self + + def reset(self) -> None: + self.cur_index = 0 + self.data_pipeline.reset() + + def __len__(self) -> int: + if self.end_index > 0: + return self.end_index - self.start_index + self.end_index = count_lines(self.data_file) + return self.end_index - self.start_index + + def get_source(self, index: Optional[int] = None) -> List[float]: + squeezed_item = self.item["audio"]["data"]["waveform"]["seqs"].squeeze() + + if not self.no_strip_silence and self.silence_remover is not None: + source = self.silence_remover(squeezed_item, self.is_standardized) + else: + source = squeezed_item.tolist() + + return source + + def get_target(self, index: Optional[int] = None) -> str: + return str(self.item[self.args.ref_field][0]) + + def get_tgt_lang(self, index: Optional[int] = None) -> Optional[str]: + if self.args.tgt_lang: + tgt_lang: str = self.args.tgt_lang + return tgt_lang + + tgt_lang = self.item.get("tgt_lang") + return str(tgt_lang[0]) if tgt_lang else None + + def get_source_audio_info(self, index: Optional[int] = None) -> SoundFileInfo: + samplerate = self.item["audio"]["data"]["sample_rate"][0] + path = f'{self.args.audio_root_dir}/{str(self.item["audio"]["path"][0])}' + return SoundFileInfo(samplerate, path) + + def get_source_audio_path(self, index: Optional[int] = None) -> str: + return str(self.item["audio"]["path"][0]) + + @classmethod + def from_args(cls, args: Namespace) -> SimulEvalSpeechToTextDataloader: + with open(args.data_file, "r") as f: + header = f.readline().strip("\n").split("\t") + + split_tsv = StrSplitter(names=header) + + start_index: int = getattr(args, "start_index", 0) + + pipeline_builder = ( + read_text(args.data_file, rtrim=True).skip(1 + start_index).map(split_tsv) + ) + + map_file = FileMapper(root_dir=args.audio_root_dir, cached_fd_count=10) + + pipeline_builder.map(map_file, selector="audio") + + device = getattr(args, "device", None) + assert device is not None + + decode_audio = AudioDecoder(dtype=torch.float32, device=torch.device(device)) + + pipeline_builder.map( + 
decode_audio,
+            selector="audio.data",
+        )
+
+        is_standardized = False
+        if args.standardize_audio:
+            pipeline_builder.map(
+                lambda x: F.layer_norm(x, x.shape),
+                selector="audio.data.waveform",
+            )
+            is_standardized = True
+
+        collate = Collater(pad_value=0, pad_to_multiple=1)
+
+        pipeline_builder.map(collate)
+
+        pipeline_builder.prefetch(1)
+
+        data_pipeline = pipeline_builder.and_return()
+
+        return cls(data_pipeline, is_standardized, args)
+
+    @staticmethod
+    def add_args(parser: ArgumentParser) -> None:
+        parser.add_argument(
+            "--data-file",
+            type=str,
+            required=True,
+            help="Data file (.tsv) to be evaluated.",
+        )
+        parser.add_argument(
+            "--audio-root-dir",
+            type=str,
+            help="Root directory for the audio filenames in the data file.",
+            default="",
+        )
+        parser.add_argument(
+            "--ref-field",
+            type=str,
+            help="Reference target text field to compute the BLEU score against.",
+            default="tgt_text",
+        )
+        parser.add_argument(
+            "--source-segment-size",
+            type=int,
+            default=1,
+            help="Source segment size: number of tokens for text input, milliseconds for speech input.",
+        )
+        parser.add_argument(
+            "--tgt-lang",
+            default="eng",
+            type=str,
+            help="Target language to translate/transcribe into.",
+        )
+        parser.add_argument(
+            "--output",
+            type=str,
+            required=True,
+            help="Output directory. Required if using the iterable dataloader.",
+        )
+        parser.add_argument(
+            "--no-strip-silence",
+            action="store_true",
+            default=False,
+            help="Do not strip silence at the beginning and the end of the audio.",
+        )
+        parser.add_argument(
+            "--standardize-audio",
+            action="store_true",
+            help="Standardize audio.",
+        )
diff --git a/seamless_communication/src/seamless_communication/toxicity/__init__.py b/seamless_communication/src/seamless_communication/toxicity/__init__.py
new file mode 100644
index 0000000..c542178
--- /dev/null
+++ b/seamless_communication/src/seamless_communication/toxicity/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# MIT_LICENSE file in the root directory of this source tree.
+
+from seamless_communication.toxicity.etox_bad_word_checker import (
+    ETOXBadWordChecker as ETOXBadWordChecker,
+)
+from seamless_communication.toxicity.etox_bad_word_checker import (
+    load_etox_bad_word_checker as load_etox_bad_word_checker,
+)
diff --git a/seamless_communication/src/seamless_communication/toxicity/etox_bad_word_checker.py b/seamless_communication/src/seamless_communication/toxicity/etox_bad_word_checker.py
new file mode 100644
index 0000000..a6e926d
--- /dev/null
+++ b/seamless_communication/src/seamless_communication/toxicity/etox_bad_word_checker.py
@@ -0,0 +1,212 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# MIT_LICENSE file in the root directory of this source tree.
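
The dataloader above expects a tab-separated manifest with a header row; the code only requires an "audio" column (path relative to --audio-root-dir), a reference text column (--ref-field, "tgt_text" by default) and, optionally, a "tgt_lang" column. A hedged sketch of such a manifest (the file name and utterance rows are illustrative):

from pathlib import Path

rows = [
    "audio\ttgt_text\ttgt_lang",
    "utt0001.wav\tHello, how are you?\teng",
    "utt0002.wav\tSee you tomorrow.\teng",
]
Path("dev_manifest.tsv").write_text("\n".join(rows) + "\n", encoding="utf-8")
# The class is registered with SimulEval under the name "fairseq2_s2tt" (see
# register_dataloader above), so an evaluation run points --data-file at this
# manifest and --audio-root-dir at the directory holding the wav files.
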
+ +import codecs +import re +from pathlib import Path +from typing import Dict, List, Set, Union + +from fairseq2.assets import ( + AssetCard, + AssetDownloadManager, + AssetStore, + asset_store as base_asset_store, + download_manager as base_download_manager, +) +from fairseq2.data import StringLike +from fairseq2.data.text import SentencePieceEncoder, SentencePieceModel + + +class ETOXBadWordChecker: + bad_words: Dict[str, List[str]] + bad_word_variants: Dict[str, Dict[str, List[str]]] + sp_encoder: SentencePieceEncoder + sp_langs: Set[str] + + def __init__( + self, + bad_words: Dict[str, List[str]], + bad_word_variants: Dict[str, Dict[str, List[str]]], + sp_encoder: SentencePieceEncoder, + sp_langs: Set[str], + ): + self.bad_words = bad_words + self.bad_word_variants = bad_word_variants + self.sp_encoder = sp_encoder + self.sp_langs = sp_langs + + def extract_bad_words( + self, + source_text: str, + target_text: str, + source_lang: str, + target_lang: str, + ) -> List[str]: + bad_words_in_target_text = self.get_bad_words( + target_text, + target_lang, + ) + + # If there are no bad words in the target text, do nothing. + if len(bad_words_in_target_text) == 0: + return [] + + bad_words_in_source_text = self.get_bad_words( + source_text, + source_lang, + ) + + # If there are bad words in the source text, do nothing. + if len(bad_words_in_source_text) > 0: + return [] + + bad_words: List[str] = [] + + for word in bad_words_in_target_text: + bad_words.extend(self.bad_word_variants[target_lang][word]) + + return bad_words + + def get_bad_words(self, text: str, lang: str) -> List[str]: + try: + bad_words = self.bad_words[lang] + except KeyError as e: + raise RuntimeError(f"MinTox model does not support {lang}.") from e + + text = self._preprocess(text) + + if lang in self.sp_langs: + return self._find_bad_words_in_sp(text, bad_words) + + return self._find_bad_words(text, bad_words) + + @staticmethod + def _preprocess(text: str) -> str: + return re.sub(r"[\W+]", " ", text.lower()) + + @staticmethod + def _find_bad_words(text: str, bad_words: List[str]) -> List[str]: + output: List[str] = [] + + text = " " + text.lower() + " " + + bad_words = [" " + word.lower() + " " for word in bad_words] + + for word in bad_words: + if word in text: + output.append(word) + + return [word.strip(" ") for word in output] + + def _find_bad_words_in_sp(self, text: str, bad_words: List[str]) -> List[str]: + text_tokens = self.sp_encoder.encode_as_tokens(text.lower()) + + output: List[str] = [] + + for word in bad_words: + word_tokens = self.sp_encoder.encode_as_tokens(word.lower()) + + if self._contains_tokens(text_tokens, word_tokens): + output.append(str(word)) + + return output + + @staticmethod + def _contains_tokens( + text_tokens: List[StringLike], word_tokens: List[StringLike] + ) -> bool: + for i in range(len(text_tokens) - len(word_tokens) + 1): + for j in range(len(word_tokens)): + if text_tokens[i + j] != word_tokens[j]: + break + else: + return True + + return False + + +class ETOXBadWordCheckerLoader: + asset_store: AssetStore + download_manager: AssetDownloadManager + + def __init__( + self, + asset_store: AssetStore, + download_manager: AssetDownloadManager, + ) -> None: + self.asset_store = asset_store + self.download_manager = download_manager + + def __call__( + self, + model_name_or_card: Union[str, AssetCard], + ) -> ETOXBadWordChecker: + if isinstance(model_name_or_card, AssetCard): + card = model_name_or_card + else: + card = self.asset_store.retrieve_card(model_name_or_card) + + bad_words: 
Dict[str, List[str]] = {} + + bad_word_variants: Dict[str, Dict[str, List[str]]] = {} + + etox_lang_variants = card.field("etox_lang_variants").as_set(str) + + etox_ds_uri = card.field("etox_dataset").as_uri() + + etox_ds_path = self.download_manager.download_dataset(etox_ds_uri, "etox") + + for word_file in etox_ds_path.iterdir(): + lang = word_file.name[:8] + + if lang not in etox_lang_variants: + lang = lang[:3] + + words = self._load_words(word_file) + + bad_words[lang] = words + + bad_word_variants[lang] = {} + + for word in words: + bad_word_variants[lang][word] = [ + word.lower(), + word.upper(), + word.capitalize(), + ] + + sp_uri = card.field("sp_model").as_uri() + + sp_pathname = self.download_manager.download_tokenizer(sp_uri, card.name) + + sp_model = SentencePieceModel(sp_pathname) + + sp_encoder = SentencePieceEncoder(sp_model) + + sp_langs = card.field("sp_langs").as_set(str) + + return ETOXBadWordChecker( + bad_words, + bad_word_variants, + sp_encoder, + sp_langs, + ) + + @staticmethod + def _load_words(pathname: Path) -> List[str]: + words: List[str] = [] + + with open(pathname, "r", encoding="utf-8") as fp: + for line in fp.readlines(): + words.append(codecs.encode(line, "rot_13").rstrip("\n")) + + return list(set(words)) # Dedup. + + +load_etox_bad_word_checker = ETOXBadWordCheckerLoader( + base_asset_store, + base_download_manager, +) diff --git a/seamless_communication/src/seamless_communication/toxicity/mintox.py b/seamless_communication/src/seamless_communication/toxicity/mintox.py new file mode 100644 index 0000000..aa772be --- /dev/null +++ b/seamless_communication/src/seamless_communication/toxicity/mintox.py @@ -0,0 +1,221 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import logging +from typing import List, Optional, Tuple + +from torch import Tensor +import torch +from torch.nn import functional as F + + +from seamless_communication.inference import SequenceGeneratorOptions +from seamless_communication.toxicity.etox_bad_word_checker import ( + ETOXBadWordChecker, +) +from fairseq2.generation import BannedSequenceProcessor +from fairseq2.data.text.text_tokenizer import TextTokenizer +from fairseq2.data.typing import StringLike +from fairseq2.typing import Device +from fairseq2.data import SequenceData +from fairseq2.nn.padding import get_seqs_and_padding_mask +from seamless_communication.models.unity import ( + UnitTokenizer, + UnitYModel, +) + + +logger = logging.getLogger(__name__) + + +def _extract_bad_words_with_batch_indices( + source_texts: List[StringLike], + target_texts: List[StringLike], + source_lang: str, + target_lang: str, + bad_word_checker: ETOXBadWordChecker, +) -> Tuple[List[str], List[int]]: + all_bad_words, batch_indices = [], [] + + for idx, (source_text, target_text) in enumerate(zip(source_texts, target_texts)): + bad_words = bad_word_checker.extract_bad_words( + str(source_text), str(target_text), source_lang, target_lang + ) + + if bad_words: + batch_indices.append(idx) + + all_bad_words.extend(bad_words) + + return all_bad_words, batch_indices + + +def _replace_with_new_text_output_in_batch( + original_texts: List[StringLike], + indices_with_toxicity: List[int], + new_texts: List[StringLike], +) -> None: + new_idx = 0 + # indices_with_toxicity is a small list, using list should be fast enough. 
+    for original_idx in range(len(original_texts)):
+        if original_idx in indices_with_toxicity:
+            original_texts[original_idx] = new_texts[new_idx]
+            new_idx += 1
+
+
+def _replace_with_new_unit_output_in_batch(
+    unit_tokenizer: UnitTokenizer,
+    original_units: Tensor,
+    indices_with_toxicity_tensor: Tensor,
+    new_units: Tensor,
+) -> None:
+    original_units_length = original_units.size(1)
+    new_units_length = new_units.size(1)
+    length_diff = abs(new_units_length - original_units_length)
+    nb_pads = (0, length_diff)
+    pad_idx = unit_tokenizer.vocab_info.pad_idx or 1
+    if new_units_length > original_units_length:
+        # pad on the original units
+        original_units = F.pad(
+            original_units, pad=nb_pads, mode="constant", value=pad_idx
+        )
+    else:
+        # pad on the new units
+        new_units = F.pad(
+            new_units, pad=nb_pads, mode="constant", value=pad_idx
+        )
+    original_units[indices_with_toxicity_tensor] = new_units
+
+
+def mintox_pipeline(
+    model: UnitYModel,
+    text_tokenizer: TextTokenizer,
+    unit_tokenizer: UnitTokenizer,
+    device: Device,
+    src_lang: str,
+    tgt_lang: str,
+    model_input: SequenceData,
+    input_modality: "Modality",
+    output_modality: "Modality",
+    src_texts: List[StringLike],
+    original_texts: List[StringLike],
+    original_units: Optional[Tensor] = None,
+    unit_generation_ngram_filtering: bool = False,
+    text_generation_opts: Optional[SequenceGeneratorOptions] = None,
+    unit_generation_opts: Optional[SequenceGeneratorOptions] = None,
+    bad_word_checker: ETOXBadWordChecker = None,
+    duration_factor: float = 1.0,
+    prosody_encoder_input: Optional[SequenceData] = None,
+) -> Tuple[List[StringLike], Optional[Tensor]]:
+    """MinTox: Mitigation at INference time of added TOXicity."""
+    from seamless_communication.inference.translator import Modality, Translator
+
+    if text_generation_opts is None:
+        text_generation_opts = SequenceGeneratorOptions(
+            beam_size=5, soft_max_seq_len=(1, 200)
+        )
+    if unit_generation_opts is None:
+        unit_generation_opts = SequenceGeneratorOptions(
+            beam_size=5, soft_max_seq_len=(25, 50)
+        )
+
+    def _get_banned_sequence_processor(
+        banned_sequences: List[str],
+    ) -> BannedSequenceProcessor:
+        text_encoder = text_tokenizer.create_raw_encoder(device=device)
+
+        banned_seqs = [text_encoder(b) for b in banned_sequences]
+        # A banned string often appears after punctuation or other symbols, so we want
+        # to include that sequence of token ids as well.
+        # So we can ban not only the string "shit" but also "*shit", ",shit" etc.
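+        # The "★" prefix forces a non-word-initial encoding; dropping the first token
+        # (the piece covering "★") leaves the sub-word ids the word takes when it is
+        # glued to a preceding symbol, e.g. "*shit" or ",shit".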
+        banned_seqs += [text_encoder(f"★{x}")[1:] for x in banned_sequences]
+        return BannedSequenceProcessor(banned_seqs)
+
+    bad_words, indices_with_toxicity = _extract_bad_words_with_batch_indices(
+        src_texts,
+        original_texts,
+        src_lang,
+        tgt_lang,
+        bad_word_checker,
+    )
+
+    if len(indices_with_toxicity) == 0:
+        # if no added toxicity is found, return the original output
+        if output_modality == Modality.TEXT:
+            return original_texts, None
+        else:
+            return original_texts, original_units
+    else:
+        logger.info(
+            "TOX src_lang=%s tgt_lang=%s added_tox=%d",
+            src_lang,
+            tgt_lang,
+            len(indices_with_toxicity),
+        )
+        # otherwise, redo the prediction with a list of bad words to ban
+        banned_sequence_processor = _get_banned_sequence_processor(
+            banned_sequences=list(set(bad_words)),
+        )
+        text_generation_opts.step_processor = banned_sequence_processor
+        # select only the sources with toxicity
+        indices_with_toxicity_tensor = torch.tensor(
+            indices_with_toxicity, device=device
+        )
+        if model_input["is_ragged"]:
+            model_input["seqs"] = torch.index_select(
+                input=model_input["seqs"],
+                dim=0,
+                index=indices_with_toxicity_tensor,
+            )
+            model_input["seq_lens"] = torch.index_select(
+                input=model_input["seq_lens"],
+                dim=0,
+                index=indices_with_toxicity_tensor,
+            )
+        seqs, padding_mask = get_seqs_and_padding_mask(model_input)
+        # redo the prediction
+        new_texts, new_units = Translator.get_prediction(
+            model=model,
+            text_tokenizer=text_tokenizer,
+            unit_tokenizer=unit_tokenizer,
+            seqs=seqs,
+            padding_mask=padding_mask,
+            input_modality=input_modality,
+            output_modality=output_modality,
+            tgt_lang=tgt_lang,
+            unit_generation_ngram_filtering=unit_generation_ngram_filtering,
+            text_generation_opts=text_generation_opts,
+            unit_generation_opts=unit_generation_opts,
+            duration_factor=duration_factor,
+            prosody_encoder_input=prosody_encoder_input,
+        )
+        batch_size = len(original_texts)
+        if batch_size > 1:
+            # reconstruct the text output by updating the original one in place
+            _replace_with_new_text_output_in_batch(
+                original_texts, indices_with_toxicity, new_texts
+            )
+            final_texts = original_texts
+        else:
+            final_texts = new_texts
+
+        if output_modality == Modality.TEXT:
+            return final_texts, None
+        else:
+            if batch_size > 1:
+                assert original_units is not None
+                assert new_units is not None
+                # reconstruct the unit output by updating the original one in place
+                _replace_with_new_unit_output_in_batch(
+                    unit_tokenizer,
+                    original_units,
+                    indices_with_toxicity_tensor,
+                    new_units,
+                )
+                final_units = original_units
+            else:
+                final_units = new_units
+            return final_texts, final_units diff --git a/seamless_communication/tests/__init__.py b/seamless_communication/tests/__init__.py new file mode 100644 index 0000000..6d8330b --- /dev/null +++ b/seamless_communication/tests/__init__.py @@ -0,0 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# MIT_LICENSE file in the root directory of this source tree.
+
+import pytest
+
+pytest.register_assert_rewrite("tests.common") diff --git a/seamless_communication/tests/common.py b/seamless_communication/tests/common.py new file mode 100644 index 0000000..6f49708 --- /dev/null +++ b/seamless_communication/tests/common.py @@ -0,0 +1,116 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# MIT_LICENSE file in the root directory of this source tree.
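The checker and pipeline above are normally driven through `Translator.predict(..., apply_mintox=True)` rather than called directly. A minimal sketch of that flow, reusing the model, vocoder, and asset-card names from the integration tests further down (tests/integration/inference/test_mintox.py); the expected variants in the comment come from the t2tt test there:

import torch
from fairseq2.typing import Device
from seamless_communication.inference import Translator
from seamless_communication.toxicity import load_etox_bad_word_checker

device = Device("cpu")

# Checker backed by the ETOX word lists referenced by the "mintox" asset card.
checker = load_etox_bad_word_checker("mintox")

# A word that appears in the translation but not in the source counts as added toxicity.
added = checker.extract_bad_words(
    source_text="I wonder what it'd be like to be a doff parent.",
    target_text="Je me demande à quoi ça ressemblerait d'être un parent débile.",
    source_lang="eng",
    target_lang="fra",
)
# Per the t2tt integration test, `added` is expected to be ["débile", "DÉBILE", "Débile"].

# With apply_mintox=True the Translator re-decodes flagged items with those words banned.
translator = Translator(
    "seamlessM4T_v2_large", "vocoder_v2", device, dtype=torch.float32, apply_mintox=True
)
text_output, _ = translator.predict(
    input="I wonder what it'd be like to be a doff parent.",
    task_str="t2tt",
    tgt_lang="fra",
    src_lang="eng",
)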
+
+from contextlib import contextmanager
+from typing import Any, Generator, List, Optional, Union
+
+import torch
+from fairseq2.data import Collater
+from fairseq2.data.audio import WaveformToFbankConverter, WaveformToFbankInput
+from fairseq2.typing import DataType, Device
+from torch import Tensor
+
+# The default device that tests should use. Note that pytest can change it based
+# on the provided command line arguments.
+device = Device("cpu")
+
+
+def assert_close(
+    a: Tensor,
+    b: Union[Tensor, List[Any]],
+    rtol: Optional[float] = None,
+    atol: Optional[float] = None,
+) -> None:
+    """Assert that ``a`` and ``b`` are element-wise equal within a tolerance."""
+    if not isinstance(b, Tensor):
+        b = torch.tensor(b, device=device, dtype=a.dtype)
+
+    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)  # type: ignore[attr-defined]
+
+
+def assert_equal(a: Tensor, b: Union[Tensor, List[Any]]) -> None:
+    """Assert that ``a`` and ``b`` are element-wise equal."""
+    if not isinstance(b, Tensor):
+        b = torch.tensor(b, device=device, dtype=a.dtype)
+
+    torch.testing.assert_close(a, b, rtol=0, atol=0)  # type: ignore[attr-defined]
+
+
+def assert_unit_close(
+    a: Tensor,
+    b: Union[Tensor, List[Any]],
+    num_unit_tol: int = 1,
+    percent_unit_tol: float = 0.0,
+) -> None:
+    """Assert that two unit sequences are equal within a tolerance."""
+    if not isinstance(b, Tensor):
+        b = torch.tensor(b, device=device, dtype=a.dtype)
+
+    assert (
+        a.shape == b.shape
+    ), f"Two shapes are different, one is {a.shape}, the other is {b.shape}"
+
+    if percent_unit_tol > 0.0:
+        num_unit_tol = int(percent_unit_tol * len(a))
+
+    num_unit_diff = (a != b).sum()
+    assert (
+        num_unit_diff <= num_unit_tol
+    ), f"The difference is beyond tolerance, {num_unit_diff} units are different, tolerance is {num_unit_tol}"
+
+
+def has_no_inf(a: Tensor) -> bool:
+    """Return ``True`` if ``a`` has no positive or negative infinite element."""
+    return not torch.any(torch.isinf(a))
+
+
+def has_no_nan(a: Tensor) -> bool:
+    """Return ``True`` if ``a`` has no NaN element."""
+    return not torch.any(torch.isnan(a))
+
+
+@contextmanager
+def tmp_rng_seed(device: Device, seed: int = 0) -> Generator[None, None, None]:
+    """Set a temporary manual RNG seed.
+
+    The RNG is reset to its original state once the block is exited.
+    """
+    device = Device(device)
+
+    if device.type == "cuda":
+        devices = [device]
+    else:
+        devices = []
+
+    with torch.random.fork_rng(devices):
+        torch.manual_seed(seed)
+
+        yield
+
+
+def get_default_dtype() -> DataType:
+    if device == Device("cpu"):
+        dtype = torch.float32
+    else:
+        dtype = torch.float16
+    return dtype
+
+
+def convert_to_collated_fbank(audio_dict: WaveformToFbankInput, dtype: DataType) -> Any:
+    convert_to_fbank = WaveformToFbankConverter(
+        num_mel_bins=80,
+        waveform_scale=2**15,
+        channel_last=True,
+        standardize=True,
+        device=device,
+        dtype=dtype,
+    )
+
+    collater = Collater(pad_value=1)
+
+    feat = collater(convert_to_fbank(audio_dict))["fbank"]
+
+    return feat diff --git a/seamless_communication/tests/conftest.py b/seamless_communication/tests/conftest.py new file mode 100644 index 0000000..5fc51de --- /dev/null +++ b/seamless_communication/tests/conftest.py @@ -0,0 +1,53 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# MIT_LICENSE file in the root directory of this source tree.
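Among the helpers above, assert_unit_close is the one with non-obvious semantics: it tolerates a bounded number of mismatching units, either as an absolute count or as a fraction of the sequence length. A small self-contained illustration with toy values, not taken from any test:

import torch
from tests.common import assert_unit_close

a = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
b = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 99])  # exactly one differing unit

assert_unit_close(a, b)                        # passes: default num_unit_tol=1
assert_unit_close(a, b, percent_unit_tol=0.2)  # passes: tolerance becomes int(0.2 * 10) == 2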
+ +import tempfile +from argparse import ArgumentTypeError +from typing import cast +from urllib.request import urlretrieve + +import pytest +import torch +from fairseq2.data.audio import AudioDecoder, AudioDecoderOutput +from fairseq2.memory import MemoryBlock +from fairseq2.typing import Device + +import tests.common + + +def parse_device_arg(value: str) -> Device: + try: + return Device(value) + except RuntimeError: + raise ArgumentTypeError(f"'{value}' is not a valid device name.") + + +def pytest_addoption(parser: pytest.Parser) -> None: + # fmt: off + parser.addoption( + "--device", default="cpu", type=parse_device_arg, + help="device on which to run tests (default: %(default)s)", + ) + # fmt: on + + +def pytest_sessionstart(session: pytest.Session) -> None: + tests.common.device = cast(Device, session.config.getoption("device")) + + +@pytest.fixture(scope="module") +def example_rate16k_audio() -> AudioDecoderOutput: + url = "https://dl.fbaipublicfiles.com/seamlessM4T/LJ037-0171_sr16k.wav" + + audio_decoder = AudioDecoder(dtype=torch.float32, device=tests.common.device) + + with tempfile.NamedTemporaryFile() as f: + urlretrieve(url, f.name) + with open(f.name, "rb") as fb: + block = MemoryBlock(fb.read()) + decoded_audio = audio_decoder(block) + + return decoded_audio diff --git a/seamless_communication/tests/integration/__init__.py b/seamless_communication/tests/integration/__init__.py new file mode 100644 index 0000000..15d8859 --- /dev/null +++ b/seamless_communication/tests/integration/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/tests/integration/inference/__init__.py b/seamless_communication/tests/integration/inference/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seamless_communication/tests/integration/inference/test_mintox.py b/seamless_communication/tests/integration/inference/test_mintox.py new file mode 100644 index 0000000..af169fb --- /dev/null +++ b/seamless_communication/tests/integration/inference/test_mintox.py @@ -0,0 +1,130 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +from fairseq2.assets import download_manager +from seamless_communication.inference.translator import Translator +from seamless_communication.toxicity.etox_bad_word_checker import ETOXBadWordChecker +from seamless_communication.toxicity.mintox import _extract_bad_words_with_batch_indices +from tests.common import device, get_default_dtype +from seamless_communication.toxicity import load_etox_bad_word_checker + +import pytest + + +@pytest.fixture +def bad_words_checker() -> ETOXBadWordChecker: + return load_etox_bad_word_checker("mintox") + + +def test_mintox_s2tt(bad_words_checker: ETOXBadWordChecker): + model_name = "seamlessM4T_v2_large" + vocoder_name = "vocoder_v2" + src_text = "The strategy proved effective, cutting off vital military and civilian supplies, although this blockade violated generally accepted international law codified by several international agreements of the past two centuries." 
+ src_lang = "eng" + tgt_lang = "fra" + task = "s2tt" + sample_rate = 16_000 + test_wav_uri = "https://dl.fbaipublicfiles.com/seamlessM4T/inference/mintox/mintox_s2t_test_file.wav" + + input_wav = str(download_manager.download_checkpoint(test_wav_uri, test_wav_uri)) + dtype = get_default_dtype() + + translator_without_mintox = Translator( + model_name, vocoder_name, device, dtype=dtype + ) + translated_texts, _ = translator_without_mintox.predict( + input=input_wav, + task_str=task, + tgt_lang=tgt_lang, + src_lang=src_lang, + sample_rate=sample_rate, + ) + all_bad_words, batch_indices = _extract_bad_words_with_batch_indices( + [src_text], + [str(t) for t in translated_texts], + src_lang, + tgt_lang, + bad_words_checker, + ) + assert all_bad_words == ["violé", "VIOLÉ", "Violé"] + assert batch_indices == [0] + del translator_without_mintox + translator_with_mintox = Translator( + model_name, vocoder_name, device, dtype=dtype, apply_mintox=True + ) + translated_texts, _ = translator_with_mintox.predict( + input=input_wav, + task_str=task, + tgt_lang=tgt_lang, + src_lang=src_lang, + sample_rate=sample_rate, + ) + all_bad_words, batch_indices = _extract_bad_words_with_batch_indices( + [src_text], + [str(t) for t in translated_texts], + src_lang, + tgt_lang, + bad_words_checker, + ) + assert all_bad_words == [] + assert batch_indices == [] + + +def test_mintox_t2tt(bad_words_checker: ETOXBadWordChecker): + model_name = "seamlessM4T_v2_large" + vocoder_name = "vocoder_v2" + src_text = "I wonder what it'd be like to be a doff parent." + src_lang = "eng" + tgt_lang = "fra" + task = "t2tt" + + dtype = get_default_dtype() + + translator_without_mintox = Translator( + model_name, vocoder_name, device, dtype=dtype + ) + translated_texts, _ = translator_without_mintox.predict( + input=src_text, + task_str=task, + tgt_lang=tgt_lang, + src_lang=src_lang, + ) + all_bad_words, batch_indices = _extract_bad_words_with_batch_indices( + [src_text], + [str(t) for t in translated_texts], + src_lang, + tgt_lang, + bad_words_checker, + ) + assert ( + str(translated_texts[0]) + == "Je me demande à quoi ça ressemblerait d'être un parent débile." + ) + assert all_bad_words == ["débile", "DÉBILE", "Débile"] + assert batch_indices == [0] + del translator_without_mintox + translator_with_mintox = Translator( + model_name, vocoder_name, device, dtype=dtype, apply_mintox=True + ) + translated_texts, _ = translator_with_mintox.predict( + input=src_text, + task_str=task, + tgt_lang=tgt_lang, + src_lang=src_lang, + ) + all_bad_words, batch_indices = _extract_bad_words_with_batch_indices( + [src_text], + [str(t) for t in translated_texts], + src_lang, + tgt_lang, + bad_words_checker, + ) + assert ( + str(translated_texts[0]) + == "Je me demande à quoi ça ressemblerait d'être un parent doff." + ) + assert all_bad_words == [] + assert batch_indices == [] diff --git a/seamless_communication/tests/integration/inference/test_translator.py b/seamless_communication/tests/integration/inference/test_translator.py new file mode 100644 index 0000000..dfec1b9 --- /dev/null +++ b/seamless_communication/tests/integration/inference/test_translator.py @@ -0,0 +1,93 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
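For batches larger than one, _extract_bad_words_with_batch_indices reports which positions were flagged alongside the flattened list of banned variants. A sketch of the expected shape of its output, reusing the t2tt sentence pair from the test above and assuming the first, clean pair triggers nothing (its French line is only illustrative):

from seamless_communication.toxicity import load_etox_bad_word_checker
from seamless_communication.toxicity.mintox import _extract_bad_words_with_batch_indices

checker = load_etox_bad_word_checker("mintox")

bad_words, indices = _extract_bad_words_with_batch_indices(
    source_texts=[
        "Hello! I hope you're all doing well.",
        "I wonder what it'd be like to be a doff parent.",
    ],
    target_texts=[
        "Bonjour, j'espère que vous allez tous bien.",
        "Je me demande à quoi ça ressemblerait d'être un parent débile.",
    ],
    source_lang="eng",
    target_lang="fra",
    bad_word_checker=checker,
)
# Expected, given the assertions above: bad_words == ["débile", "DÉBILE", "Débile"], indices == [1]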
+ +from typing import Final + +from seamless_communication.inference import Translator +from tests.common import device, get_default_dtype + +# fmt: off +ENG_SENTENCE: Final = "On Monday, scientists from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly about one U.S. cent each." +DEU_SENTENCE: Final = "Am Montag kündigten Wissenschaftler der Stanford University School of Medicine die Erfindung eines neuen Diagnosewerkzeugs an, das Zellen nach Typ sortieren kann: ein winziger druckbarer Chip, der mit Standard-Tintenstrahldruckern für etwa einen US-Cent hergestellt werden kann." +DEU_SENTENCE_V2: Final = "Am Montag kündigten Wissenschaftler der Stanford University School of Medicine die Erfindung eines neuen diagnostischen Werkzeugs an, das Zellen nach Typ sortieren kann: ein winziger druckbarer Chip, der mit Standard-Tintenstrahldrucker für möglicherweise etwa einen US-Cent pro Stück hergestellt werden kann." +# fmt: on + + +def test_seamless_m4t_large_t2tt() -> None: + model_name = "seamlessM4T_large" + src_lang = "eng" + tgt_lang = "deu" + + dtype = get_default_dtype() + + translator = Translator(model_name, "vocoder_36langs", device, dtype=dtype) + text_output, _ = translator.predict( + ENG_SENTENCE, + "t2tt", + tgt_lang, + src_lang=src_lang, + ) + assert text_output[0] == DEU_SENTENCE, f"'{text_output[0]}' is not '{DEU_SENTENCE}'" + + +def test_seamless_m4t_v2_large_t2tt() -> None: + model_name = "seamlessM4T_v2_large" + src_lang = "eng" + tgt_lang = "deu" + + dtype = get_default_dtype() + + translator = Translator(model_name, "vocoder_v2", device, dtype=dtype) + text_output, _ = translator.predict( + ENG_SENTENCE, + "t2tt", + tgt_lang, + src_lang=src_lang, + ) + assert ( + text_output[0] == DEU_SENTENCE_V2 + ), f"'{text_output[0]}' is not '{DEU_SENTENCE_V2}'" + + +def test_seamless_m4t_v2_large_multiple_tasks() -> None: + model_name = "seamlessM4T_v2_large" + english_text = "Hello! I hope you're all doing well." + ref_spanish_text = "Hola, espero que todo se esté haciendo bien." + ref_spanish_asr_text = "Hola, espero que todo se esté haciendo bien." + + dtype = get_default_dtype() + + translator = Translator(model_name, "vocoder_v2", device, dtype=dtype) + + # Generate english speech for the english text. + _, english_speech_output = translator.predict( + english_text, + "t2st", + "eng", + src_lang="eng", + ) + assert english_speech_output is not None + + # Translate english speech to spanish speech. + spanish_text_output, spanish_speech_output = translator.predict( + english_speech_output.audio_wavs[0][0], + "s2st", + "spa", + ) + assert spanish_speech_output is not None + assert ( + spanish_text_output[0] == ref_spanish_text + ), f"'{spanish_text_output[0]}' is not '{ref_spanish_text}'" + + # Run ASR on the spanish speech. + spanish_asr_text_output, _ = translator.predict( + spanish_speech_output.audio_wavs[0][0], + "asr", + "spa", + ) + assert ( + spanish_asr_text_output[0] == ref_spanish_asr_text + ), f"{spanish_asr_text_output[0]} is not {ref_spanish_asr_text}'" diff --git a/seamless_communication/tests/integration/models/__init__.py b/seamless_communication/tests/integration/models/__init__.py new file mode 100644 index 0000000..15d8859 --- /dev/null +++ b/seamless_communication/tests/integration/models/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/tests/integration/models/test_conformer_shaw.py b/seamless_communication/tests/integration/models/test_conformer_shaw.py new file mode 100644 index 0000000..e4a2bfc --- /dev/null +++ b/seamless_communication/tests/integration/models/test_conformer_shaw.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from fairseq2.data.audio import AudioDecoderOutput +from fairseq2.nn.padding import get_seqs_and_padding_mask + +from seamless_communication.models.conformer_shaw import load_conformer_shaw_model + +from tests.common import ( + convert_to_collated_fbank, + get_default_dtype, + device, +) + +REF_MEAN, REF_STD = -0.0001, 0.1547 + + +def test_conformer_shaw_600m(example_rate16k_audio: AudioDecoderOutput) -> None: + + dtype = get_default_dtype() + audio_dict = example_rate16k_audio + src = convert_to_collated_fbank(audio_dict, dtype=dtype) + seqs, padding_mask = get_seqs_and_padding_mask(src) + + model = load_conformer_shaw_model("conformer_shaw", device=device, dtype=dtype) + model.eval() + + with torch.inference_mode(): + seqs, padding_mask = model.encoder_frontend(seqs, padding_mask) + + seqs, _ = model.encoder(seqs, padding_mask) + + std, mean = torch.std_mean(seqs) + + assert round(mean.item(), 4) == REF_MEAN + assert round(std.item(), 4) == REF_STD diff --git a/seamless_communication/tests/integration/models/test_unity2_aligner.py b/seamless_communication/tests/integration/models/test_unity2_aligner.py new file mode 100644 index 0000000..4355caa --- /dev/null +++ b/seamless_communication/tests/integration/models/test_unity2_aligner.py @@ -0,0 +1,70 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. 
+ +from typing import Final + +import torch +from torch import tensor + +from fairseq2.data.audio import AudioDecoderOutput +from seamless_communication.models.aligner.alignment_extractor import AlignmentExtractor +from tests.common import assert_equal, device, get_default_dtype + + +REF_TEXT = "the examination and testimony of the experts enabled the commision to conclude that five shots may have been fired" + +# fmt: off +REF_DURATIONS_FP16: Final = [[ 1, 1, 2, 1, 1, 5, 5, 6, 4, 3, 2, 3, 4, 4, 2, 2, 2, 1, + 1, 1, 3, 3, 3, 4, 3, 3, 3, 4, 4, 3, 2, 2, 1, 1, 1, 1, + 2, 4, 6, 5, 4, 3, 4, 5, 5, 16, 6, 3, 5, 5, 3, 3, 1, 2, + 1, 1, 1, 2, 3, 2, 3, 1, 3, 3, 3, 2, 2, 4, 2, 2, 2, 3, + 2, 4, 5, 4, 5, 8, 3, 17, 2, 2, 3, 2, 5, 4, 6, 3, 1, 1, + 4, 4, 3, 5, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, + 2, 6, 4, 5, 9, 5, 1, 12]] +# fmt: on + +# fmt: off +REF_DURATIONS_FP32: Final = [[ 1, 1, 2, 1, 1, 5, 5, 6, 4, 3, 2, 3, 4, 4, 2, 2, 2, 1, + 1, 1, 3, 3, 3, 4, 3, 3, 4, 3, 4, 3, 2, 2, 1, 1, 1, 1, + 2, 4, 6, 5, 4, 3, 4, 5, 5, 16, 6, 3, 5, 5, 3, 3, 1, 2, + 1, 1, 1, 2, 3, 2, 3, 1, 3, 3, 3, 2, 2, 4, 2, 2, 2, 3, + 2, 4, 5, 4, 5, 8, 3, 17, 2, 2, 3, 2, 5, 4, 6, 3, 1, 1, + 4, 4, 3, 5, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, + 2, 6, 4, 5, 9, 5, 1, 12]] +# fmt: on + + +def test_aligner(example_rate16k_audio: AudioDecoderOutput) -> None: + aligner_name = "nar_t2u_aligner" + unit_extractor_name = "xlsr2_1b_v2" + unit_extractor_output_layer_n = 35 + unit_extractor_kmeans_uri = "https://dl.fbaipublicfiles.com/seamlessM4T/models/unit_extraction/kmeans_10k.npy" + dtype = get_default_dtype() + if dtype == torch.float32: + ref_tensor = REF_DURATIONS_FP32 + else: + ref_tensor = REF_DURATIONS_FP16 + + audio = example_rate16k_audio["waveform"].mean( + 1 + ) # averaging mono to get [Time] shape required by aligner + + extractor = AlignmentExtractor( + aligner_name, + unit_extractor_name, + unit_extractor_output_layer_n, + unit_extractor_kmeans_uri, + device=device, + dtype=dtype, + ) + + alignment_durations, _, _ = extractor.extract_alignment( + audio, REF_TEXT, plot=False, add_trailing_silence=True + ) + + assert_equal( + alignment_durations, tensor(ref_tensor, device=device, dtype=torch.int64) + ) diff --git a/seamless_communication/tests/unit/__init__.py b/seamless_communication/tests/unit/__init__.py new file mode 100644 index 0000000..15d8859 --- /dev/null +++ b/seamless_communication/tests/unit/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/tests/unit/models/__init__.py b/seamless_communication/tests/unit/models/__init__.py new file mode 100644 index 0000000..15d8859 --- /dev/null +++ b/seamless_communication/tests/unit/models/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/tests/unit/models/unity/__init__.py b/seamless_communication/tests/unit/models/unity/__init__.py new file mode 100644 index 0000000..15d8859 --- /dev/null +++ b/seamless_communication/tests/unit/models/unity/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. diff --git a/seamless_communication/tests/unit/models/unity/test_unity.py b/seamless_communication/tests/unit/models/unity/test_unity.py new file mode 100644 index 0000000..a66cdbc --- /dev/null +++ b/seamless_communication/tests/unit/models/unity/test_unity.py @@ -0,0 +1,238 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# All rights reserved. +# +# This source code is licensed under the license found in the +# MIT_LICENSE file in the root directory of this source tree. + +import pytest +import torch + +from seamless_communication.models.unity import UnitTokenizer +from tests.common import assert_equal, device + + +class TestUnitTokenizer: + def test_init_works(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large" + ) + + assert tokenizer.num_units == 100 + + assert tokenizer.lang_map == {"eng": 0, "deu": 1, "fra": 2} + + assert tokenizer.vocab_info.size == 112 + + def test_lang_to_index_works(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large" + ) + + assert tokenizer.lang_to_index("eng") == 108 + assert tokenizer.lang_to_index("deu") == 109 + assert tokenizer.lang_to_index("fra") == 110 + + def test_lang_to_index_works_nar_decoder(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, + langs=["eng", "deu", "fra"], + model_arch="seamlessM4T_large_v2", + ) + assert tokenizer.vocab_info.size == 108 + + assert tokenizer.lang_to_index("eng") == 104 + assert tokenizer.lang_to_index("deu") == 105 + assert tokenizer.lang_to_index("fra") == 106 + + def test_lang_to_index_raises_error_when_lang_is_not_supported(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large" + ) + + with pytest.raises( + ValueError, + match=r"^`lang` must be one of the supported languages, but is 'foo' instead\. 
Supported languages: eng, deu, fra$", + ): + tokenizer.lang_to_index("foo") + + def test_index_to_lang_works(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large" + ) + + assert tokenizer.index_to_lang(108) == "eng" + assert tokenizer.index_to_lang(109) == "deu" + assert tokenizer.index_to_lang(110) == "fra" + + def test_index_to_lang_works_nar_decoder(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, + langs=["eng", "deu", "fra"], + model_arch="seamlessM4T_large_v2", + ) + + assert tokenizer.index_to_lang(104) == "eng" + assert tokenizer.index_to_lang(105) == "deu" + assert tokenizer.index_to_lang(106) == "fra" + + def test_vocab_control_symbols(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large" + ) + + assert tokenizer.vocab_info.bos_idx == 0 + assert tokenizer.vocab_info.pad_idx == 1 + assert tokenizer.vocab_info.eos_idx == 2 + assert tokenizer.vocab_info.unk_idx == 3 + + def test_index_to_lang_raises_error_when_idx_is_out_of_range(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large" + ) + + with pytest.raises( + ValueError, + match=r"^`idx` must correspond to one of the supported language symbol indices \(0 to 2\), but is 1234 instead\.$", + ): + tokenizer.index_to_lang(1234) + + +class TestUnitEncoder: + def test_init_raises_error_when_lang_is_not_supported(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large" + ) + + with pytest.raises( + ValueError, + match=r"^`lang` must be one of the supported languages\, but is 'xyz' instead\. Supported languages: eng, deu, fra$", + ): + tokenizer.create_encoder(lang="xyz", device=device) + + def test_call_works(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large" + ) + + prefix = torch.tensor([2, 109], device=device, dtype=torch.int64) + + encoder = tokenizer.create_encoder(lang="deu", device=device) + + # Empty units. + units = torch.ones((1, 0), device=device, dtype=torch.int64) + + assert_equal(encoder(units), prefix.expand(1, -1)) + + # Batched units. + units = torch.ones((6, 4), device=device, dtype=torch.int64) + + assert_equal( + encoder(units), torch.cat([prefix.expand(6, -1), units + 4], dim=1) + ) + + def test_call_works_nar_decoder(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, + langs=["eng", "deu", "fra"], + model_arch="seamlessM4T_large_v2", + ) + + encoder = tokenizer.create_encoder(lang="deu", device=device) + + # Empty units. + units = torch.ones((1, 0), device=device, dtype=torch.int64) + + assert_equal(encoder(units), units) + + # Batched units. 
+ units = torch.ones((6, 4), device=device, dtype=torch.int64) + + assert_equal(encoder(units), units + 4) + + def test_call_works_when_units_have_unks(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large" + ) + + encoder = tokenizer.create_encoder(lang="deu", device=device) + + units = torch.ones((6, 4), device=device, dtype=torch.int64) + + units[1, 3] = 100 + units[2, 1] = 101 + + token_indices = encoder(units) + + assert token_indices[1, 5].item() == tokenizer.vocab_info.unk_idx + assert token_indices[2, 3].item() == tokenizer.vocab_info.unk_idx + + def test_call_works_when_units_have_unks_nar_decoder(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, + langs=["eng", "deu", "fra"], + model_arch="seamlessM4T_large_v2", + ) + + encoder = tokenizer.create_encoder(lang="deu", device=device) + + units = torch.ones((6, 4), device=device, dtype=torch.int64) + + units[1, 3] = 100 + units[2, 1] = 101 + + token_indices = encoder(units) + + assert token_indices[1, 3].item() == tokenizer.vocab_info.unk_idx + assert token_indices[2, 1].item() == tokenizer.vocab_info.unk_idx + + +class TestUnitDecoder: + def test_call_works(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large" + ) + + encoder = tokenizer.create_encoder(lang="deu", device=device) + decoder = tokenizer.create_decoder() + + assert tokenizer.vocab_info.eos_idx is not None + assert tokenizer.vocab_info.pad_idx is not None + + units1 = torch.ones((6, 4), device=device, dtype=torch.int64) + + encoded_units = encoder(units1) + + encoded_units[2, 2] = tokenizer.vocab_info.eos_idx + + units2 = decoder(encoded_units) + + units1[2, 2] = tokenizer.vocab_info.pad_idx + + prefix = torch.tensor([109], device=device, dtype=torch.int64) + + assert_equal(torch.cat([prefix.expand(6, -1), units1], dim=1), units2) + + def test_call_works_nar_decoder(self) -> None: + tokenizer = UnitTokenizer( + num_units=100, + langs=["eng", "deu", "fra"], + model_arch="seamlessM4T_large_v2", + ) + + encoder = tokenizer.create_encoder(lang="deu", device=device) + decoder = tokenizer.create_decoder() + + assert tokenizer.vocab_info.eos_idx is not None + assert tokenizer.vocab_info.pad_idx is not None + + units1 = torch.ones((6, 4), device=device, dtype=torch.int64) + + encoded_units = encoder(units1) + + encoded_units[2, 2] = tokenizer.vocab_info.eos_idx + + units2 = decoder(encoded_units) + + units1[2, 2] = tokenizer.vocab_info.pad_idx + + assert_equal(units1, units2)
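Taken together, these tests pin down the UnitTokenizer conventions: four control symbols occupy indices 0-3, raw unit ids are shifted by 4, and the autoregressive `seamlessM4T_large` encoder additionally prepends an <eos> token and a language token, while the NAR `seamlessM4T_large_v2` encoder emits only the shifted units. A compact sketch of what the assertions above imply:

import torch
from fairseq2.typing import Device
from seamless_communication.models.unity import UnitTokenizer

device = Device("cpu")

tokenizer = UnitTokenizer(
    num_units=100, langs=["eng", "deu", "fra"], model_arch="seamlessM4T_large"
)
encoder = tokenizer.create_encoder(lang="deu", device=device)

units = torch.zeros((1, 3), dtype=torch.int64, device=device)

# eos (2) and the "deu" language symbol (109) are prepended; unit 0 becomes 0 + 4:
# encoder(units) -> tensor([[2, 109, 4, 4, 4]])
#
# With model_arch="seamlessM4T_large_v2" the prefix is dropped:
# encoder(units) -> tensor([[4, 4, 4]])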