import { useMediaQuery } from "react-responsive";
import ColorBlock from "../components/ColorBlock";
import MailingListWrapper from "../components/MailingListWrapper";
import ModelOutputExamples from "../components/ModelOutputExamples";
import PageTitle from "../components/PageTitle";
import Section from "../components/Section";
import ValsPage from "../components/ValsPage";
import Leaderboard from "../components/leaderboard";
import ModelCard from "../components/modelCard";
import corpFinData from "../data/results/corpfin_results.json";
import { fadeIn } from "../util/animations";

const TakeawaysSection = () => {
  return (
    <Section title="Key Takeaways" id="key-takeaways">
      <ul className="takeaways-list">
        <li>
          The Claude 3.5 Sonnet and GPT4o models clearly performed the best for
          these corporate finance questions -- with Claude 3.5 Sonnet being the
          standout state-of-the-art.
        </li>
        <li>
          Models tended to struggle when questions required them to refer to
          multiple sections at a time. For instance, if an answer was in the
          document but relied on a definition in a different section, the models
          often were not able to refer to this definition. This resulted in
          incomplete or inaccurate answers.
        </li>
        <li>
          The Mixtral (8x7b) model performed surprisingly well. It had no
          refusals and consistently referred to correct sections for direct
          extraction questions.
        </li>
        <li>
          Although all models were tested with the same context windows for
          fairness, there is high potential for long context window models
          (Gemini 1.5, potentially Claude models) to perform these tasks very
          well in real-world applications. Read more about this in Additional
          Notes.
        </li>
      </ul>
    </Section>
  );
};

const BestModels = () => {
  return (
    <Section title="Highest Quality Models" id="highest-quality-models">
      <div className="best-models">
        {fadeIn(
          <ModelCard
            name={"Claude 3.5 Sonnet"}
            icon={"logos/anthropic.png"}
            color={"accentgreen"}
            acc={70.9}
            costIn={3}
            costOut={15}
            latency={1.91}
            desc={
              "Anthropic's recently released Claude 3.5 Sonnet is a strong, fast, and cheap.; It sets the new state-of-the-art on our CorpFin task, narrowly beating out GPT-4o.; It still has much room for improvement particularly in math reasoning questions and multi-step look-up questions."
            }
          />,
          75,
          75
        )}
        {fadeIn(
          <ModelCard
            name={"GPT-4o"}
            icon={"logos/oai.png"}
            color={"accentgreen"}
            acc={70.0}
            costIn={5}
            costOut={15}
            latency={1.67}
            desc={
              "GPT4o remains a close second - it shows strong ability to answer look-up or direct extraction questions.; The cost, latency, and accuracy are all very similar to Claude 3.5 Sonnet - the models remain neck-in-neck.; The model showed non-insignificant variance between runs, even when the temperature was set to 0 and the model was seeded.; It struggled in similar ways to GPT4 - on math reasoning and multi-step look-up questions."
            }
          />,
          75,
          75
        )}
      </div>
    </Section>
  );
};

/**
 * "Context" section: explains why the benchmark exists, introduces the
 * CorpFin dataset (questions about public commercial credit agreements, with
 * a link to an example agreement on sec.gov), and lists the six categories of
 * questions inside a violet ColorBlock. Ends with the mailing-list sign-up
 * wrapper.
 */
const Context = () => {
  // Static content only: this component takes no props and holds no state.
  return (
    <Section title="Context" id="context">
      <div className="context">
        <p className="text-section">
          There has been a considerable effort to measure the performance of
          language models in academic settings and chatbot settings. However,
          these high-level benchmarks are often contrived, or not applicable to
          specific industry use cases. Furthermore, model performance results
          released by LLM providers are highly biased - they are often
          cherry-picked to show state-of-the-art results.
        </p>
        <p className="text-section">
          Here, we start to remedy this by reporting our third-party,
          application-specific findings and live leaderboard results on the
          CorpFin dataset. This dataset, created by vals.ai, consists of
          questions and answers about public commercial credit agreements. These
          credit agreements are contracts used when large corporations receive a
          line of credit from a banking entity (you can find an{" "}
          <a href="https://www.sec.gov/Archives/edgar/data/1357204/000119312511160440/dex1020.htm">
            example agreement here
          </a>
          ). The types of questions are as follows:
        </p>
        <ColorBlock color="violet">
          <ol className="text-section">
            <li>
              <strong>Basic extraction of terms and numbers.</strong> Some
              examples include "Who is the borrower's legal counsel?" or "What
              currencies are the USD 325mm RCF first lien available in?"
            </li>
            <li>
              <strong>Summarization and interpretation questions.</strong> Some
              examples include, "Is there an erroneous payment provision?" and
              "How will the loan proceeds be used?"
            </li>
            <li>
              <strong>Numeric reasoning or calculation-based questions</strong>,
              such as "How much initial debt capacity is available to the
              Borrower on day one?"
            </li>
            <li>
              <strong>
                Questions involving referring to multiple sections of the
                provided text, especially to previous definitions.
              </strong>{" "}
              For instance, one example is "What is the minimum required amount
              of Available Liquidity that the company must maintain?" The model
              must refer to a previous definition of Available Liquidity.
            </li>
            <li>
              <strong>
                Giving opinions of terms based on market standards.
              </strong>{" "}
              For instance, "Are there any unusual terms used to define or
              adjust EBITDA?" These questions require the models to make a
              judgment call, rather than just make a statement of fact.
            </li>
            <li>
              <strong>Questions making use of industry jargon</strong>. There
              are several terms like "baskets", which have a commonly understood
              meaning in the industry, but are almost never explicitly used in
              the agreement itself. An example is "Does the contract contain a
              Chewy Blocker?" (a type of clause meant to prevent a subsidiary
              from being released from its debt obligations).
            </li>
          </ol>
        </ColorBlock>
        <MailingListWrapper />
      </div>
    </Section>
  );
};

const OverallResults = () => {
  const isDesktop = useMediaQuery({ minWidth: 600 });
  return (
    <Section title="Overall Results" id="overall-results">
      <>
        <p className="text-section pb-4">
          The overall results are shown below.
        </p>
        {isDesktop && (
          <iframe
            src="plots/corpfin/all_results_bar.html"
            title="Embedded HTML"
            width="100%"
            height="620px"
            className=""
          />
        )}
        {!isDesktop && (
          <iframe
            src="plots/corpfin/all_results_bar_mobile.html"
            title="Embedded HTML"
            width="100%"
            height="500px"
            className="pb-4 pt-6"
          />
        )}
        <p className="text-section">
          All models perform within a fairly narrow margin of each other. This
          is likely because all models reliably performed well on the easier
          direct extraction questions. However, the Sonnet 3.5 GPT-4o and GPT-4o
          models stood out with their ability to handle some of the harder
          questions -- particularly involving referring to multiple sections or
          performing calculations. This put them a cut above the rest.
        </p>
        <br />
        <p className="text-section pt-4">
          The Mixtral model also performed exceptionally, beating out Gemini,
          GPT 3.5, and Cohere. This was largely because of its low refusal rate.
          While other models would say that they have incomplete information or
          the answer is not contained in the text provided, the Mixtral would
          always attempt an answer (and often be correct).
        </p>
        <br />
        <p className="text-section pt-4">
          Of the open-source models, Llama 3.1 405B performed by far the best,
          cracking the top six. However, it did not achieve SOTA performance,
          like it did on other datasets.
        </p>

        {isDesktop && (
          <iframe
            src="plots/corpfin/acc_vs_cost.html"
            title="Embedded HTML"
            width="100%"
            height="700px"
            className="pb-4 pt-6"
          />
        )}
        {!isDesktop && (
          <iframe
            src="plots/corpfin/acc_vs_cost_mobile.html"
            title="Embedded HTML"
            width="100%"
            height="500px"
            className="pb-4 pt-6"
          />
        )}
        <p className="text-section">
          Claude 3.5 Sonnet and GPT-4o are neck-in-neck in terms of performance
          (only 0.9 percentage points different). They also both come at
          similar, relatively reasonable costs. 3.5 Sonnet is the same price as
          Command R Plus and cheaper than Gemini 1.5, with much better
          performance. GPT-4o is 67% more expensive, but still in the same order
          of magnitude. For the easier tasks in the dataset, one of the more
          lightweight models like Llama, Command-R, or Mixtral may still be
          preferable.
        </p>
        <br />

        <p className="text-section">
          A standout "budget" model is GPT 4 mini, performing 4th best despite
          being an order of magnitude cheaper than GPT or Opus. If open source
          is preferred, one could use Llama 3.1 70B or Mixtral.
        </p>
        <br />

        <p className="text-section">
          Interestingly enough, the Pareto curve can be defined by only two
          models - GPT 4 Mini and Claude 3.5 Sonnet.
        </p>
      </>
    </Section>
  );
};

const NotableMentions = () => {
  return (
    <Section title="Notable Mentions" id="notable-mentions">
      {fadeIn(
        <ModelCard
          name={"Llama 3.1 (405b)"}
          icon={"logos/meta.png"}
          color={"accentblue"}
          acc={61.8}
          costIn={5}
          costOut={5}
          latency={2.08}
          desc={
            "The latest Llama 3.1 model performed well, although it did not beat out the closed source models; " +
            "It was the top-performing open-source model, and competitive with models like Opus and Sonnet.; " +
            "Interestingly, the 405B and 70B sizes performed similarly on an objective scale: 61.8% and 60.9%, respectively."
          }
        />,
        100
      )}
    </Section>
  );
};

const ModelExamples = () => {
  return (
    <Section title="Model Output Examples" id="model-examples">
      <div className="model-examples">
        <p className="text-section">
          We noticed that some of the harder questions were ones that required
          synthesizing content across multiple sections in the context provided.
          In this question, we ask how much money is made available in the
          credit agreement and in what currencies it can be accessed.
        </p>
        <br />
        <p className="text-section">
          A perfect answer needs to mention that 400 million is available in a
          mix of USD and CAD but with a cap of 50 million available in CAD. Opus
          and Gemini were able to produce answers like this. GPT4 only gave the
          total amount available, saying that it was could be funded in USD or
          CAD without specifying the limit for CAD. Llama 3.1 405B also
          successfully mentions the sublimit for Canadian Dollars.
        </p>
        <br />
        <p className="text-section py-4">
          The following is the prompt that was provided to all models. The
          original input included 2 full pages from the credit agreement. For
          readability, only a subsection of the context is included.
        </p>
        <ColorBlock color="beige">
          <p>
            <p>
              You are a credit agreement expert reviewing a credit agreement for
              AZZ Inc. Given pages of the agreement, answer the following
              question.
            </p>
            <br />
            <p>
              <strong>CREDIT AGREEMENT</strong>
              <br />
              This Credit Agreement is entered into as of May 13, 2022, by and
              among AZZ Inc., a Texas corporation (the “Company”), the
              Guarantors from time to time party hereto, the lenders from time
              to time party hereto (collectively, the “Lenders” and,
              individually, a “Lender”), the L/C Issuers from time to time party
              hereto and Citibank, N.A., as administrative agent (in such
              capacity and together with its successors, the “Administrative
              Agent”) and collateral agent (in such capacity and together with
              its successors, the “Collateral Agent”). WHEREAS, the Company has
              requested that, substantially simultaneously with the consummation
              of the Acquisition, (a) the Term Loan Lenders extend Initial Term
              Loans in an aggregate principal amount of $1,300,000,000, (b) the
              Revolving Credit Lenders provide Initial Revolving Credit
              Commitments in an aggregate principal amount of $400,000,000 and
              (c) the L/C Issuers agree to issue Letters of Credit in an
              aggregate amount available to be drawn not in...
            </p>

            <br />
            <p>
              <strong>ARTICLE I. DEFINITIONS AND ACCOUNTING TERMS</strong>
              <br />
              1.01 Defined Terms. As used in this Agreement, the following terms
              shall have the meanings set forth below: “Acquired Business” means
              the Target Holdco, together with the subsidiaries of the Target
              Holdco that conduct the Precoat Business (as defined in the
              Acquisition Agreement)...
              <br />
            </p>
            <br />

            <p>
              <strong>ARTICLE II. THE COMMITMENTS AND CREDIT EXTENSIONS</strong>
              <br />
              2.01 The Loans. Subject to the terms and conditions set forth
              herein, each Term Loan Lender severally agrees to make term loans
              (each such loan, an “Initial Term Loan”) to the Company on the
              Closing Date in Dollars in an aggregate amount of up to such Term
              Loan Lender’s Term Loan Commitment...
              <br />
              <br />
              (i) the Total Outstandings shall not exceed the Total Revolving
              Credit Commitments and (ii) the portion of the Outstanding Amount
              of all Revolving Credit Loans and all L/C Obligations denominated
              in Canadian Dollars shall not exceed $50,000,000. Within the
              limits of each Revolving Credit...
            </p>

            <br />
            <p>
              <strong>Question:</strong> What is the total amount available
              through the Revolving Credit Facility (RCF) as specified in the
              credit agreement, and in which currencies can it be funded?
            </p>
            <p>Answer concisely in less than 3 sentences.</p>
          </p>
        </ColorBlock>

        <ModelOutputExamples
          names={[
            "Opus",
            "GPT-4",
            "Mixtral",
            "Command R+",
            "Gemini 1.5",
            "Llama 3.1 405B",
          ]}
        >
          <>
            <p className="text-section">
              <strong>Opus: &nbsp;</strong>
              According to the credit agreement, the total amount available
              through the Revolving Credit Facility is $400,000,000. The
              Revolving Credit Loans can be funded in either U.S. Dollars or
              Canadian Dollars, with a sublimit of $50,000,000 for the portion
              denominated in Canadian Dollars.
            </p>
          </>
          <>
            <p className="text-section">
              <strong>GPT-4: &nbsp;</strong>
              The total amount available through the Revolving Credit Facility
              (RCF) as specified in the credit agreement is $400,000,000. It can
              be funded in Dollars or Canadian Dollars.
            </p>
          </>
          <>
            <p className="text-section">
              <strong>Mixtral: &nbsp;</strong>
              The total amount available through the Revolving Credit Facility
              (RCF) is $400,000,000. It can be funded in either US Dollars or
              Canadian Dollars, with the portion of the outstanding amount in
              Canadian Dollars not exceeding $50,000,000 at any time.
            </p>
          </>
          <>
            <p className="text-section">
              <strong>Command R+: &nbsp;</strong>
              The total amount available through the Revolving Credit Facility
              is $400,000,000, with up to $50,000,000 available in Canadian
              Dollars and the remainder in US Dollars.
            </p>
          </>
          <>
            <p className="text-section">
              <strong>Gemini 1.5: &nbsp;</strong>
              The total amount available through the Revolving Credit Facility
              (RCF) is $400,000,000. This facility can be funded in both U.S.
              Dollars and Canadian Dollars, with a limit of $50,000,000 for the
              Canadian Dollar portion.
            </p>
          </>
          <>
            <p className="text-section">
              <strong>Llama 3.1 405B: &nbsp;</strong>
              The total amount available through the Revolving Credit Facility
              (RCF) is $400,000,000. The RCF can be funded in Dollars or
              Canadian Dollars, with a sublimit of $50,000,000 for the portion
              of the Outstanding Amount denominated in Canadian Dollars.
            </p>
          </>
        </ModelOutputExamples>
      </div>
    </Section>
  );
};

const AdditionalNotes = () => {
  return (
    <Section title="Additional Notes" id="additional-notes">
      <div className="additional-notes space-y-4">
        <p className="text-section">
          <strong>Context Window:</strong> Each large language model has a
          "context window" which refers to the input space in which text can be
          provided for the model to use. Different models have different maximum
          input sizes which limit the amount of text that can be provided.
        </p>
        <p className="text-section">
          For these tests, we ensured that we only provided as much input text
          as the model with the smallest context window could handle. This was
          4096 tokens (word fragments that models take as inputs) or roughly 2-4
          pages. We made sure that the pages provided in the context alongside
          each question were sufficient to give the correct answer.
        </p>
        <p className="text-section">
          It's worth noting that in industry uses of these models, lawyers and
          analysts will likely want to submit the entire credit agreement for
          review, rather than doing the initial work to find pages relevant to
          their query. However, credit agreements can easily be 300 pages, and
          would not fit in the context window of these popular language models.
        </p>
        <p className="text-section">
          <strong>Long Context Models: </strong>
          The context passed into every model was relatively limited, because
          context window size varied widely between models. However, The Gemini
          1.5 model and some privately supported versions of Claude 3 have 1
          million token context windows which can support reading and answering
          questions about full credit agreements. We plan to design a task
          specifically to test long-context capabilities on the subset of models
          that can support it.
        </p>
        <p className="text-section">
          <strong>RAG: </strong>
          Alternatively, the retrieval-augmented-generation (RAG) technique has
          gained considerable popularity and is being further refined. This
          method breaks up long documents and databases into "chunks" which are
          first retrieved, and then passed to the model as context. This is
          another area of evaluation we may explore further.
        </p>
      </div>
    </Section>
  );
};

const Methodology = () => {
  return (
    <Section title="Methodology" id="methodology">
      <div className="methodology">
        <ColorBlock color="beige">
          <p className="text-section">
            The models were evaluated on public credit agreements from companies
            such as Ford, General Electric, and Jump Trading. Associates and
            partners from top law firms defined the types of review and the
            question-answer sets they wanted to evaluate these models against.
          </p>

          <p className="text-section">
            Closed source models were accessed using their respective APIs. For
            all open-source model evaluation we make use of TogetherAI inference
            endpoints. Cost and latency may vary between providers but this
            benchmark can be used to compare relative quality-cost-latency
            tradeoffs. All models were run with a temperature of 0.
          </p>

          <p className="text-section">
            We used an LLM-based auto-evaluation platform to check that the
            outputs included or matched the expected criteria of expert lawyers.
            Reducing or eliminating human review costs allows for the creation
            of many additional open-form-response tasks, widening the range of
            possible future evaluations.
          </p>

          <p className="text-section">
            Each API request was retried four times with exponential backoff to
            eliminate transient errors in inference APIs.
          </p>

          <p className="text-section">
            When running GPT4, we saw a non-insignificant variance between model
            results. Even when setting the temperature to 0 and setting a seed,
            the model produced different outputs between runs that caused its
            accuracy to fluctuate. The other models did not have this issue. To
            offer a fair evaluation of it and all models, we took the average
            result across four trials.
          </p>
        </ColorBlock>
      </div>
    </Section>
  );
};

export default function CorpFin() {
  return (
    <ValsPage pagename="corpfin">
      <PageTitle
        title="CorpFin"
        subtitle={
          <span>Evaluating Language Models on a Corporate Finance Task</span>
        }
      />
      <Leaderboard
        modelData={corpFinData}
        defaultSelection="overall"
        // Months are 0 indexed for god knows what reason
        lastUpdated={new Date(2024, 6, 25)}
      />
      <div className="page-content-container">
        {/* <Partners /> */}
        <TakeawaysSection />
        <BestModels />
        <Context />
        <OverallResults />
        <NotableMentions />
        <ModelExamples />
        <AdditionalNotes />
        {/* <Quirks /> */}
        <Methodology />
      </div>
    </ValsPage>
  );
}
