Language models with vision capabilities can accept images as part of the prompt. Here are some of the formats you can use to include images as input.
// Example: multi-modal prompt with tool calling.
// The user message combines text with an image (passed as a URL object);
// the model may respond by calling the `logFood` tool with input that is
// validated against the zod schema.
import { generateText, tool } from 'ai';
import { z } from 'zod';

const result = await generateText({
  model: 'openai/gpt-4.1',
  messages: [
    {
      role: 'user',
      content: [
        { type: 'text', text: 'can you log this meal for me?' },
        {
          type: 'image',
          image: new URL(
            'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e4/Cheeseburger_%2817237580619%29.jpg/640px-Cheeseburger_%2817237580619%29.jpg',
          ),
        },
      ],
    },
  ],
  tools: {
    logFood: tool({
      description: 'Log a food item',
      inputSchema: z.object({
        name: z.string(),
        calories: z.number(),
      }),
      // Return a value so the model receives a tool result it can use in a
      // follow-up step; the original body returned undefined, leaving the
      // tool call with no meaningful result.
      async execute({ name, calories }) {
        await storeInDatabase({ name, calories }); // your implementation here
        return { logged: true, name, calories };
      },
    }),
  },
});