Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions config/agent/GPT-5.4-computer-use.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Hydra defaults list: `default` is composed first, then this file's own
# values (`_self_`) are applied on top of it.
defaults:
- default
- _self_

# Model identifier sent to the backend, and the shorter display name.
model_name: "gpt-5-4-genai-responses"
model_pretty_name: "gpt-5.4"
api_version: null
# Presumably selects the `client_type == "openai"` branch of
# ModelArgs.make_model in src/open_apps/agent/vLLM_agent.py -- confirm.
client_type: "openai"
hostname: null
# Custom endpoint for the OpenAI-style client. When this is null/empty the
# "openai" branch of make_model falls back to "https://api.openai.com/v1".
base_url: "https://api.llama.com/compat/v1/"
# Resolved from the GPT54_API_KEY environment variable through OmegaConf's
# `oc.env` resolver; the key itself is never stored in this file.
api_key: ${oc.env:GPT54_API_KEY}
temperature: 1
max_tokens: 5000
# AWS credentials are left unset (null) in this config; only a region is given.
aws_access_key: null
aws_secret_key: null
aws_session_token: null
aws_region: us-west-2

# Action names made available to this agent: page navigation, mouse and
# keyboard primitives.
# NOTE(review): `mouse_click` and `mouse_dblclick` each appear twice in this
# list (once after `goto`, once after `mouse_up`). Confirm whether the
# duplicates are intentional or a copy/paste leftover.
custom_actions:
- go_back
- go_forward
- goto
- mouse_click
- mouse_dblclick
- scroll
- mouse_move
- mouse_down
- mouse_up
- mouse_click
- mouse_dblclick
- mouse_drag_and_drop
- mouse_upload_file
- keyboard_down
- keyboard_up
- keyboard_press
- keyboard_type
- keyboard_insert_text

# Observation / prompt feature switches. In this config only `use_screenshot`
# is enabled; every other flag below is turned off.
# NOTE(review): the exact effect of each flag is defined by the prompt-flag
# consumer, which is not visible here -- verify against PromptFlags.
use_html: false
use_axtree: false
use_screenshot: true
use_som: false
extract_visible_tag: false
extract_clickable_tag: false
extract_coords: false
filter_visible_elements_only: false
use_focused_element: false
# Prompt text fragments for the agent. The string values below are runtime
# prompt content and must be kept exactly as written.
# NOTE(review): `output_format` and the two action examples close their tags
# with a backslash (`<\action>`, `<\think>`) rather than a forward slash.
# Confirm the response parser expects exactly this spelling before changing it.
# NOTE(review): `action_prompt` refers to a `Thought` part, while
# `output_format` names the section `<think>` -- confirm these are meant to be
# the same section.
# NOTE(review): the action names described in `action_prompt` (click,
# left_double, right_single, hotkey, type, scroll, wait) differ from the names
# listed under `custom_actions`; presumably a translation layer maps between
# them -- verify.
prompt_txt:
system_prompt: You are a GUI agent. You are given a task and your action history,
with screenshots. You need to perform the next action to complete the task.
output_format: '<action>

<\action>

<think>

<\think>

'
# No "think" prompt or examples are configured for this agent.
think_prompt: null
think_abstract_example: null
think_concrete_example: null
# Describes the action syntax and notes presented to the model.
action_prompt: "## Action Space\n\nclick(point='<point>x1 y1</point>')\nleft_double(point='<point>x1\
\ y1</point>')\nright_single(point='<point>x1 y1</point>')\nhotkey(key='ctrl c')\
\ # Split keys with a space and use lowercase. Also, do not use more than 3 keys\
\ in one hotkey action.\ntype(content='xxx') # Use escape characters \\\\', \\\
\\\\\", and \\\\n in content part to ensure we can parse the content in normal\
\ python string format. If you want to submit your input, use \\\\n at the end\
\ of content. \nscroll(point='<point>x1 y1</point>', direction='down or up or\
\ right or left') # Show more information on the `direction` side.\nwait() #Sleep\
\ for 5s and take a screenshot to check for any changes.\n\n## Note\n- Use English\
\ in `Thought` part.\n- Write a small plan and finally summarize your next action\
\ (with its target element) in one sentence in `Thought` part.\n"
# Example action strings: an abstract template and a concrete click.
action_abstract_example: '<action>type(content='''')<\action>

'
action_concrete_example: '<action>click(point=''<point>200 300</point>'')<\action>

'
17 changes: 17 additions & 0 deletions config/agent/GPT-5.4.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Hydra defaults list: `default` is composed first, then this file's own
# values (`_self_`) are applied on top of it.
defaults:
- default
- _self_

# Model identifier sent to the backend, and the shorter display name.
model_name: "gpt-5-4-genai-responses"
model_pretty_name: "gpt-5.4"
api_version: null
# Presumably selects the `client_type == "openai"` branch of
# ModelArgs.make_model in src/open_apps/agent/vLLM_agent.py -- confirm.
client_type: "openai"
hostname: null
# Custom endpoint for the OpenAI-style client. When this is null/empty the
# "openai" branch of make_model falls back to "https://api.openai.com/v1".
base_url: "https://api.llama.com/compat/v1/"
# Resolved from the GPT54_API_KEY environment variable through OmegaConf's
# `oc.env` resolver; the key itself is never stored in this file.
api_key: ${oc.env:GPT54_API_KEY}
temperature: 1
max_tokens: 5000
# AWS credentials are left unset (null) in this config; only a region is given.
aws_access_key: null
aws_secret_key: null
aws_session_token: null
aws_region: us-west-2
3 changes: 3 additions & 0 deletions launch_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
# Third-party imports
import hydra
from omegaconf import DictConfig
from dotenv import load_dotenv

load_dotenv()

# Project-specific imports
from open_apps.apps.start_page.main import app # need to import apps to serve
Expand Down
6 changes: 5 additions & 1 deletion src/open_apps/agent/vLLM_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class ModelArgs(BaseModelArgs):
aws_secret_key: str = None
aws_session_token: str = None
aws_region: str = "us-west-2"
base_url: str = None

def make_model(self) -> AbstractChatModel:
logger.info(f"Creating Model with model_name: {self.model_name}")
Expand Down Expand Up @@ -98,7 +99,8 @@ def make_model(self) -> AbstractChatModel:
aws_region=self.aws_region,
)
elif self.client_type == "openai":
client_args = {"base_url": "https://api.openai.com/v1"}
url = self.base_url if self.base_url else "https://api.openai.com/v1"
client_args = {"base_url": url}
client_class = OpenAI
return VLLMChatModel(
model_name=self.model_name,
Expand Down Expand Up @@ -220,6 +222,7 @@ class AgentArgs(AgentLabAgentArgs):
aws_secret_key: str = None
aws_session_token: str = None
aws_region: str = "us-west-2"
base_url: str = None

def make_flags(self) -> PromptFlags:
return PromptFlags(
Expand Down Expand Up @@ -272,6 +275,7 @@ def make_chat_model_flags(self) -> ModelArgs:
aws_secret_key=self.aws_secret_key,
aws_session_token=self.aws_session_token,
aws_region=self.aws_region,
base_url=self.base_url,
)

def make_agent(self) -> Agent:
Expand Down
Loading